import re
import pywikibot
# Read the list of page titles from pages.txt, one title per line.
# Surrounding whitespace is stripped and blank lines (e.g. a trailing empty
# line at the end of the file) are dropped, so that no empty string ends up
# in page_list and gets treated as a page title later on.
with open('pages.txt', 'r', encoding='utf-8') as file:
    page_list = [title for title in (line.strip() for line in file) if title]
def get_lang_codes(page):
    """Return the arguments of all ``{{ziman|...}}`` templates on a page.

    The page is looked up by title on the ``("ku", "wiktionary")`` site and
    its wikitext is scanned with a regular expression.

    Args:
        page: Title of the page to inspect.

    Returns:
        A list with the text captured after ``{{ziman|`` for every match,
        in order of appearance. Repeated values are kept, and the list is
        empty when the page text contains no such template.
    """
    wiktionary = pywikibot.Site("ku", "wiktionary")
    wikitext = pywikibot.Page(wiktionary, page).text
    # One capture group: everything after the pipe up to the first "}".
    return [
        match.group(1)
        for match in re.finditer(r'{{ziman\|([^\}]+)}}', wikitext)
    ]
def log_page(page_title, dublicate):
    """Append a page title to one of the two result files.

    Args:
        page_title: Title to record; written followed by a newline.
        dublicate: If truthy the title goes to ``pages_dub.txt``,
            otherwise to ``pages_skipped.txt``.
    """
    target = 'pages_dub.txt' if dublicate else 'pages_skipped.txt'
    # Append mode: entries from earlier calls and earlier runs are preserved.
    with open(target, 'a', encoding='utf-8') as log_file:
        log_file.write(page_title + '\n')
def check_and_write_duplicates(page_list):
    """Classify each page by whether it repeats a language code.

    For every title in ``page_list`` the language codes are fetched with
    ``get_lang_codes``. A page is logged exactly once via ``log_page``:
    as a duplicate if some language code occurs more than once, otherwise
    as skipped. Pages for which no language codes are found are not
    logged at all. Progress is printed for each page, followed by a final
    count of pages with duplicates.

    Args:
        page_list: Iterable of page titles to check.
    """
    duplicates = set()
    for page in page_list:
        print(f"\n<<{page}>>\n")
        lang_codes = get_lang_codes(page)
        print(f"lang_codes: {lang_codes}")
        if not lang_codes:
            # Nothing to compare on this page.
            continue

        seen = set()
        for lang_code in lang_codes:
            if lang_code in seen:
                # First repeated code is enough; record the page and stop.
                duplicates.add(page)
                log_page(page, True)
                print(f"{lang_code} logged")
                break
            seen.add(lang_code)
        else:
            # Reached only when the loop finished without ``break``, i.e.
            # no code was repeated. Logging here (rather than inside the
            # loop) guarantees one "skipped" entry per page and none for
            # pages already logged as duplicates. ``lang_code`` is the
            # last code that was examined.
            log_page(page, False)
            print(f"{lang_code} skipped")

    # Add notification for how many pages were found
    print(f"All pages processed. Found {len(duplicates)} pages.")
# Run the duplicate check over every title loaded from pages.txt.
check_and_write_duplicates(page_list)