import pywikibot
import re
def get_page_titles_from_file(filename):
    """Read page titles from a UTF-8 text file, one title per line.

    Surrounding whitespace is stripped from every line. Lines that are empty
    after stripping are skipped, so a stray blank line in the list does not
    turn into an empty page title. A leading UTF-8 byte-order mark is dropped
    instead of ending up inside the first title.

    Args:
        filename: Path of the file to read.

    Returns:
        list[str]: The non-empty, stripped titles in file order.
    """
    # "utf-8-sig" reads plain UTF-8 too, but also swallows a leading BOM,
    # which str.strip() would leave in place.
    with open(filename, 'r', encoding='utf-8-sig') as file:
        stripped_lines = (line.strip() for line in file)
        return [title for title in stripped_lines if title]
def extract_page_title(text):
    """Collect the argument text of every ``{{bnr|...}}`` or ``{{bnr2|...}}`` call.

    The capture runs non-greedily from the first ``|`` to the nearest ``}}``
    on the same line, so any further pipe-separated parameters are returned
    as part of the same string.

    Args:
        text: Wikitext to scan.

    Returns:
        list[str]: One entry per template call, in order of appearance;
        empty when ``text`` contains no such template.
    """
    bnr_template = re.compile(r"\{\{bnr(?:2)?\|(.*?)\}\}")
    return [hit.group(1) for hit in bnr_template.finditer(text)]
def is_page_in_categories(page_title, categories_to_check):
    """Find the first candidate category that the given page belongs to.

    The page is looked up on the Kurdish Wiktionary ("ku", "wiktionary") and
    its category titles are compared without the namespace prefix.

    Args:
        page_title: Title of the page whose categories are inspected.
        categories_to_check: Candidate category names (no namespace prefix),
            tried in the order given.

    Returns:
        The first entry of ``categories_to_check`` found among the page's
        categories, or ``None`` when none of them is present.
    """
    wiki = pywikibot.Site("ku", "wiktionary")
    target = pywikibot.Page(wiki, page_title)
    present = {cat.title(with_ns=False) for cat in target.categories()}
    # Candidate order decides the winner when several categories apply.
    return next(
        (candidate for candidate in categories_to_check if candidate in present),
        None,
    )
def extract_cat_type_and_lang(cleared_category):
    """Split a ``"<type> bi <language>"`` category name into its two parts.

    Because the first group is greedy, the split happens at the last
    `` bi `` in the string.

    Args:
        cleared_category: Category name without namespace prefix.

    Returns:
        tuple: ``(type, language)`` when the name has the expected form,
        otherwise ``(None, None)``.
    """
    parts = re.match(r"^(.+) bi (.+)$", cleared_category)
    return (parts.group(1), parts.group(2)) if parts else (None, None)
def update_page_content(page_title, extracted_titles, cleared_category, lang_code):
    """Replace a page's whole text with a ``{{guharto}}`` entry and save it.

    The generated wikitext consists of a ``{{ziman|<lang_code>}}`` language
    heading, a ``=== <cleared_category> ===`` sub-heading, a headword template
    named after the lower-cased category, and one definition line that points
    to the target through ``{{guharto}}``.

    Args:
        page_title: Title of the page on ku.wiktionary to overwrite.
        extracted_titles: Text inserted as the ``{{guharto}}`` target.
        cleared_category: Heading text; also used lower-cased as the headword
            template name.
        lang_code: Language code passed to all three templates.
    """
    wiki = pywikibot.Site("ku", "wiktionary")
    target = pywikibot.Page(wiki, page_title)

    # Headword template extras: "|z=-" only for the Navdêr/ku combination,
    # "|sc=Latn" only for sdh (the latter is repeated on the guharto template).
    z_param = "|z=-" if (cleared_category == "Navdêr" and lang_code == "ku") else ""
    script_param = "|sc=Latn" if lang_code == "sdh" else ""

    pieces = (
        "== {{ziman|", lang_code, "}} ==\n\n",
        "=== ", cleared_category, " ===\n",
        "{{", cleared_category.lower(), "|", lang_code, z_param, script_param, "}}\n",
        "# {{guharto|", lang_code, "|", extracted_titles, script_param, "}}",
    )

    # Assigning .text discards whatever the page contained before.
    target.text = "".join(pieces)
    target.save("+{{[[Şablon:guharto|guharto]]}} (bi riya [[Bikarhêner:Balyozxane/guharto.py|guharto.py]])")
def log_skipped_page(page_title):
    """Append ``page_title`` as one line to ``skipped_pages.txt``.

    The file is opened in append mode relative to the current working
    directory, so earlier entries are kept.

    Args:
        page_title: Title of the page that was not processed.
    """
    with open('skipped_pages.txt', mode='a', encoding='utf-8') as skip_log:
        entry = page_title + '\n'
        skip_log.write(entry)
# Language names as they appear after " bi " in category titles, mapped to
# the language code used in the generated templates.
languages = {
    "kurdî": "ku",
    "tirkî": "tr",
    "îngilîzî": "en",
    "erebî": "ar",
    "zazakî": "zza",
    "soranî": "ckb",
    "almanî": "de",
    "kurdiya başûrî": "sdh",
}
# Category types that appear before " bi " in category titles.
# Order matters: candidate categories are generated in this order and the
# first one found on a page is the one that gets used.
category_types = [
    "Lêker", "Navdêr", "Rengdêr", "Serenav", "Hoker",
    "Cînav", "Artîkel", "Baneşan", "Bazinedaçek", "Biwêj",
    "Daçek", "Girêdek", "Gotineke pêşiyan", "Hejmar", "Hevok",
    "Kurtenav", "Navgir", "Paşdaçek", "Paşgir", "Pêşdaçek",
    "Pêşgir", "Pirtik", "Reh", "Sembol", "Tîp",
]
# Step 1: Get list of page titles from file
pages = get_page_titles_from_file('guhartolist.txt')

# The candidate category names ("<type> bi <language>") do not depend on the
# page being processed, so they are built once instead of once per page.
# The type-major / language-minor order is kept because
# is_page_in_categories returns the first candidate that matches.
categories_to_check = [
    f"{category} bi {lang_name}"
    for category in category_types
    for lang_name in languages
]

# One Site object is enough for reading every listed page.
site = pywikibot.Site('ku', 'wiktionary')

# Step 2 and 3: Process each page
for page_title in pages:
    content = pywikibot.Page(site, page_title).text
    extracted_titles = extract_page_title(content)
    pywikibot.output(f"extracted_titles '{extracted_titles}'.")

    # No {{bnr}}/{{bnr2}} template on the page: nothing to derive a target from.
    if not extracted_titles:
        print(f"No page title extracted from {page_title}")
        log_skipped_page(page_title)
        continue

    pywikibot.output(f"categories_to_check '{categories_to_check}'.")

    # Only the first extracted template argument is used as the target page.
    target_title = extracted_titles[0]
    cleared_category = is_page_in_categories(target_title, categories_to_check)
    if not cleared_category:
        print(f"{page_title} is not in the specified categories")
        log_skipped_page(page_title)
        continue

    cat_type, lang_name = extract_cat_type_and_lang(cleared_category)
    pywikibot.output(f"cat_type '{cat_type}', lang_name '{lang_name}'.")
    lang_code = languages[lang_name]
    pywikibot.output(f"lang_code '{lang_code}'.")
    update_page_content(page_title, target_title, cat_type, lang_code)
    print(f"Updated page content for {page_title}")

print("Finished processing all pages.")