# Bikarhêner:Balyozxane/zimande.py

from bs4 import BeautifulSoup

def has_ziman_de_twice(content):
    """Return True when *content* holds ``{{ziman|de}}`` exactly two times.

    Occurrences are counted with ``str.count``, i.e. non-overlapping and
    case-sensitive. Fewer or more than two occurrences yield False.
    """
    marker = '{{ziman|de}}'
    occurrences = content.count(marker)
    return occurrences == 2

def extract_pages_with_ziman_de(xml_file_path):
    """Return titles of pages whose text has ``{{ziman|de}}`` exactly twice.

    The file at *xml_file_path* is read as UTF-8 and parsed in one go with
    BeautifulSoup, so the whole document is held in memory. For every
    ``<page>`` element, the text of the first ``<text>`` element found inside
    its first ``<revision>`` element is checked with ``has_ziman_de_twice``.
    Each match is printed together with the 1-based position of the page
    among all ``<page>`` elements.

    Pages that lack a ``<title>``, ``<revision>`` or ``<text>`` element are
    skipped rather than raising ``AttributeError``.

    Args:
        xml_file_path: Path of the XML file to scan.

    Returns:
        A list of the matching page titles, in document order.
    """
    with open(xml_file_path, 'r', encoding='utf-8') as file:
        # BeautifulSoup reads the entire stream while building the tree, so
        # the handle is only needed for this one call.
        # NOTE(review): 'lxml' selects bs4's HTML parser; 'lxml-xml' is the
        # XML one. Kept as-is to preserve current results -- confirm intent.
        soup = BeautifulSoup(file, 'lxml')

    pages_with_ziman_de = []

    for i, page in enumerate(soup.find_all('page'), 1):
        title_tag = page.find('title')
        revision = page.find('revision')
        if title_tag is None or revision is None:
            continue

        # Check the tag itself: `.text` on an existing tag is always a str,
        # so a None test after dereferencing could never trigger.
        text_tag = revision.find('text')
        if text_tag is None:
            continue

        if has_ziman_de_twice(text_tag.text):
            title = title_tag.text
            pages_with_ziman_de.append(title)
            print(f'Found page {i}: {title}')

    return pages_with_ziman_de

# Location of the kuwiktionary dump that gets scanned.
xml_file_path = r'C:\Users\Kombers\Desktop\kuwiktionary\kuwiktionary-20231101-pages-articles-multistream.xml'

# Titles of all pages in the dump that contain {{ziman|de}} exactly twice.
pages_with_ziman_de = extract_pages_with_ziman_de(xml_file_path)

# Save the result as UTF-8 text, one title per line.
with open('pages.txt', 'w', encoding='utf-8') as file:
    file.writelines(f'{page}\n' for page in pages_with_ziman_de)