from bs4 import BeautifulSoup
def has_ziman_de_twice(content):
    """Return True when *content* contains the ``{{ziman|de}}`` marker exactly twice.

    Occurrences are counted with ``content.count``, i.e. non-overlapping
    matches of the literal marker text.
    """
    marker = '{{ziman|de}}'
    occurrences = content.count(marker)
    return occurrences == 2
def extract_pages_with_ziman_de(xml_file_path):
    """Collect titles of pages whose text has exactly two ``{{ziman|de}}`` markers.

    The file at *xml_file_path* is read as UTF-8 and parsed with
    Beautiful Soup. Every ``<page>`` element is inspected: the text of the
    ``<text>`` element inside its first ``<revision>`` is passed to
    ``has_ziman_de_twice``. Each match is reported on stdout together with
    the 1-based position of the page among all ``<page>`` elements.

    Pages lacking a ``<title>``, ``<revision>`` or ``<text>`` element are
    skipped instead of raising ``AttributeError``.

    Args:
        xml_file_path: Path of the XML file to scan.

    Returns:
        A list of matching page titles, in document order.
    """
    with open(xml_file_path, 'r', encoding='utf-8') as file:
        # NOTE(review): 'lxml' selects Beautiful Soup's HTML parser; 'xml'
        # (lxml-xml) is the dedicated XML parser. Kept unchanged here to
        # preserve existing results -- confirm which one is intended.
        soup = BeautifulSoup(file, 'lxml')

    pages_with_ziman_de = []
    for i, page in enumerate(soup.find_all('page'), 1):
        title_tag = page.find('title')
        revision = page.find('revision')
        if title_tag is None or revision is None:
            continue

        # find() returns None for a missing element, whereas .text always
        # yields a string -- so the None check belongs on the tag itself.
        text_tag = revision.find('text')
        if text_tag is None:
            continue

        if has_ziman_de_twice(text_tag.text):
            title = title_tag.text
            pages_with_ziman_de.append(title)
            print(f'Found page {i}: {title}')
    return pages_with_ziman_de
# Location of the dump file to scan.
xml_file_path = r'C:\Users\Kombers\Desktop\kuwiktionary\kuwiktionary-20231101-pages-articles-multistream.xml'

# Gather the matching titles, then write them to pages.txt, one per line.
pages_with_ziman_de = extract_pages_with_ziman_de(xml_file_path)
with open('pages.txt', 'w', encoding='utf-8') as out:
    out.writelines(title + '\n' for title in pages_with_ziman_de)