import pywikibot
import mwparserfromhell
import json
import re
def extract_section_info(page_title):
    """
    Extract wikilink information from a Kurdish Wikipedia vital-articles page.

    Scans every section of the page for {{lgb}} templates and records, for
    each linked article title, the vital-article level, the section heading
    it appears under and (for level-4 pages) the topic derived from the
    page title.

    Args:
        page_title (str): The title of the Wikipedia page to scan.

    Returns:
        dict: Maps each wikilink title (str) to a dict with keys "level"
        (int), "section" (str) and, for level-4 pages only, "topic" (str).
    """
    print(f"Extracting section info for page: {page_title}")
    site = pywikibot.Site('ku', 'wikipedia')
    page = pywikibot.Page(site, page_title)
    wikicode = mwparserfromhell.parse(page.text)

    # The level is encoded in the title as ".../Level/<n>"; the base page
    # "Wîkîpediya:Gotarên bingehîn" carries no suffix and counts as level 3.
    match = re.search(r'Level/(\d+)', page_title)
    level = int(match.group(1)) if match else 3
    # Initialize topic unconditionally so it can never be referenced unbound.
    topic = None
    if level == 4:
        topic = re.sub(r'Wîkîpediya:Gotarên bingehîn/Level/4/', '', page_title)

    wikilinks_info = {}
    section_titles_count = {}
    for section in wikicode.get_sections(levels=[1, 2, 3, 4, 5, 6]):
        section_headings = section.filter_headings()
        if not section_headings:
            print("No section headings found.")
            continue
        section_title = str(section_headings[0].title.strip())

        # Disambiguate repeated headings ("Kes", "Kes 2", ...) so links in
        # different same-named sections are not merged into one key.
        section_titles_count[section_title] = section_titles_count.get(section_title, 0) + 1
        if section_titles_count[section_title] > 1:
            new_section_title = f"{section_title} {section_titles_count[section_title]}"
        else:
            new_section_title = section_title

        for template in section.filter_templates():
            if template.name.strip_code().strip() != "lgb":
                continue
            # Skip templates carrying an "en" parameter (article exists only
            # on English Wikipedia).
            if any(param.name.strip_code().strip() == "en" for param in template.params):
                continue
            # Guard against malformed {{lgb}} templates with no first
            # positional parameter; template.get(1) would raise ValueError.
            if not template.has(1):
                continue
            wikilink_title = template.get(1).value.strip_code()
            # Only article links are wanted; skip project and category links.
            if ("Wîkîpediya:" in wikilink_title or "Wikipedia:" in wikilink_title
                    or "Kategorî:" in wikilink_title):
                continue
            wikilink_info = {"level": level}
            if level == 4:
                wikilink_info["topic"] = topic
            wikilink_info["section"] = str(new_section_title)
            wikilinks_info[wikilink_title] = wikilink_info
    return wikilinks_info
def save_page_content(site, page_title, content):
    """
    Overwrite a wiki page with the given content and save it.

    Replaces the full text of *page_title* on *site* and saves the edit
    with a fixed bot edit summary.
    """
    target_page = pywikibot.Page(site, page_title)
    target_page.text = content
    target_page.save(summary="[[Bikarhêner:Balyozxane/skrîpt/py/jsonGbingehin.py|Bot]]: Naveroka rûpelê hat rojanekirin")
def process_wikilinks_info(wikilinks_info, root, site):
    """
    Group wikilinks by first letter and save each group as a JSON subpage.

    Each group is written to "<root>/<letter>.json". Titles whose first
    letter is outside the Kurdish alphabet — or whose title is empty — go
    into the "yên din" ("others") bucket.

    Args:
        wikilinks_info (dict): Wikilink title -> info dict, as produced by
            extract_section_info().
        root (str): Base page title for the JSON subpages.
        site: pywikibot Site the pages are saved on.
    """
    kurdish_alphabet = "ABCÇDEÊFGHIÎJKLMNOPQRSŞTUÛVWXYZ"
    output = {}
    for wikilink_title, info in wikilinks_info.items():
        # Guard the empty-title case explicitly: wikilink_title[0] would
        # raise IndexError, and "" is (vacuously) "in" any alphabet string.
        if wikilink_title and wikilink_title[0].upper() in kurdish_alphabet:
            first_letter = wikilink_title[0].upper()
        else:
            first_letter = "yên din"
        output.setdefault(first_letter, {})[wikilink_title] = info
    for letter, items in output.items():
        page_title = f"{root}/{letter}.json"
        content = json.dumps(items, ensure_ascii=False, indent=4)
        save_page_content(site, page_title, content)
if __name__ == "__main__":
    # Vital-article list pages to harvest: levels 1-3 plus every level-4
    # topic subpage.
    pages = [
        "Wîkîpediya:Gotarên bingehîn/Level/1",
        "Wîkîpediya:Gotarên bingehîn/Level/2",
        "Wîkîpediya:Gotarên bingehîn",
        "Wîkîpediya:Gotarên bingehîn/Level/4/Biyolojî û zanistên sihetê",
        "Wîkîpediya:Gotarên bingehîn/Level/4/Civak û zanistên civakî",
        "Wîkîpediya:Gotarên bingehîn/Level/4/Cografya",
        "Wîkîpediya:Gotarên bingehîn/Level/4/Felsefe û dîn",
        "Wîkîpediya:Gotarên bingehîn/Level/4/Huner",
        "Wîkîpediya:Gotarên bingehîn/Level/4/Jiyana rojane",
        "Wîkîpediya:Gotarên bingehîn/Level/4/Matematîk",
        "Wîkîpediya:Gotarên bingehîn/Level/4/Mirov",
        "Wîkîpediya:Gotarên bingehîn/Level/4/Tarîx",
        "Wîkîpediya:Gotarên bingehîn/Level/4/Teknolojî",
        "Wîkîpediya:Gotarên bingehîn/Level/4/Zanistên fizîkî"
    ]

    # Merge the per-page results; the first page that mentions a title wins,
    # so higher-priority (earlier) lists take precedence.
    wikilinks_info_all = {}
    for page_title in pages:
        for wikilink_title, info in extract_section_info(page_title).items():
            wikilinks_info_all.setdefault(wikilink_title, info)

    root = "Wîkîpediya:Gotarên bingehîn/dane"
    site = pywikibot.Site('ku', 'wikipedia')
    process_wikilinks_info(wikilinks_info_all, root, site)