#!/usr/bin/env python3
"""
python pwb.py updatewin -file:"mytools.py" -s:"+fix"
Tools:
ucfirst Uppercase the first character of parsed wikicode
lcfirst Lowercase the first character of parsed wikicode
without_comments Remove HTML comments from wiki text
get_cur_month_year Return the current month and year as a string in Kurdish
get_cat_members Retrieve all members of a category that belong to a given namespace
get_unhidden_categories Fetch the unhidden categories of a page
get_sitelink Retrieve the sitelink of a title between two wikis (by dbName)
get_sitelinks_qid Retrieve sitelinks for the QID and language codes
get_template_redirects Return a list of redirects of the given template
is_template_in_page Check if a given template or one of its redirects is included in the page text
is_category_in_page Check if a given category is included in the categories of a page
is_liste Check if a set of categories contains Lîste or one of its subcategories
remove_template Remove a template from wiki text
remove_sitil_class Remove the 'şitil' class from WikiProject banners on the talk page
zaravayen_din Check whether the page is in the dialect categories
matrix_to_wikitable Convert a matrix (list of rows) to a sortable wikitable
get_qid Return the Wikidata QID connected to a title
get_pvalue Return the value of a Wikidata property for a title
referring_page_generator Return a list of all pages that refer to or embed a page
get_wordcount Count the prose words of a page (similar to en:Wikipedia:Prosesize)
TagHelpers is_sewi / is_sitil / is_bekategori checks for maintenance tagging
"""
import re
import datetime
import requests
import pywikibot
import mwparserfromhell
from bs4 import BeautifulSoup
from functools import lru_cache
from typing import List, Union, Iterator
from pywikibot.tools import first_lower, first_upper
VERBOSE = True
SITIL = ['Stub', 'Kurt', 'Şitlek', 'Şitl', 'Şitil']
KALIK = ['WPB', 'WikiProject banner shell', 'WikiProject Shell', 'Bannershell', 'WPBS', 'WikiProject banner',
'Kalika Wîkîprojeyê', 'Kalika wîkîprojeyê']
UNCAT_TEMPL = ['Uncategorized', 'Bêkat', 'Uncategorized stub', 'Bêkategorî']
ku_months = {
1: 'kanûna paşîn',
2: 'sibat',
3: 'adar',
4: 'nîsan',
5: 'gulan',
6: 'hezîran',
7: 'tîrmeh',
8: 'tebax',
9: 'îlon',
10: 'çiriya pêşîn',
11: 'çiriya paşîn',
12: 'kanûna pêşîn'
}
def get_cur_month_year() -> str:
"""
Returns current month and year as a string in Kurdish
:return: month year
"""
current_month_number = datetime.datetime.now().month
current_year = datetime.datetime.now().year
month_name = ku_months[current_month_number]
month_year = f"{month_name} {current_year}"
return month_year
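# Usage sketch for get_cur_month_year (illustrative; the result depends on the system date):
#   get_cur_month_year()   # e.g. 'gulan 2024' when run in May 2024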
@lru_cache(maxsize=None)
def get_cat_members(site: pywikibot.site.BaseSite, category_name: str, namespace: int, itr_page=False,
recursive=False) -> List[str]:
"""
Retrieve all members of a category that belong to a given namespace.
:param site: The Pywikibot site object representing the target wiki.
:param category_name: The name of the category from which to retrieve members.
:param namespace: The namespace number to filter the category members by.
    :param itr_page: If True, return a list of Pywikibot Page objects instead of titles
    :param recursive: If True, also recurse through all subcategories
    :return: A list of category members in the given namespace: titles without namespace, or Page objects if itr_page is True.
"""
category = pywikibot.Category(site, 'Category:' + category_name)
members_list = []
for member in category.members(recurse=recursive):
if member.namespace() == namespace:
if itr_page:
members_list.append(member)
else:
members_list.append(member.title(with_ns=False))
return members_list
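# Usage sketch for get_cat_members (results are cached with lru_cache, so repeated calls are cheap):
#   site = pywikibot.Site('ku', 'wikipedia')
#   get_cat_members(site, 'Lîste', 14, recursive=True)   # -> subcategory titles without the namespace prefix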
@lru_cache(maxsize=None)
def get_template_redirects(site, template_title):
"""
Return a list of redirects of the given template.
:param site: pywikibot Site
    :param template_title: template title without the "Şablon:" (Template:) prefix
    :return: list of redirect titles, including the given template itself
"""
template_title = "Template:" + template_title
template_page = pywikibot.Page(site, template_title)
redirects = template_page.backlinks(filter_redirects=True, namespaces=[10])
redirect_titles = [redirect.title(with_ns=False) for redirect in redirects]
redirect_titles.append(template_page.title(with_ns=False))
return redirect_titles
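# Usage sketch for get_template_redirects (hypothetical template name):
#   get_template_redirects(site, 'Şitil')   # -> e.g. ['Stub', 'Kurt', ..., 'Şitil'], redirects plus the template itself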
def is_liste(site: pywikibot.site.BaseSite, categories: List[str]) -> bool:
"""
Checks if a given set of categories contains Lîste or its subcategories.
:return: True if the categories contain a list category, otherwise False
:param categories: A set of category names to check.
:param site: Pywikibot site object
"""
categories = set(categories)
    # Fetch all subcategories of 'Lîste' (recursively)
list_cats = set(get_cat_members(site, 'Lîste', 14, recursive=True))
# Add 'Lîste' itself to the set of list categories
list_cats.add('Lîste')
# Find intersection between the given categories and the list categories
list_intersection = categories.intersection(list_cats)
# Return True if there is any intersection, False otherwise
return len(list_intersection) > 0
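# Usage sketch for is_liste (hypothetical category names):
#   is_liste(site, ['Lîsteyên bajaran', 'Dîrok'])   # True if any of them is 'Lîste' or one of its subcategories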
def remove_template(text: str, template_redirects) -> str:
"""
Remove specified template from wiki text.
:param text: Wiki text
:param template_redirects: List of template names or a single template name as a string
:return: str Wiki text
"""
if isinstance(template_redirects, str):
template_redirects = [template_redirects]
wikicode = mwparserfromhell.parse(text)
for template in wikicode.filter_templates():
template_name = template.name.strip()
template_name = template_name[0].upper() + template_name[1:]
if template_name in template_redirects:
wikicode.remove(template)
return str(wikicode)
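# Usage sketch for remove_template (hypothetical wikitext):
#   remove_template('Nivîs {{Bêkat|tarîx=gulan 2024}} dawî', UNCAT_TEMPL)   # -> 'Nivîs  dawî'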
def is_template_in_page(text: str, template_redirects) -> bool:
"""
    Check if a given template or any of its redirects is included in the page text.
    :param text: wikitext
    :param template_redirects: a template name or a list of template names (a string is converted to a list)
    :return: True if the template is included in the page text, False otherwise.
"""
if isinstance(template_redirects, str):
template_redirects = [template_redirects]
wikicode = mwparserfromhell.parse(text)
for template in wikicode.filter_templates():
template_name = template.name.strip()
template_name = template_name[0].upper() + template_name[1:]
if template_name in template_redirects:
return True
return False
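# Usage sketch for is_template_in_page, combined with get_template_redirects:
#   is_template_in_page(page.text, get_template_redirects(site, 'Bêkat'))   # True if the template or one of its redirects is used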
def is_category_in_page(page: pywikibot.page.BasePage, category_title: str) -> bool:
"""
Check if a given category is included in the categories of a page.
:param page: A Pywikibot page object.
    :param category_title: The title of the category to check (with or without the "Kategorî:" prefix).
:return: True if the category is included in the page categories, False otherwise.
"""
if not page or not page.exists():
return False
category_title = category_title.strip()
category = pywikibot.Category(page.site, category_title)
# Iterate through the categories of the page
for page_category in page.categories():
# Check if the titles of the categories match
if page_category.title(with_ns=False) == category.title(with_ns=False):
return True
return False
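# Usage sketch for is_category_in_page (the "Kategorî:" prefix is optional):
#   is_category_in_page(page, 'Rojên salê')   # True if the page is in that category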
def zaravayen_din(categories: Iterator[pywikibot.Page]) -> bool:
"""
    Check whether the page is in the dialect ("zarava") categories.
    :param categories: an iterator of the page's category pages (e.g. page.categories())
    :return: True if a dialect category is on the page, otherwise False.
"""
kurdish_categories = [
"Gotara bi soranî",
"Gotara bi kirmaşanî",
"Gotara bi kurdiya başûr",
"Gotarên bi kurmanciya behdînî",
"Gotara bi zazakî"
]
page_categories = {c.title(with_ns=False) for c in categories}
return any(cat in page_categories for cat in kurdish_categories)
def ucfirst(parsed) -> str:
"""
:param parsed: text parsed by mwparserfromhell
    :return: stripped string with the first character uppercased. Use first_upper for plain strings.
"""
text = first_upper(str(parsed).strip())
text = text.replace('\u200e', '')
return text
def lcfirst(parsed) -> str:
"""
:param parsed: text parsed by mwparserfromhell
    :return: stripped string with the first character lowercased. Use first_lower for plain strings.
"""
text = first_lower(str(parsed).strip())
text = text.replace('\u200e', '')
return text
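# Usage sketch for ucfirst / lcfirst on mwparserfromhell nodes:
#   ucfirst(template.name)                  # e.g. 'şitil ' -> 'Şitil'
#   lcfirst(template.get('sinif').value)    # e.g. 'Şitil'  -> 'şitil'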
def get_sitelink(from_site, to_site, page_title):
"""
Retrieve the sitelink title for a page from one site to another site using Wikidata.
Args:
        from_site (str): dbName of the source site (e.g. 'enwiki' for English Wikipedia), i.e. site.dbName().
        to_site (str): dbName of the target site (e.g. 'kuwiki' for Kurdish Wikipedia), i.e. site.dbName().
page_title (str): The title of the page on the source site.
    Returns:
        str or None: The title of the page on the target site if found, otherwise None.
        Network errors and malformed responses are caught, reported via print() and also return None.
"""
url = "https://www.wikidata.org/w/api.php"
params = {
"action": "wbgetentities",
"sites": from_site,
"titles": page_title,
"props": "sitelinks",
"format": "json"
}
try:
response = requests.get(url, params=params)
response.raise_for_status() # Raise an HTTPError for bad responses
data = response.json()
# Check if the response contains the entities data
if "entities" not in data:
raise ValueError("The response does not contain 'entities'.")
entity = next(iter(data["entities"].values()))
# Check if the sitelinks exist in the entity and the target site is present
if "sitelinks" in entity and to_site in entity["sitelinks"]:
found_title = entity["sitelinks"][to_site]["title"]
return found_title
else:
return None
except requests.exceptions.RequestException as e:
print(f"An error occurred while making the request: {e}")
return None
except ValueError as e:
print(f"An error occurred with the response data: {e}")
return None
except KeyError as e:
print(f"An expected key is missing in the response data: {e}")
return None
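# Usage sketch for get_sitelink (hypothetical page title):
#   get_sitelink('enwiki', 'kuwiki', 'Kurdish language')   # -> the kuwiki title, or None if there is no sitelink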
def get_sitelinks_qid(qid: str, lang_codes: Union[str, List[str]]) -> dict:
"""
Retrieve sitelinks for the specified Wikidata QID and language codes.
:param qid: Wikidata QID
    :param lang_codes: a language code or a list of language codes (without the 'wiki' suffix); the suffix is appended automatically.
    :return: dict keyed by dbName, e.g. for ['ku', 'en'] a dict with 'kuwiki' and 'enwiki' keys (value is None when no sitelink exists).
"""
url = f"https://www.wikidata.org/w/api.php"
params = {
"action": "wbgetentities",
"format": "json",
"ids": qid,
"props": "sitelinks"
}
# Convert lang_codes to a list if it's a string
if isinstance(lang_codes, str):
lang_codes = [lang_codes]
try:
# Sending the API request
response = requests.get(url, params=params)
data = response.json()
result = {}
# Extracting titles of sitelinks for each language code
if 'sitelinks' in data['entities'][qid]:
sitelinks = data['entities'][qid]['sitelinks']
for lang_code in lang_codes:
lang_code_with_wiki = lang_code + 'wiki'
site_data = sitelinks.get(lang_code_with_wiki, None)
result[lang_code_with_wiki] = site_data['title'] if site_data else None
return result
else:
return {lang_code + 'wiki': None for lang_code in lang_codes}
except Exception as e:
print(f"An error occurred: {e}")
return {lang_code + 'wiki': None for lang_code in lang_codes}
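# Usage sketch for get_sitelinks_qid (Q42 is Douglas Adams on Wikidata):
#   get_sitelinks_qid('Q42', ['ku', 'en'])   # -> {'kuwiki': <title or None>, 'enwiki': 'Douglas Adams'}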
# from https://github.com/ashotjanibekyan/WikiPyScripts/blob/master/helpers.py
def without_comments(wiki_text):
    """Remove HTML comments (<!-- ... -->) from the given wiki text."""
    if wiki_text is None:
        return None
wikicode = mwparserfromhell.parse(wiki_text)
for node in wikicode.nodes[:]:
if isinstance(node, mwparserfromhell.nodes.Comment):
wikicode.remove(node)
return str(wikicode).strip()
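# Usage sketch for without_comments:
#   without_comments('Nivîs <!-- şîrove --> dawî')   # -> 'Nivîs  dawî'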
# from https://github.com/ashotjanibekyan/WikiPyScripts/blob/master/helpers.py
def matrix_to_wikitable(matrix):
    """Convert a matrix (first row is the header, remaining rows are data) into a sortable wikitable."""
    text = '{| class="wikitable sortable"\n'
text += '!' + '!!'.join(matrix[0]) + '\n'
for i in range(1, len(matrix)):
if isinstance(matrix[i], list) and len(matrix[i]) == len(matrix[0]):
row = (str(x) if x or x == 0 else ' ' for x in matrix[i])
text += '|-\n|' + '||'.join(row) + '\n'
text += '|}'
return text
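# Usage sketch for matrix_to_wikitable (first row is the header):
#   matrix_to_wikitable([['Nav', 'Hejmar'], ['A', 1], ['B', 0]])
#   # -> '{| class="wikitable sortable"\n!Nav!!Hejmar\n|-\n|A||1\n|-\n|B||0\n|}'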
def get_unhidden_categories(lang_code, page_title, withNS=False):
"""
Fetches the unhidden categories for a given Wikipedia page.
Parameters:
page_title (str): The title of the Wikipedia page to retrieve categories for.
Returns:
list: A list of unhidden category titles associated with the page.
"""
url = f"https://{lang_code}.wikipedia.org/w/api.php"
params = {
"action": "query",
"format": "json",
"prop": "categories",
"titles": page_title,
"clshow": "!hidden",
"cllimit": "max" # Increase the limit to get more categories if available
}
response = requests.get(url, params=params)
data = response.json()
pages = data.get("query", {}).get("pages", {})
unhidden_categories = []
for page_id, page_data in pages.items():
if 'categories' in page_data:
for category in page_data['categories']:
if withNS is True:
cat_title = category['title']
else:
                    # note: strips the Kurdish "Kategorî:" prefix; titles from other wikis keep their own prefix
                    cat_title = category['title'].replace("Kategorî:", "")
unhidden_categories.append(cat_title)
return unhidden_categories
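# Usage sketch for get_unhidden_categories (hypothetical page title):
#   get_unhidden_categories('ku', 'Zimanê kurdî')   # -> unhidden category titles, without the "Kategorî:" prefix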
def get_qid(site, title):
    """Return the Wikidata QID connected to the given title on the given site, or None if there is no item."""
    # Construct the Wikidata API URL
api_url = 'https://www.wikidata.org/w/api.php'
db_name = site.dbName()
params = {
'action': 'wbgetentities',
'sites': db_name,
'titles': title,
'props': 'claims|sitelinks',
'format': 'json'
}
# Make the API request
try:
response = requests.get(api_url, params=params)
response.raise_for_status() # Raise an exception for bad responses
except requests.exceptions.RequestException as e:
print(f"Error fetching data from Wikidata: {e}")
return None
data = response.json()
# Check if the response contains the item ID
entities = data.get('entities')
if not entities:
return None
    # Extract the item ID
    item_id = next(iter(entities))
    # wbgetentities returns a placeholder key (e.g. '-1') flagged with 'missing' when no item exists
    if 'missing' in entities[item_id]:
        return None
    return item_id
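# Usage sketch for get_qid (hypothetical title):
#   get_qid(pywikibot.Site('ku', 'wikipedia'), 'Zimanê kurdî')   # -> the connected item's QID, or None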
def get_pvalue(site, title, pvalue):
    """Return the value of a Wikidata property (e.g. 'P31') for the given title, or None if no item or claim exists."""
    # Construct the Wikidata API URL
api_url = 'https://www.wikidata.org/w/api.php'
db_name = site.dbName()
params = {
'action': 'wbgetentities',
'sites': db_name,
'titles': title,
'props': 'claims|sitelinks',
'format': 'json'
}
# Make the API request
try:
response = requests.get(api_url, params=params)
response.raise_for_status() # Raise an exception for bad responses
except requests.exceptions.RequestException as e:
print(f"Error fetching data from Wikidata: {e}")
return None
data = response.json()
# Check if the response contains the item ID
entities = data.get('entities')
if not entities:
print("no entiti")
return None
# Extract the item ID
item_id = next(iter(entities))
item_data = entities[item_id]
claims = item_data.get('claims', {})
property_claims = claims.get(pvalue, [])
if not property_claims:
print("tine")
return None
# Get the target value from the claim
property_claim = property_claims[0]
mainsnak = property_claim.get('mainsnak', {})
datavalue = mainsnak.get('datavalue', {})
value_type = datavalue.get('type', {})
value = datavalue.get('value', {})
if value_type == "wikibase-entityid":
property_value = value.get('id')
elif value_type == "string":
property_value = value
elif value_type == "monolingualtext":
property_value = value.get('text'), value.get('language')
else:
property_value = datavalue
return property_value
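# Usage sketch for get_pvalue (P31 is "instance of" on Wikidata; hypothetical title):
#   get_pvalue(site, 'Zimanê kurdî', 'P31')   # -> a QID for wikibase-entityid values, a str/tuple otherwise, or None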
# tweaked from https://gist.github.com/hesyifei/00f6ee0890ac3477b58e4d6b9c712fc2#file-deletepersondata-py-L29
def referring_page_generator(referred_page, follow_redirects=False,
with_template_inclusion=True,
only_template_inclusion=False,
total=None, content=False):
"""
Return a list of all pages that refer to or embed the page.
    This is a convenience wrapper that materializes referred_page.getReferences() into a list.
    :param referred_page: the pywikibot Page (e.g. a template page created with its namespace prefix) whose references are wanted
    :param follow_redirects: if True, also iterate pages that link to a redirect pointing to the page (default False)
    :param with_template_inclusion: if True, also iterate pages where the page is used as a template (default True)
    :param only_template_inclusion: if True, only iterate pages where the page is used as a template
    :param total: iterate no more than this number of pages in total
    :param content: if True, retrieve the content of the current version of each referring page (default False)
:return: a list of Pages
"""
gen = referred_page.getReferences(
follow_redirects=follow_redirects,
with_template_inclusion=with_template_inclusion,
only_template_inclusion=only_template_inclusion,
total=total, content=content)
    return list(gen)
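# Usage sketch for referring_page_generator (hypothetical template page):
#   template_page = pywikibot.Page(site, 'Şablon:Bêkat')
#   referring_page_generator(template_page, only_template_inclusion=True, total=50)   # pages transcluding the template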
def remove_sitil_class(page):
    """Remove the 'şitil' value from the 'sinif' (class) parameter of WikiProject banner templates on the talk page."""
    talk_page = page.toggleTalkPage()
if not talk_page.exists():
return
text = talk_page.text
wikicode = mwparserfromhell.parse(text)
for template in wikicode.filter_templates():
template_name = ucfirst(template.name)
if template_name in KALIK and template.has('sinif'):
sinif_val = lcfirst(template.get('sinif').value)
if sinif_val == 'şitil':
template.add('sinif', '')
new_text = str(wikicode)
if new_text != text:
talk_page.text = new_text
        talk_page.save(summary='Bot: Sinifa şitil hat rakirin')
def get_wordcount(page):
    # Tries to imitate en.wikipedia.org/wiki/Wikipedia:Prosesize, except that day ("Rojên salê") and year pages also count list items
html_content = page.get_parsed_page()
soup = BeautifulSoup(html_content, 'html.parser')
word_count = 0
# Find the .mw-parser-output div
parser_output = soup.find('div', class_='mw-parser-output')
if not parser_output:
if VERBOSE:
print("No .mw-parser-output found")
return False
# Calculate prose size and word count for each direct child paragraph within .mw-parser-output
for paragraph in parser_output.find_all('p', recursive=False):
# Remove <span class="mwe-math-element"> if it exists
for span in paragraph.find_all('span', class_='mwe-math-element'):
span.decompose()
        # Get the text content of the paragraph
        paragraph_text = paragraph.get_text()
        # Strip any leftover HTML tags (get_text() already drops tags, so this is only a safety net)
        clean_content = re.sub(r'<[^>]+>', '', paragraph_text)
clean_content = clean_content.strip()
if VERBOSE:
print(clean_content)
words = clean_content.split()
word_count += len(words)
contains_roj = is_category_in_page(page, "Rojên salê")
sal_templates = ["Gotara salê", "Gotara salê b.z."]
contains_sal = is_template_in_page(page.text, sal_templates)
if contains_roj or contains_sal:
# Find all <li> elements within .mw-parser-output and add their word counts
for li in parser_output.find_all('li'):
li_text = li.get_text()
if VERBOSE:
print(li_text)
words = li_text.split()
word_count += len(words)
if VERBOSE:
print("word_count: ", word_count)
return word_count
class TagHelpers:
@staticmethod
def is_sewi(page: pywikibot.page.BasePage) -> bool:
"""
        Checks whether a page is sêwî (an orphan), i.e. has no valid incoming article links.
        :param page: pywikibot.page.BasePage
        :return: True if sêwî, else False.
"""
incoming_links = list(page.getReferences(namespaces=[0]))
if len(incoming_links) < 1:
return True
        for link in incoming_links:
            # If the only incoming link is the page itself, it is sêwî
            if link.title() == page.title() and len(incoming_links) == 1:
                return True
            contains_zarava = zaravayen_din(link.categories())
            # Not sêwî if the linking page is not in a dialect category, not a redirect and not a disambiguation page
            if not contains_zarava and not link.isRedirectPage() and not link.isDisambig():
                return False
        # If all incoming links are redirects, disambiguations or dialect pages, it is sêwî
        return True
@staticmethod
def is_sitil(page):
def check_wordcount():
            # en:WP:AWB adds stub tags to articles of about 300 words or fewer
return get_wordcount(page) <= 300
def is_sitil_class():
talk_page = page.toggleTalkPage()
if not talk_page.exists():
return None
text = talk_page.text
wikicode = mwparserfromhell.parse(text)
for template in wikicode.filter_templates():
template_name = ucfirst(template.name)
if template_name in KALIK and template.has('sinif'):
sinif_val = lcfirst(template.get('sinif').value)
if not sinif_val:
return None
if sinif_val == 'şitil':
return True
elif sinif_val == 'lîste':
return 'lîste'
else:
return False
if is_sitil_class() == "lîste":
if VERBOSE:
print("lîste")
return "lîste"
elif check_wordcount() is True and is_sitil_class() is True:
if VERBOSE:
print("both true")
return True
elif check_wordcount() is True and is_sitil_class() is None:
if VERBOSE:
print("yek none")
return True
else:
if VERBOSE:
print(is_sitil_class())
print("both false or ")
return False
@staticmethod
def is_bekategori(page):
"""
        Returns True if the page is bêkategorî (uncategorized), False if it is properly categorized (or is itself
        a hidden category), and 'idk' if one of its categories does not exist yet. Handles namespaces 0 and 14.
"""
lang_code = page.site.code
title = page.title()
unhidden_cats = get_unhidden_categories(lang_code, title, withNS=True)
        if page.namespace() == 14:
            # it is a category page
            if page.isCategoryRedirect():
                return False
            if page.title() == "Kategorî:!Serkategorî":
                # !Serkategorî is always uncategorized by design, so there is no need to check it
                return False
            all_cats = [cat.title() for cat in page.categories()]
            if len(all_cats) == 0:
                # it has no categories at all, so it is bêkategorî
                return True
            vesarti_heye = is_category_in_page(page, 'Kategoriyên veşartî')
            if vesarti_heye:
                # it is a hidden category, so it is not bêkategorî
                return False
            if not vesarti_heye and len(unhidden_cats) == 0:
                # not hidden and has no unhidden categories, so it is bêkategorî
                return True
            for cat in unhidden_cats:
                cat_page = pywikibot.Page(page.site, cat)
                if not cat_page.exists():
                    # one of its categories has not been created yet, so we do not know
                    return 'idk'
            # got this far, so it is not bêkategorî
            return False
        elif page.namespace() == 0:
            # it is an article
            if len(unhidden_cats) == 0:
                # it has no unhidden categories, so it is bêkategorî
                return True
            for cat in unhidden_cats:
                cat_page = pywikibot.Page(page.site, cat)
                if not cat_page.exists():
                    # one of its categories has not been created yet, so we do not know
                    return 'idk'
            # got this far, so it is not bêkategorî
            return False
else:
return 'idk'