#!/usr/bin/env python3
"""
python pwb.py updatewin -file:"mytools.py" -s:"+fix"


Tools:

ucfirst

lcfirst

without_comments

get_cur_month_year	Returns current month and year as a string in Kurdish

get_cat_members	Retrieve all members of a category that belong to a given namespace

get_unhidden_categories Fetches the unhidden categories for a page

get_sitelinks   Retrieve sitelinks for the title and dbNames
get_sitelinks_qid	Retrieve sitelinks for the QID and language codes.

get_template_redirects	Return a list of redirects of the given template.

is_template_in_page	Check if a given template or its redirects is included in the page text.

is_category_in_page	Check if a given category is included in the categories of a page.

remove_template	Remove template from wiki text.

remove_sitil_class

zaravayen_din	Kontrol bike eger sayfe di kategoriyên zaravayan de ye.

matrix_to_wikitable

"""
import re
import datetime
import requests
import pywikibot
import mwparserfromhell
from bs4 import BeautifulSoup
from functools import lru_cache
from typing import List, Union, Iterator
from pywikibot.tools import first_lower, first_upper

VERBOSE = True

SITIL = ['Stub', 'Kurt', 'Şitlek', 'Şitl', 'Şitil']
KALIK = ['WPB', 'WikiProject banner shell', 'WikiProject Shell', 'Bannershell', 'WPBS', 'WikiProject banner',
         'Kalika Wîkîprojeyê', 'Kalika wîkîprojeyê']
UNCAT_TEMPL = ['Uncategorized', 'Bêkat', 'Uncategorized stub', 'Bêkategorî']

ku_months = {
    1: 'kanûna paşîn',
    2: 'sibat',
    3: 'adar',
    4: 'nîsan',
    5: 'gulan',
    6: 'hezîran',
    7: 'tîrmeh',
    8: 'tebax',
    9: 'îlon',
    10: 'çiriya pêşîn',
    11: 'çiriya paşîn',
    12: 'kanûna pêşîn'
}


def get_cur_month_year() -> str:
    """
    Return the current month and year as a string in Kurdish.

    :return: month year
    """
    now = datetime.datetime.now()
    return f"{ku_months[now.month]} {now.year}"
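
# Usage sketch: the returned string can be used in dated maintenance templates
# (cf. UNCAT_TEMPL below); the exact output here is only illustrative.
#
#   tarix = get_cur_month_year()   # e.g. "gulan 2024"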


@lru_cache(maxsize=None)
def get_cat_members(site: pywikibot.site.BaseSite, category_name: str, namespace: int, itr_page=False,
                    recursive=False) -> List[str]:
    """
    Retrieve all members of a category that belong to a given namespace.

    :param site: The Pywikibot site object representing the target wiki.
    :param category_name: The name of the category from which to retrieve members.
    :param namespace: The namespace number to filter the category members by.
    :param itr_page: If True, return a list of pywikibot Page objects instead of titles.
    :param recursive: If True, also recurse into all subcategories.
    :return: A list of the category members in the specified namespace, as titles without namespace (or as Page objects if itr_page is True).
    """
    category = pywikibot.Category(site, 'Category:' + category_name)
    members_list = []

    for member in category.members(recurse=recursive):
        if member.namespace() == namespace:
            if itr_page:
                members_list.append(member)
            else:
                members_list.append(member.title(with_ns=False))

    return members_list
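
# Usage sketch (hedged): the site and the category name 'Fîzîk' below are
# illustrative assumptions, not values used elsewhere in this module.
#
#   site = pywikibot.Site('ku', 'wikipedia')
#   titles = get_cat_members(site, 'Fîzîk', 0)
#   subcats = get_cat_members(site, 'Fîzîk', 14, itr_page=True, recursive=True)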


@lru_cache(maxsize=None)
def get_template_redirects(site, template_title):
    """
    Return a list of redirects of the given template.

    :param site: pywikibot Site
    :param template_title: template title without the namespace prefix ("Şablon:")
    :return: list of redirect titles (without namespace), plus the template's own title
    """
    template_title = "Template:" + template_title
    template_page = pywikibot.Page(site, template_title)
    redirects = template_page.backlinks(filter_redirects=True, namespaces=[10])
    redirect_titles = [redirect.title(with_ns=False) for redirect in redirects]
    redirect_titles.append(template_page.title(with_ns=False))

    return redirect_titles
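
# Usage sketch: collect every name under which a stub template can appear;
# SITIL[0] is used here purely as an illustration.
#
#   site = pywikibot.Site('ku', 'wikipedia')
#   redirect_names = get_template_redirects(site, SITIL[0])
#   # redirect_names can then be passed to is_template_in_page() or remove_template()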


def is_liste(site: pywikibot.site.BaseSite, categories: List[str]) -> bool:
    """
    Checks if a given set of categories contains Lîste or its subcategories.

    :param site: Pywikibot site object
    :param categories: A set of category names to check.
    :return: True if the categories contain a list category, otherwise False
    """
    categories = set(categories)

    # Fetch all subcategories of 'Lîste', recursing through every level
    list_cats = set(get_cat_members(site, 'Lîste', 14, recursive=True))
    # Add 'Lîste' itself to the set of list categories
    list_cats.add('Lîste')

    # Find intersection between the given categories and the list categories
    list_intersection = categories.intersection(list_cats)

    # Return True if there is any intersection, False otherwise
    return len(list_intersection) > 0


def remove_template(text: str, template_redirects) -> str:
    """
    Remove specified template from wiki text.

    :param text: Wiki text
    :param template_redirects: List of template names or a single template name as a string
    :return: str Wiki text
    """
    if isinstance(template_redirects, str):
        template_redirects = [template_redirects]

    wikicode = mwparserfromhell.parse(text)

    for template in wikicode.filter_templates():
        template_name = ucfirst(template.name)
        if template_name in template_redirects:
            wikicode.remove(template)

    return str(wikicode)
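
# Usage sketch on illustrative wikitext:
#
#   text = "{{Şitil}}\n'''Gotar''' destpêk e."
#   text = remove_template(text, SITIL)   # removes the {{Şitil}} call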


def is_template_in_page(text: str, template_redirects) -> bool:
    """
    Check if a given template or one of its redirects is included in the page text.

    :param text: wikitext
    :param template_redirects: a template name or a list of template names (a str is turned into a list)
    :return: True if one of the templates is used in the text, False otherwise.
    """
    if isinstance(template_redirects, str):
        template_redirects = [template_redirects]

    wikicode = mwparserfromhell.parse(text)

    for template in wikicode.filter_templates():
        template_name = ucfirst(template.name)
        if template_name in template_redirects:
            return True

    return False


def is_category_in_page(page: pywikibot.page.BasePage, category_title: str) -> bool:
    """
    Check if a given category is included in the categories of a page.

    :param page: A Pywikibot page object.
    :param category_title: The title of the category to check (the 'Kategorî:' prefix is optional).
    :return: True if the category is included in the page categories, False otherwise.
    """
    if not page or not page.exists():
        return False

    category_title = category_title.strip()
    category = pywikibot.Category(page.site, category_title)

    # Iterate through the categories of the page
    for page_category in page.categories():
        # Check if the titles of the categories match
        if page_category.title(with_ns=False) == category.title(with_ns=False):
            return True

    return False


def zaravayen_din(categories: Iterator[pywikibot.Page]) -> bool:
    """
    Check whether the page is in the dialect categories.

    :param categories: iterator of pywikibot category pages (e.g. page.categories())
    :return: True if a dialect category is on the page, otherwise False.
    """
    kurdish_categories = [
        "Gotara bi soranî",
        "Gotara bi kirmaşanî",
        "Gotara bi kurdiya başûr",
        "Gotarên bi kurmanciya behdînî",
        "Gotara bi zazakî"
    ]

    page_categories = {c.title(with_ns=False) for c in categories}

    return any(cat in page_categories for cat in kurdish_categories)


def ucfirst(parsed) -> str:
    """
    :param parsed: text parsed by mwparserfromhell
    :return: stripped string with the first character uppercased and left-to-right
        marks (U+200E) removed. Use first_upper for plain strings.
    """
    text = first_upper(str(parsed).strip())
    text = text.replace('\u200e', '')
    return text


def lcfirst(parsed) -> str:
    """
    :param parsed: text parsed by mwparserfromhell
    :return: stripped string with the first character lowercased and left-to-right
        marks (U+200E) removed. Use first_lower for plain strings.
    """
    text = first_lower(str(parsed).strip())
    text = text.replace('\u200e', '')
    return text


def get_sitelink(from_site, to_site, page_title):
    """
    Retrieve the sitelink title for a page from one site to another site using Wikidata.

    Args:
        from_site (str): dbName of the source site (e.g. 'enwiki' for English Wikipedia); use site.dbName().
        to_site (str): dbName of the target site (e.g. 'kuwiki' for Kurdish Wikipedia); use site.dbName().
        page_title (str): The title of the page on the source site.

    Returns:
        str or None: The title of the page on the target site if found, otherwise None.

    Note:
        Request and response errors (requests.exceptions.RequestException,
        ValueError, KeyError) are caught, printed, and converted to None.
    """
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "sites": from_site,
        "titles": page_title,
        "props": "sitelinks",
        "format": "json"
    }

    try:
        response = requests.get(url, params=params)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        data = response.json()

        # Check if the response contains the entities data
        if "entities" not in data:
            raise ValueError("The response does not contain 'entities'.")

        entity = next(iter(data["entities"].values()))

        # Check if the sitelinks exist in the entity and the target site is present
        if "sitelinks" in entity and to_site in entity["sitelinks"]:
            found_title = entity["sitelinks"][to_site]["title"]
            return found_title
        else:
            return None

    except requests.exceptions.RequestException as e:
        print(f"An error occurred while making the request: {e}")
        return None
    except ValueError as e:
        print(f"An error occurred with the response data: {e}")
        return None
    except KeyError as e:
        print(f"An expected key is missing in the response data: {e}")
        return None
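
# Usage sketch; the page title is illustrative:
#
#   ku_title = get_sitelink('enwiki', 'kuwiki', 'Earth')
#   # -> the kuwiki title if a sitelink exists, otherwise None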


def get_sitelinks_qid(qid: str, lang_codes: Union[str, List[str]]) -> dict:
    """
    Retrieve sitelinks for the specified Wikidata QID and language codes.

    :param qid: Wikidata QID
    :param lang_codes: a language code or a list of language codes (without the 'wiki' suffix); the suffix is appended automatically.
    :return: dict keyed by dbName; e.g. for ['ku', 'en'], a dict with 'kuwiki' and 'enwiki' keys. Missing sitelinks map to None.
    """
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "format": "json",
        "ids": qid,
        "props": "sitelinks"
    }

    # Convert lang_codes to a list if it's a string
    if isinstance(lang_codes, str):
        lang_codes = [lang_codes]

    try:
        # Sending the API request
        response = requests.get(url, params=params)
        data = response.json()
        result = {}
        # Extracting titles of sitelinks for each language code
        if 'sitelinks' in data['entities'][qid]:
            sitelinks = data['entities'][qid]['sitelinks']
            for lang_code in lang_codes:
                lang_code_with_wiki = lang_code + 'wiki'
                site_data = sitelinks.get(lang_code_with_wiki, None)
                result[lang_code_with_wiki] = site_data['title'] if site_data else None
            return result
        else:
            return {lang_code + 'wiki': None for lang_code in lang_codes}
    except Exception as e:
        print(f"An error occurred: {e}")
        return {lang_code + 'wiki': None for lang_code in lang_codes}
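
# Usage sketch: Q2 (Earth) is used purely as an illustration.
#
#   links = get_sitelinks_qid('Q2', ['ku', 'en'])
#   # -> {'kuwiki': ..., 'enwiki': ...}; a value is None if no sitelink exists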


# from https://github.com/ashotjanibekyan/WikiPyScripts/blob/master/helpers.py
def without_comments(wiki_text):
    """Return wiki_text with all HTML comments (<!-- ... -->) removed, or None if wiki_text is None."""
    if wiki_text is None:
        return None
    wikicode = mwparserfromhell.parse(wiki_text)
    for node in wikicode.nodes[:]:
        if isinstance(node, mwparserfromhell.nodes.Comment):
            wikicode.remove(node)
    return str(wikicode).strip()
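
# Usage sketch with illustrative wikitext:
#
#   without_comments("Gotar<!-- şirove --> e.")   # -> 'Gotar e.'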


# from https://github.com/ashotjanibekyan/WikiPyScripts/blob/master/helpers.py
def matrix_to_wikitable(matrix):
    """Convert a matrix (first row = headers) into a sortable wikitable; rows whose length differs from the header row are skipped."""
    text = '{| class="wikitable sortable"\n'
    text += '!' + '!!'.join(matrix[0]) + '\n'
    for i in range(1, len(matrix)):
        if isinstance(matrix[i], list) and len(matrix[i]) == len(matrix[0]):
            row = (str(x) if x or x == 0 else ' ' for x in matrix[i])
            text += '|-\n|' + '||'.join(row) + '\n'
    text += '|}'
    return text
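
# Usage sketch with an illustrative two-column matrix:
#
#   matrix_to_wikitable([['Nav', 'Hejmar'], ['Erd', 1]])
#   # -> '{| class="wikitable sortable"\n!Nav!!Hejmar\n|-\n|Erd||1\n|}'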


def get_unhidden_categories(lang_code, page_title, withNS=False):
    """
    Fetch the unhidden categories of a given Wikipedia page via the API.

    Parameters:
    lang_code (str): Language code of the wiki (e.g. 'ku').
    page_title (str): The title of the page to retrieve categories for.
    withNS (bool): If True, keep the namespace prefix; otherwise strip the 'Kategorî:' prefix (assumes the Kurdish namespace name).

    Returns:
    list: A list of unhidden category titles associated with the page.
    """
    url = f"https://{lang_code}.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "prop": "categories",
        "titles": page_title,
        "clshow": "!hidden",
        "cllimit": "max"  # Increase the limit to get more categories if available
    }

    response = requests.get(url, params=params)
    data = response.json()
    pages = data.get("query", {}).get("pages", {})

    unhidden_categories = []
    for page_id, page_data in pages.items():
        if 'categories' in page_data:
            for category in page_data['categories']:
                if withNS is True:
                    cat_title = category['title']
                else:
                    cat_title = category['title'].replace("Kategorî:", "")
                unhidden_categories.append(cat_title)

    return unhidden_categories
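
# Usage sketch; the page title is illustrative:
#
#   cats = get_unhidden_categories('ku', 'Erd')
#   # -> visible category titles, with the 'Kategorî:' prefix stripped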


def get_qid(site, title):
    """Return the Wikidata QID for the given title on the given site, or None if not found."""
    # Construct the Wikidata API URL
    api_url = 'https://www.wikidata.org/w/api.php'
    db_name = site.dbName()
    params = {
        'action': 'wbgetentities',
        'sites': db_name,
        'titles': title,
        'props': 'claims|sitelinks',
        'format': 'json'
    }

    # Make the API request
    try:
        response = requests.get(api_url, params=params)
        response.raise_for_status()  # Raise an exception for bad responses
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from Wikidata: {e}")
        return None

    data = response.json()

    # Check if the response contains the item ID
    entities = data.get('entities')
    if not entities:
        return None

    # Extract the item ID; the API returns the key '-1' when no item matches the title
    item_id = next(iter(entities))
    if item_id == '-1':
        return None

    return item_id


def get_pvalue(site, title, pvalue):
    """Return the value of the given property (e.g. 'P31') from the first claim of the item linked to the title, or None."""
    # Construct the Wikidata API URL
    api_url = 'https://www.wikidata.org/w/api.php'
    db_name = site.dbName()
    params = {
        'action': 'wbgetentities',
        'sites': db_name,
        'titles': title,
        'props': 'claims|sitelinks',
        'format': 'json'
    }

    # Make the API request
    try:
        response = requests.get(api_url, params=params)
        response.raise_for_status()  # Raise an exception for bad responses
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from Wikidata: {e}")
        return None

    data = response.json()
    # Check if the response contains the item ID
    entities = data.get('entities')
    if not entities:
        print("no entity")
        return None

    # Extract the item ID; the API returns the key '-1' when no item matches the title
    item_id = next(iter(entities))
    if item_id == '-1':
        return None
    item_data = entities[item_id]

    claims = item_data.get('claims', {})
    property_claims = claims.get(pvalue, [])
    if not property_claims:
        print("tine")
        return None

    # Get the target value from the claim
    property_claim = property_claims[0]
    mainsnak = property_claim.get('mainsnak', {})
    datavalue = mainsnak.get('datavalue', {})

    value_type = datavalue.get('type')
    value = datavalue.get('value', {})
    if value_type == "wikibase-entityid":
        property_value = value.get('id')
    elif value_type == "string":
        property_value = value
    elif value_type == "monolingualtext":
        property_value = value.get('text'), value.get('language')
    else:
        property_value = datavalue

    return property_value
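
# Usage sketch: fetch the first P31 ('instance of') claim; the title below is
# illustrative.
#
#   site = pywikibot.Site('ku', 'wikipedia')
#   value = get_pvalue(site, 'Erd', 'P31')   # e.g. a QID string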


# tweaked from https://gist.github.com/hesyifei/00f6ee0890ac3477b58e4d6b9c712fc2#file-deletepersondata-py-L29
def referring_page_generator(referred_page, follow_redirects=False,
                             with_template_inclusion=True,
                             only_template_inclusion=False,
                             total=None, content=False):
    """
    Return a list of all pages that refer to or embed the page.
    If you need a full list of referring pages, use pages = list(s.getReferences())

    :param referred_page: Template name withNS=True
    :param follow_redirects: if True, also iterate pages that link to a redirect pointing to the page. (default true)
    :param with_template_inclusion: if True, also iterate pages where self is used as a template. (default False)
    :param only_template_inclusion: if True, only iterate pages where self is used as a template.
    :param total: iterate no more than this number of pages in total
    :param content: if True, retrieve the content of the current version of each referring page (default False)
    :return: a list of Pages
    """
    gen = referred_page.getReferences(
        follow_redirects=follow_redirects,
        with_template_inclusion=with_template_inclusion,
        only_template_inclusion=only_template_inclusion,
        total=total, content=content)

    page_list = []
    for page in gen:
        page_list.append(page)
    return page_list


def remove_sitil_class(page):
    """Blank the 'sinif' parameter on talk-page project banners (KALIK) where it is set to 'şitil'."""
    talk_page = page.toggleTalkPage()
    if not talk_page.exists():
        return
    text = talk_page.text
    wikicode = mwparserfromhell.parse(text)
    for template in wikicode.filter_templates():
        template_name = ucfirst(template.name)
        if template_name in KALIK and template.has('sinif'):
            sinif_val = lcfirst(template.get('sinif').value)
            if sinif_val == 'şitil':
                template.add('sinif', '')

    new_text = str(wikicode)
    if new_text != text:
        talk_page.text = new_text
        talk_page.save(summary='Bot: Sinifa şitil hat rakirin')


def get_wordcount(page):
    """Count prose words, imitating en.wikipedia.org/wiki/Wikipedia:Prosesize, except that list items are also counted on day ('Rojên salê') and year pages. Returns False if no parser output is found."""
    html_content = page.get_parsed_page()
    soup = BeautifulSoup(html_content, 'html.parser')
    word_count = 0

    # Find the .mw-parser-output div
    parser_output = soup.find('div', class_='mw-parser-output')

    if not parser_output:
        if VERBOSE:
            print("No .mw-parser-output found")
        return False

    # Calculate prose size and word count for each direct child paragraph within .mw-parser-output
    for paragraph in parser_output.find_all('p', recursive=False):
        # Remove <span class="mwe-math-element"> if it exists
        for span in paragraph.find_all('span', class_='mwe-math-element'):
            span.decompose()

        # Get the plain text of the paragraph
        paragraph_text = paragraph.get_text()

        # get_text() already drops the tags; this regex is a leftover safety net
        clean_content = re.sub(r'<[^>]+>', '', paragraph_text)

        clean_content = clean_content.strip()

        if VERBOSE:
            print(clean_content)

        words = clean_content.split()
        word_count += len(words)

    contains_roj = is_category_in_page(page, "Rojên salê")
    sal_templates = ["Gotara salê", "Gotara salê b.z."]
    contains_sal = is_template_in_page(page.text, sal_templates)
    if contains_roj or contains_sal:
        # Find all <li> elements within .mw-parser-output and add their word counts
        for li in parser_output.find_all('li'):
            li_text = li.get_text()
            if VERBOSE:
                print(li_text)
            words = li_text.split()
            word_count += len(words)

    if VERBOSE:
        print("word_count: ", word_count)
    return word_count


class TagHelpers:

    @staticmethod
    def is_sewi(page: pywikibot.page.BasePage) -> bool:
        """
        Checks whether a page is an orphan (sêwî), i.e. has no real incoming links from articles.

        :param page: pywikibot.page.BasePage
        :return: True if the page is an orphan (sêwî), else False.
        """
        incoming_links = list(page.getReferences(namespaces=[0]))

        if len(incoming_links) < 1:
            return True

        for link in incoming_links:

            # If the only incoming link is the page itself, return sewî
            if link.title() == page.title() and len(incoming_links) == 1:
                return True

            contains_zarava = zaravayen_din(link.categories())

            # Return not sewî if the link is not in zarava categories, not a redirect, and not a disambiguation page
            if not contains_zarava and not link.isRedirectPage() and not link.isDisambig():
                return False

        # If all links are redirects or disambiguations, return sewî
        return True

    @staticmethod
    def is_sitil(page):

        def check_wordcount():
            # en:WP:AWB adds stub tags for 300 words
            return get_wordcount(page) <= 300

        def is_sitil_class():

            talk_page = page.toggleTalkPage()
            if not talk_page.exists():
                return None

            text = talk_page.text
            wikicode = mwparserfromhell.parse(text)
            for template in wikicode.filter_templates():
                template_name = ucfirst(template.name)
                if template_name in KALIK and template.has('sinif'):
                    sinif_val = lcfirst(template.get('sinif').value)
                    if not sinif_val:
                        return None

                    if sinif_val == 'şitil':
                        return True
                    elif sinif_val == 'lîste':
                        return 'lîste'
                    else:
                        return False
        if is_sitil_class() == "lîste":
            if VERBOSE:
                print("lîste")
            return "lîste"
        elif check_wordcount() is True and is_sitil_class() is True:
            if VERBOSE:
                print("both true")
            return True
        elif check_wordcount() is True and is_sitil_class() is None:
            if VERBOSE:
                print("yek none")
            return True
        else:
            if VERBOSE:
                print(is_sitil_class())
                print("both false or ")
            return False

    @staticmethod
    def is_bekategori(page):
        """
        Returns true if bêkategorî, false if all categories are hidden, return idk if a cat doesnt exist. For ns:0 only
        """
        lang_code = page.site.code
        title = page.title()
        unhidden_cats = get_unhidden_categories(lang_code, title, withNS=True)

        if page.namespace() == 14:
            # this is a category page

            if page.isCategoryRedirect():
                return False

            if page.title() == "Kategorî:!Serkategorî":
                # !Serkategorî is always uncategorized, so no need to tag it
                return False

            all_cats = [cat.title() for cat in page.categories()]
            if len(all_cats) == 0:
                # it has no categories at all, so it is uncategorized
                return True

            vesarti_heye = is_category_in_page(page, 'Kategoriyên veşartî')

            if vesarti_heye:
                # it is a hidden category, so not uncategorized
                return False

            if not vesarti_heye and len(unhidden_cats) == 0:
                # not hidden and it has no unhidden categories, so uncategorized
                return True

            for cat in unhidden_cats:
                cat_page = pywikibot.Page(page.site, cat)
                if not cat_page.exists():
                    # a category has not been created yet, so we don't know
                    return 'idk'
            # got this far, so it is not uncategorized
            return False
        elif page.namespace() == 0:
            # this is an article

            if len(unhidden_cats) == 0:
                # it has no unhidden categories, so it is uncategorized
                return True

            for cat in unhidden_cats:
                cat_page = pywikibot.Page(page.site, cat)
                if not cat_page.exists():
                    # a category has not been created yet, so we don't know
                    return 'idk'
            # got this far, so it is not uncategorized
            return False
        else:
            return 'idk'