#!/usr/bin/python3
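"""Collect kuwiki pages that may need categorization and pass them to the
"categorize" pywikibot script.

Workflow:
1. Query the kuwiki replica for categories created within the last day that
   are not hidden or tracking categories and that have an English sitelink.
2. For each eligible English counterpart category, query the enwiki replica
   for member pages (articles and subcategories) that have a ku sitelink.
3. Write the collected kuwiki page titles to getlisteyacategorize.txt and run
   "pwb categorize" over that file.
"""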
import pywikibot
import pymysql
import os

def list_intersection(list1, list2):
    """Return the elements of list1 that also appear in list2."""
    return [value for value in list1 if value in list2]

def check_eligibility_en(candidate):
    """Determine whether the English category is eligible (addable)."""
    cat = pywikibot.Page(pywikibot.Site("en", "wikipedia"), "Category:%s" % candidate)
    cat_cats = get_existing_cats(cat)
    ineligible_parents = [
        "Hidden categories",
        "Tracking categories",
        "Turkish people by occupation",
        "Cities in Turkey",
        "Turkish Kurdistan",
        "Iraqi Kurdistan",
        "Syrian Kurdistan",
        "Iranian Kurdistan",
        "Stub categories",
    ]
    # Skip maintenance-style categories; the comparison is case-insensitive,
    # so the patterns are given in lowercase.
    excluded_patterns = ["articles", "wikiproject", "user"]
    if any(pattern in candidate.lower() for pattern in excluded_patterns):
        return False
    # Skip categories that belong to any of the ineligible parent categories.
    if len(list_intersection(ineligible_parents, cat_cats)) > 0:
        return False
    return True

def get_existing_cats(page):
    """Get a list of the categories the page is in (titles without namespace)."""
    cat_titles = []
    for c in page.categories():
        cat_titles.append(c.title(with_ns=False))
    return cat_titles

def getList(lang_code, query_name, file_name, query_params=None):
    """Run a SQL query against the <lang_code> wiki replica and write the first
    column of every result row to a temporary file, one title per line.
    Returns the path of that temporary file."""
    # Database connection details
    # Note: on Toolforge the username and password are read from ~/replica.my.cnf,
    # so the explicit credential variables below can stay commented out.
    db_hostname_format = lang_code + "wiki.analytics.db.svc.wikimedia.cloud"  # Hostname of the database server
    db_port = 3306  # Port number of the database server
    # db_username = ""  # Add your database username credential (if not using Toolforge)
    # db_password = ""  # Add your database password credential (if not using Toolforge)
    db_name_format = lang_code + "wiki_p"  # Name of the target database
    db_connect_file = "~/replica.my.cnf"  # Path to the "my.cnf" credentials file

    # Create a connection to the database
    connection = pymysql.connect(
        host=db_hostname_format,
        port=db_port,
        # user=db_username,
        # password=db_password,
        database=db_name_format,
        read_default_file=db_connect_file,  # user and password are read from the [client] section of this file
        charset='utf8'
    )

    # Create a cursor and select the <lang_code>wiki_p database
    cursor = connection.cursor()
    cursor.execute("USE " + lang_code + "wiki_p;")

    # Execute the query, with parameters if provided
    if query_params is not None:
        cursor.execute(query_name, query_params)
    else:
        cursor.execute(query_name)

    # Fetch the results
    results = cursor.fetchall()

    # Close the cursor and the database connection
    cursor.close()
    connection.close()

    toolforge_home = os.getenv('HOME', '/data/project/balyozbot/')

    # Write the results to a temporary file. The file is created even when the
    # query returns nothing, so callers can always open and later remove it.
    temp_output_file_path = os.path.join(toolforge_home, file_name + '_temp')
    with open(temp_output_file_path, 'w', encoding='utf-8') as output_file:
        for result in results:
            page_title = result[0]  # The queries used here return a single column
            if isinstance(page_title, bytes):  # replica columns come back as bytes
                page_title = page_title.decode('utf-8')
            page_title = page_title.replace('_', ' ')
            output_file.write(f"{page_title}\n")
    if not results:
        print("No results found from the query.")
    return temp_output_file_path
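
# Query: kuwiki categories created within the last day that are not hidden,
# not tagged with the tracking template "Kategoriya çavdêriyê", and that have
# an English sitelink; returns the English (enwiki) sitelink titles.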
ku_query = """
SELECT
    langlinks.ll_title AS enwiki_sitelink
FROM
    categorylinks
JOIN
    page ON cl_from = page_id
LEFT JOIN
    page_props ON page.page_id = page_props.pp_page AND page_props.pp_propname = 'hiddencat'
LEFT JOIN (
    SELECT
        rev_page,
        MIN(rev_timestamp) AS first_revision_timestamp
    FROM
        revision
    GROUP BY
        rev_page
) AS first_revision ON page_id = first_revision.rev_page
LEFT JOIN
    pagelinks ON page_id = pagelinks.pl_from
INNER JOIN
    langlinks ON cl_from = langlinks.ll_from AND langlinks.ll_lang = 'en'
WHERE
    page_namespace = 14
    AND page.page_touched >= NOW() - INTERVAL 1 DAY
    AND first_revision.first_revision_timestamp >= NOW() - INTERVAL 1 DAY
    AND page_props.pp_page IS NULL
    AND NOT EXISTS (
        SELECT
            1
        FROM
            templatelinks
        JOIN
            page AS template_page ON tl_from = template_page.page_id
        JOIN
            revision AS template_revision ON template_page.page_latest = template_revision.rev_id
        WHERE
            tl_from = cl_from
            AND template_page.page_namespace = 10
            AND template_page.page_title = 'Kategoriya_çavdêriyê'
    )
    AND NOT EXISTS (
        SELECT
            1
        FROM
            templatelinks
        JOIN
            linktarget ON lt_id = tl_target_id
        WHERE
            tl_from = cl_from
            AND linktarget.lt_namespace = 10
            AND linktarget.lt_title = 'Kategoriya_çavdêriyê'
    )
GROUP BY
    enwiki_sitelink
ORDER BY
    MAX(cl_timestamp) DESC;
"""

en_file_name = 'getlisteyacategorizeEnKategori'
ku_file_name = 'getlisteyacategorizeKuKategori'
#en_file_path = getList("en", en_query, en_file_name)
ku_file_path = getList("ku", ku_query, ku_file_name)

toolforge_home = os.getenv('HOME', '/data/project/balyozbot/')
output_file_path = os.path.join(toolforge_home, 'getlisteyacategorize.txt')

# Clear the output file by opening it in write mode ('w')
with open(output_file_path, 'w', encoding='utf-8') as clear_file:
    clear_file.write('')

with open(ku_file_path, 'r', encoding='utf-8') as ku_file:
    ku_results = set(ku_file.read().splitlines())

unique_results_set = set()
for enwiki_sitelink in ku_results:
    # langlinks stores the sitelink with its "Category:" namespace prefix;
    # strip it before the eligibility check and the SQL lookup, both of which
    # expect the bare category name.
    enwiki_sitelink = enwiki_sitelink.replace('Category:', '')
    if check_eligibility_en(enwiki_sitelink):
        enwiki_sitelink = enwiki_sitelink.replace(' ', '_')
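        # For the matching enwiki category, fetch the kuwiki titles (via the
        # ku langlinks) of its member articles and subcategories.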
        en_query = """
        SELECT
            ll.ll_title AS ku_page_title
        FROM
            page
        INNER JOIN langlinks AS ll ON ll.ll_from = page.page_id AND ll.ll_lang = 'ku'
        INNER JOIN categorylinks AS cl ON cl.cl_from = page.page_id
        WHERE
            page.page_namespace IN (0, 14)
            AND cl.cl_to = %s
        GROUP BY
            ku_page_title
        ORDER BY ku_page_title;
        """
        en_query_result_path = getList("en", en_query, en_file_name, (enwiki_sitelink,))
        try:
            with open(en_query_result_path, 'r', encoding='utf-8') as en_query_result_file:
                en_query_results = set(en_query_result_file.read().splitlines())
            # Append only results that have not been seen before to the output file
            for en_result in en_query_results:
                if en_result not in unique_results_set:
                    unique_results_set.add(en_result)
                    with open(output_file_path, 'a', encoding='utf-8') as output_file:
                        output_file.write(f"{en_result}\n")
        except Exception as e:
            print(f"Error processing enwiki_sitelink '{enwiki_sitelink}': {str(e)}")
        # Remove the temporary en_query result file
        os.remove(en_query_result_path)

os.remove(ku_file_path)

# Check whether the output file is empty
with open(output_file_path, 'r', encoding='utf-8') as check_file:
    file_content = check_file.read().strip()

# Count the number of page titles collected
num_lines = len(file_content.splitlines()) if file_content else 0

# If the file is not empty, print the number of pages and run the categorization
if num_lines > 0:
    print(f"Working on {num_lines} pages")
    os.system(f"$HOME/pwbvenv/bin/pwb categorize -always -lang:ku -family:wikipedia -file:{output_file_path}")
else:
    print("No changes detected.")