SQL
x
SELECT
pagecats.cl_to,
talkcats.cl_to,
-- REGEXP_EXTRACT(talkcats.cl_to, '(?WikiProject_|.*_importance_|.*_priority_|Unassessed)(.*)'), -- does not work
COUNT(*) AS my_count
FROM
page AS article
INNER JOIN page AS talk ON talk.page_title = article.page_title
INNER JOIN categorylinks AS pagecats ON pagecats.cl_from = article.page_id
INNER JOIN categorylinks AS talkcats ON talkcats.cl_from = talk.page_id
WHERE
article.page_random < 0.25 -- page_random is between 0 and 1, so this sets the probe size
AND article.page_namespace = 0
AND talk.page_namespace = 1
AND article.page_is_redirect = 0
AND talk.page_is_redirect = 0
-- filter out redirect and disambig pages entirely
AND NOT EXISTS (SELECT 1 from page WHERE
pagecats.cl_to LIKE "%disambig%"
OR pagecats.cl_to LIKE "%Disambig%"
OR pagecats.cl_to LIKE "%redirect%"
OR pagecats.cl_to LIKE "%Redirect%")
AND (
talkcats.cl_to LIKE "%WikiProject_%"
OR talkcats.cl_to LIKE "%-Class_%_articles"
OR talkcats.cl_to LIKE "%-importance_%_articles"
OR talkcats.cl_to LIKE "%-priority_%_articles"
OR talkcats.cl_to LIKE "Unassessed_%_articles"
)
AND NOT (
talkcats.cl_to LIKE "%vital%"
OR talkcats.cl_to LIKE "%Version%"
)
AND NOT (pagecats.cl_to LIKE "%Disambig%" OR pagecats.cl_to LIKE "%disambig%" OR pagecats.cl_to LIKE "%set_index%" OR pagecats.cl_to LIKE "Set_index%")
AND NOT (talkcats.cl_to LIKE "%Disambig%" OR talkcats.cl_to LIKE "%disambig%")
AND NOT (pagecats.cl_to LIKE "Short_description%"
OR pagecats.cl_to LIKE "%_errors%"
OR pagecats.cl_to LIKE "CS1_%"
OR pagecats.cl_to LIKE "%short_description%"
OR pagecats.cl_to LIKE "%articles%"
OR pagecats.cl_to LIKE "Articles%"
OR pagecats.cl_to LIKE "%pages%"
OR pagecats.cl_to LIKE "%disputes%"
OR pagecats.cl_to LIKE "Pages%"
OR pagecats.cl_to LIKE "Use_dmy_date%"
OR pagecats.cl_to LIKE "Use_mdy_date%"
OR pagecats.cl_to LIKE "%Wikipedia%"
OR pagecats.cl_to LIKE "%articles%"
OR pagecats.cl_to LIKE "%Articles%"
OR pagecats.cl_to LIKE "%Wikidata%"
OR pagecats.cl_to = "Source_attribution"
OR pagecats.cl_to LIKE "Webarchive%")
GROUP BY pagecats.cl_to, talkcats.cl_to
ORDER BY my_count DESC, pagecats.cl_to, talkcats.cl_to
LIMIT 2000000
By running queries you agree to the Cloud Services Terms of Use and you irrevocably agree to release your SQL under CC0 License.
All SQL code is licensed under CC0 License.