SQL
x
SET @basecat='Players_of_American_football';
SET max_recursive_iterations=7; -- current maximum depth for this tree; 2457 categories, 47742 pages
WITH RECURSIVE deepcat (subcat, catpath, depth) AS
(
SELECT @basecat, REPLACE(@basecat, '_', ' '), 0
UNION
SELECT page_title, CONCAT(catpath, ' > ', REPLACE(page_title, '_', ' ')), depth + 1
FROM categorylinks
JOIN page ON page_id = cl_from AND page_namespace = 14
JOIN deepcat ON cl_to = subcat
)
SELECT DISTINCT REPLACE(page_title, '_', ' ') AS title,
page_len,
MIN(depth),
MIN(catpath) OVER (PARTITION BY page_title ORDER BY depth ASC) AS 'category path',
GROUP_CONCAT(DISTINCT els.el_to SEPARATOR ' ') AS 'external links'
FROM page
JOIN categorylinks ON cl_from = page_id
JOIN deepcat ON cl_to = subcat
LEFT JOIN externallinks AS els ON els.el_from = page_id
LEFT JOIN externallinks AS non_dbs ON non_dbs.el_from = page_id
AND non_dbs.el_to_domain_index NOT LIKE 'http://com.pro-football-reference.%'
AND non_dbs.el_to_domain_index NOT LIKE 'https://com.pro-football-reference.%'
AND non_dbs.el_to_domain_index NOT LIKE 'http://com.sports-reference.%'
AND non_dbs.el_to_domain_index NOT LIKE 'https://com.sports-reference.%'
AND non_dbs.el_to_domain_index NOT LIKE 'http://com.profootballarchives.%'
AND non_dbs.el_to_domain_index NOT LIKE 'https://com.profootballarchives.%'
AND non_dbs.el_to_domain_index NOT LIKE 'http://com.espn.%'
AND non_dbs.el_to_domain_index NOT LIKE 'https://com.espn.%'
AND non_dbs.el_to_domain_index NOT LIKE 'http://com.footballdb.%'
AND non_dbs.el_to_domain_index NOT LIKE 'https://com.footballdb.%'
AND non_dbs.el_to_domain_index NOT LIKE 'http://com.nfl.%'
AND non_dbs.el_to_domain_index NOT LIKE 'https://com.nfl.%'
WHERE non_dbs.el_id IS NULL
AND page_namespace = 0
AND page_is_redirect = 0
AND page_len <= 2500
GROUP BY page_id
ORDER BY MIN(depth) ASC;
By running queries you agree to the Cloud Services Terms of Use and you irrevocably agree to release your SQL under CC0 License.
All SQL code is licensed under CC0 License.