SQL
x
SET @cutoff = DATE_ADD(NOW(), INTERVAL -1 WEEK);
WITH newusers AS
(
-- parsing the user creation log is feasible up to time periods of about a
-- year; past that, it's faster to query user.user_registration directly
SELECT user.*, actor_id AS user_actor
FROM logging
JOIN user ON user_id = REGEXP_REPLACE(log_params, '^.*"4::userid";i:([0-9]+);.*$', '\\1')
JOIN actor_user ON actor_user = user_id
WHERE log_type = 'newusers'
AND log_timestamp >= DATE_FORMAT(@cutoff, '%Y%m%d%H%i%s')
AND user_editcount >= 1
),
first_live_edit_id AS
(
-- rev_id isn't always in the same order as rev_timestamp (particularly for
-- imported edits), but the difference is negligible
SELECT user_id AS flei_user, MIN(rev_id) AS flei_rev
FROM revision_userindex
JOIN newusers ON rev_actor = user_actor
GROUP BY user_id
),
first_live_edit AS
(
SELECT flei_user AS fle_user, rev_timestamp AS fle_timestamp, page_namespace AS fle_namespace, !rev_parent_id AS fle_new
FROM revision
JOIN first_live_edit_id ON rev_id = flei_rev
JOIN page ON page_id = rev_page
),
first_deleted_edit AS
(
-- ar_id, on the other hand, is often in a very different ordering from
-- ar_timestamp - the ids are assigned when the page is deleted. So, the
-- slow dependent subquery.
-- If the user managed to make two edits with the same timestamp and both have
-- been deleted, it's impractical to figure out which was earlier, so we just
-- pick one. (The original revision id is stored in archive, but it's not
-- indexed.)
SELECT user_id AS fde_user, ar_timestamp AS fde_timestamp, ar_namespace AS fde_namespace, !ar_parent_id AS fde_new
FROM archive_userindex
JOIN newusers ON ar_actor = user_actor
WHERE ar_timestamp = (SELECT MIN(ar_timestamp)
FROM archive_userindex AS a
WHERE a.ar_actor = user_actor)
GROUP BY user_id
),
first_edit AS
(
SELECT user_id,
CASE WHEN COALESCE(fde_timestamp, '9') < COALESCE(fle_timestamp, '9') THEN fde_namespace ELSE fle_namespace END AS fe_namespace,
CASE WHEN COALESCE(fde_timestamp, '9') < COALESCE(fle_timestamp, '9') THEN fde_new ELSE fle_new END AS fe_new,
CASE WHEN COALESCE(fde_timestamp, '9') < COALESCE(fle_timestamp, '9') THEN 0 ELSE 1 END AS fe_live
-- mariadb doesn't have full outer join, so rather than futzing around with
-- the direct workarounds, just join newusers too
FROM newusers
LEFT JOIN first_live_edit ON fle_user = user_id
LEFT JOIN first_deleted_edit ON fde_user = user_id
-- even if user_editcount is >= 1, there might be none visible if all edits
-- have been revdeleted or oversighted - don't count those
WHERE fle_timestamp IS NOT NULL
OR fde_timestamp IS NOT NULL
),
ns(ns_n, ns_s) AS (VALUES (-2, 'Media:'), (-1, 'Special:'), (0, '(Article)'), (1, 'Talk:'), (2, 'User:'), (3, 'User talk:'), (4,
'Wikipedia:'), (5, 'Wikipedia talk:'), (6, ':File:'), (7, 'File talk:'), (8, 'MediaWiki:'), (9, 'MediaWiki talk:'), (10, 'Template:'),
(11, 'Template talk:'), (12, 'Help:'), (13, 'Help talk:'), (14, ':Category:'), (15, 'Category talk:'), (100, 'Portal:'), (101,
'Portal talk:'), (118, 'Draft:'), (119, 'Draft talk:'), (710, 'TimedText:'), (711, 'TimedText talk:'), (828, 'Module:'), (829,
'Module talk:'), (2300, 'Gadget:'), (2301, 'Gadget talk:'), (2302, 'Gadget definition:'), (2303, 'Gadget definition talk:'))
SELECT COALESCE(ns_s, CONCAT('{{ns:', fe_namespace, '}}')) AS namespace,
COUNT(CASE WHEN fe_new = 1 THEN 1 ELSE NULL END) AS 'new page (all)',
COUNT(CASE WHEN fe_new = 1 AND fe_live = 1 THEN 1 ELSE NULL END) AS 'new page (still exists)',
COUNT(CASE WHEN fe_new = 1 AND fe_live = 0 THEN 1 ELSE NULL END) AS 'new page (deleted)',
COUNT(CASE WHEN fe_new = 0 THEN 1 ELSE NULL END) AS 'existing page (all)',
COUNT(CASE WHEN fe_new = 0 AND fe_live = 1 THEN 1 ELSE NULL END) AS 'existing page (still exists)',
COUNT(CASE WHEN fe_new = 0 AND fe_live = 0 THEN 1 ELSE NULL END) AS 'existing page (deleted)'
FROM first_edit
LEFT JOIN ns ON ns_n = fe_namespace
GROUP BY fe_namespace;
By running queries you agree to the Cloud Services Terms of Use and you irrevocably agree to release your SQL under CC0 License.
All SQL code is licensed under CC0 License.