SQL
x
query <- "
SELECT
DATE_FORMAT(mwh.event_timestamp, 'yyyy-MM-dd') as `date`,
mwh.wiki_db AS language,
event_user_text AS user_name,
mwh.revision_id AS revision_id,
topics.topic,
tc.main_topic,
tc.sub_topic,
event_user_registration_timestamp AS user_registration_date,
event_user_revision_count,
CASE WHEN array_contains(revision_tags, 'mobile app edit') THEN 'Mobile app'
WHEN array_contains(revision_tags, 'mobile web edit') THEN 'Mobile web'
ELSE 'Other' END AS platform,
CASE WHEN (ARRAY_CONTAINS(revision_tags,'sectiontranslation') AND revision_parent_id > 0) THEN 'expand an article'
WHEN ((ARRAY_CONTAINS(revision_tags,'contenttranslation') OR
ARRAY_CONTAINS(revision_tags, 'sectiontranslation') OR
ARRAY_CONTAINS(revision_tags,'contenttranslaton-v2')) AND revision_parent_id = 0) THEN 'create new article'
ELSE 'overwrite an article'
END AS translation_activity
FROM
wmf.mediawiki_history mwh
INNER JOIN canonical_data.wikis cd
ON mwh.wiki_db = cd.database_code AND
cd.database_group = 'wikipedia'
INNER JOIN isaacj.article_topics_outlinks topics ON (
cd.database_code = topics.wiki_db
AND mwh.page_id = topics.pageid
-- only include topics with a confidence score above 0.15
AND topics.score >= 0.5)
LEFT JOIN cchen.topic_component tc ON topics.topic = tc.topic
WHERE
mwh.event_entity = 'revision'
AND mwh.event_type = 'create'
-- only content mainspace
AND mwh.page_namespace = 0
AND (ARRAY_CONTAINS(revision_tags, 'sectiontranslation')
OR ARRAY_CONTAINS(revision_tags, 'contenttranslation')
OR ARRAY_CONTAINS(revision_tags, 'contenttranslation-v2'))
-- last snapshot where topic data was available
AND mwh.snapshot = '2022-09'
AND topics.snapshot = '2022-09'
-- last year of available data
AND mwh.event_timestamp BETWEEN '2021-09-01' and '2022-09-31'
"
cx_topics_data <- wmfdata::query_hive(query)
# save data
write.csv(cx_topics_data,
file = 'Data/cx_topics_data.csv',
row.names = FALSE
)
# load in data
cx_topics_data <-
read.csv(
file = 'Data/cx_topics_data.csv',
header = TRUE,
sep = ",",
stringsAsFactors = FALSE
)
# reformat and clean data
cx_topics_data$date <- as.Date(cx_topics_data$date, format = "%Y-%m-%d")
# set factor levels
cx_topics_data$main_topic <-
factor(
cx_topics_data$main_topic,
levels = c("Geography", "Culture", "History_and_Society", "STEM"),
labels = c("Geography", "Culture", "History and Society", "STEM")
)
# set factor level and rename platform
# note there is no specific tag for desktop in mediawiki; however, the majority of these instances are desktop. This may include some API type edit tools but they are rare.
cx_topics_data$platform <-
factor(
cx_topics_data$platform,
levels = c("Mobile web", "Other"),
labels = c("Mobile Web", "Desktop")
)
By running queries you agree to the Cloud Services Terms of Use and you irrevocably agree to release your SQL under CC0 License.
All SQL code is licensed under CC0 License.