Toggle navigation
Home
New Query
Recent Queries
Discuss
Database tables
Database names
MediaWiki
Wikibase
Replicas browser and optimizer
Login
History
Fork
Fork of
Wikidata - English descriptions starting with "a .."
by
Jura1
This query is marked as a draft
This query has been published
by
Hasley
.
Toggle Highlighting
SQL
%%capture %set_env http_proxy=http://webproxy.eqiad.wmnet:8080 %set_env https_proxy=https://webproxy.eqiad.wmnet:8080 %set_env no_proxy=localhost,127.0.0.1 import wmfdata wmfdata.utils.insert_code_toggle() from wmfdata import hive, mariadb import pandas as pd from datetime import datetime, timedelta, date %matplotlib inline import matplotlib.pyplot as plt import matplotlib.ticker as ticker @ticker.FuncFormatter def million_formatter(x, pos): return "%d M" % round(x/1000000) @ticker.FuncFormatter def thousand_formatter(x, pos): return "%d K" % round(x/1000) now = pd.Timestamp.utcnow() today=now.date() last_Sunday=(today - timedelta(days=today.weekday()+1)).strftime("%Y-%m-%d") month_1st_day= datetime.today().replace(day=1).strftime("%Y-%m-%d") start_date_YYYY_MM_DD = '2021-01-04' end_date_YYYY_MM_DD=last_Sunday yr='2021' pre_yr='2020' start_date_YYYYMMDD = '20210104' #end_date_YYYYMMDD=datetime.today().replace(day=1).strftime("%Y%m%d") end_date_YYYYMMDD=(today - timedelta(days=today.weekday()+1)).strftime("%Y%m%d") if datetime.today().day < 7: snapshot = datetime.today().replace(month=datetime.today().month-2).strftime("%Y-%m") else: snapshot = datetime.today().replace(month=datetime.today().month-1).strftime("%Y-%m") lastyear_start_date_YYYY_MM_DD = '2020-01-01' lastyear_end_date_YYYY_MM_DD='2020-12-31' lastyear_start_date_YYYYMMDD = '20200101' lastyear_end_date_YYYYMMDD='20201231' last2years_start_date_YYYY_MM_DD = '2019-01-01' last2years_end_date_YYYY_MM_DD='2019-12-31' last2years_start_date_YYYYMMDD = '20190101' last2years_end_date_YYYYMMDD='20191231' weekly_user_editors_query=''' SELECT tmp.week_n, COUNT(tmp.user_id) AS user_editors FROM (SELECT WEEKOFYEAR(rev_timestamp) AS week_n, performer.user_id AS user_id, max(performer.user_text) AS user_text FROM event_sanitized.mediawiki_revision_create WHERE year='{YEAR_YYYY}' AND (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp <= '{END_YYYY_MM_DD}') AND performer.user_id IS NOT null AND page_namespace IN (0,6,12,640,100,102,104,106,108,110,112,114,124,146,250,252) AND `database`='ptwiki' GROUP BY WEEKOFYEAR(rev_timestamp), performer.user_id )AS tmp GROUP BY tmp.week_n ''' df_weekly_user_editors=hive.run(weekly_user_editors_query.format(YEAR_YYYY=yr, START_YYYY_MM_DD=start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD)) weekly_user_editors_query=''' SELECT weekofyear(event_timestamp) AS week_n, COUNT(DISTINCT event_user_id) wiki_user_editors FROM wmf.mediawiki_history WHERE event_entity = 'revision' AND event_type = 'create' AND DATE(event_timestamp) >= '{START_YYYY_MM_DD}' AND DATE(event_timestamp) <= '{END_YYYY_MM_DD}' AND event_user_is_anonymous = false AND page_namespace_is_content AND snapshot = '{SNAPSHOT}' AND wiki_db='ptwiki' GROUP BY weekofyear(event_timestamp) ''' df2_weekly_user_editors_pre_1_year=hive.run(weekly_user_editors_query.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD)) df2_weekly_user_editors_pre_2_years=hive.run(weekly_user_editors_query.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD)) fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6)) fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6) ax.set_title('Ptwiki Weekly User Editors',fontweight="bold",fontsize = 16,y=1.08) ax.set_xlabel('Week', fontsize = 16) ax.set_ylabel('Editors', fontsize = 16) ax.plot(df_weekly_user_editors['week_n'], df_weekly_user_editors['user_editors'], 'o-',label='Year 2021') ax.plot(df2_weekly_user_editors_pre_1_year['week_n'], df2_weekly_user_editors_pre_1_year['wiki_user_editors'], 'o--',label='Year 2020') ax.plot(df2_weekly_user_editors_pre_2_years['week_n'], df2_weekly_user_editors_pre_2_years['wiki_user_editors'], ':',label='Year 2019') ax.set_xlim(1,52) ax.set_ylim(0,5000) ax.vlines(40, 0, 5000, colors='k', linestyles='dashdot', label='Turned off') ax.legend(loc='upper right') ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis ax2.set_xlim(0,12) ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']) ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5]) ax2.tick_params(length=0) ax2.set_xlabel('Month',fontsize=16) miloc = plt.MultipleLocator(1) ax2.xaxis.set_minor_locator(miloc) ax2.grid(axis='x', which='minor') plt.show() weekly_edits_query=''' SELECT WEEKOFYEAR(rev_timestamp) AS week_n, COUNT(*) AS edits FROM event_sanitized.mediawiki_revision_create WHERE year='{YEAR_YYYY}' AND (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp <= '{END_YYYY_MM_DD}') AND `database`='ptwiki' GROUP BY WEEKOFYEAR(rev_timestamp) ''' df_weekly_edits=hive.run(weekly_edits_query.format(YEAR_YYYY=yr, START_YYYY_MM_DD=start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD)) weekly_edits_query_2=''' SELECT weekofyear(event_timestamp) AS week_n, COUNT(revision_id) AS edits FROM wmf.mediawiki_history WHERE event_entity = 'revision' AND event_type = 'create' AND DATE(event_timestamp) >= '{START_YYYY_MM_DD}' AND DATE(event_timestamp) <= '{END_YYYY_MM_DD}' AND snapshot = '{SNAPSHOT}' AND wiki_db='ptwiki' GROUP BY weekofyear(event_timestamp) ''' df2_weekly_edits_pre_1_year=hive.run(weekly_edits_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD)) df2_weekly_edits_pre_2_years=hive.run(weekly_edits_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD)) fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6)) fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6) ax.set_title('Ptwiki Weekly Edits',fontweight="bold",fontsize = 16,y=1.08) ax.set_xlabel('Week', fontsize = 16) ax.set_ylabel('Edits', fontsize = 16) ax.plot(df_weekly_edits['week_n'], df_weekly_edits['edits'], 'o-',label='Year 2021') ax.plot(df2_weekly_edits_pre_1_year['week_n'], df2_weekly_edits_pre_1_year['edits'], 'o--',label='Year 2020') ax.plot(df2_weekly_edits_pre_2_years['week_n'], df2_weekly_edits_pre_2_years['edits'], ':',label='Year 2019') ax.set_xlim(1,52) ax.set_ylim(0,150000) ax.vlines(40, 0, 150000, colors='k', linestyles='dashdot', label='Turned off') ax.legend(loc='upper right') ax.yaxis.set_major_formatter(thousand_formatter) ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis ax2.set_xlim(0,12) ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']) ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5]) ax2.tick_params(length=0) ax2.set_xlabel('Month',fontsize=16) miloc = plt.MultipleLocator(1) ax2.xaxis.set_minor_locator(miloc) ax2.grid(axis='x', which='minor') #plt.savefig("1_ptwiki_weekly_edits.png") plt.show() query_weekly_reverts=''' SELECT weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss'))) AS week_n, count(revision_id) AS reverted_edits FROM wmf.mediawiki_history WHERE snapshot= '{SNAPSHOT}' AND wiki_db='ptwiki' AND substr(event_timestamp,1,10) BETWEEN '{START_YYYY_MM_DD}' AND '{END_YYYY_MM_DD}' AND event_entity = "revision" AND revision_is_identity_reverted GROUP BY weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss'))) ''' df_weekly_reverts=hive.run(query_weekly_reverts.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD)) df_weekly_reverts_pre_1_year=hive.run(query_weekly_reverts.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD)) df_weekly_reverts_pre_2_years=hive.run(query_weekly_reverts.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD)) fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6)) fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6) ax.set_title('Ptwiki Weekly Reverted Edits',fontweight="bold",fontsize = 16,y=1.08) ax.set_xlabel('Week', fontsize = 16) ax.set_ylabel('Reverts', fontsize = 16) ax.plot(df_weekly_reverts['week_n'][:-1], df_weekly_reverts['reverted_edits'][:-1], 'o-',label='Year 2021') ax.plot(df_weekly_reverts_pre_1_year['week_n'], df_weekly_reverts_pre_1_year['reverted_edits'], 'o--',label='Year 2020') ax.plot(df_weekly_reverts_pre_2_years['week_n'], df_weekly_reverts_pre_2_years['reverted_edits'], ':',label='Year 2019') ax.set_xlim(1,52) ax.set_ylim(0,15000) ax.vlines(40, 0, 15000, colors='k', linestyles='dashdot', label='Turned off') ax.legend(loc='upper right') ax.yaxis.set_major_formatter(thousand_formatter) ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis ax2.set_xlim(0,12) ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']) ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5]) ax2.tick_params(length=0) ax2.set_xlabel('Month',fontsize=16) miloc = plt.MultipleLocator(1) ax2.xaxis.set_minor_locator(miloc) ax2.grid(axis='x', which='minor') plt.show() query_weekly_reverts_48hrs=''' SELECT weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss'))) AS week_n, count(revision_id) AS reverted_edits FROM wmf.mediawiki_history WHERE snapshot= '{SNAPSHOT}' AND wiki_db='ptwiki' AND substr(event_timestamp,1,10) BETWEEN '{START_YYYY_MM_DD}' AND '{END_YYYY_MM_DD}' AND event_entity = "revision" AND revision_is_identity_reverted AND revision_seconds_to_identity_revert <= 172800 GROUP BY weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss'))) ''' df_weekly_reverts_48hrs=hive.run(query_weekly_reverts_48hrs.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD)) df_weekly_reverts_48hrs_pre_1_year=hive.run(query_weekly_reverts_48hrs.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD)) df_weekly_reverts_48hrs_pre_2_years=hive.run(query_weekly_reverts_48hrs.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD)) fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6)) fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6) ax.set_title('Ptwiki Weekly Reverts Within 48 Hours of Editing',fontweight="bold",fontsize = 16,y=1.08) ax.set_xlabel('Week', fontsize = 16) ax.set_ylabel('Reverts', fontsize = 16) ax.plot(df_weekly_reverts_48hrs['week_n'][:-1], df_weekly_reverts_48hrs['reverted_edits'][:-1], 'o-',label='Year 2021') ax.plot(df_weekly_reverts_48hrs_pre_1_year['week_n'], df_weekly_reverts_48hrs_pre_1_year['reverted_edits'], 'o--',label='Year 2020') ax.plot(df_weekly_reverts_48hrs_pre_2_years['week_n'], df_weekly_reverts_48hrs_pre_2_years['reverted_edits'], ':',label='Year 2019') ax.set_xlim(1,52) ax.set_ylim(0,15000) ax.vlines(40, 0, 15000, colors='k', linestyles='dashdot', label='Turned off') ax.legend(loc='upper right') ax.yaxis.set_major_formatter(thousand_formatter) ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis ax2.set_xlim(0,12) ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']) ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5]) ax2.tick_params(length=0) ax2.set_xlabel('Month',fontsize=16) miloc = plt.MultipleLocator(1) ax2.xaxis.set_minor_locator(miloc) ax2.grid(axis='x', which='minor') plt.show() query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits=''' WITH t1 AS ( SELECT weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss'))) AS week_n, count(revision_id) AS non_reverted_edits FROM wmf.mediawiki_history WHERE snapshot= '{SNAPSHOT}' AND wiki_db='ptwiki' -- for bots size(event_user_is_bot_by) returns 1/2, IP editors return -1, registered non-bot editors return 0 AND size(event_user_is_bot_by) <= 0 AND substr(event_timestamp,1,10) BETWEEN '{START_YYYY_MM_DD}' AND '{END_YYYY_MM_DD}' AND event_entity = "revision" AND (NOT revision_is_identity_reverted OR revision_seconds_to_identity_revert > 172800) GROUP BY weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss'))) ORDER BY week_N LIMIT 1000000 ), t2 AS ( SELECT weekofyear(from_unixtime(UNIX_timestamp(h1.event_timestamp,'yyyy-MM-dd HH:mm:ss'))) AS week_n, count(distinct h1.revision_id) AS revert_edits FROM wmf.mediawiki_history AS h1 LEFT JOIN wmf.mediawiki_history AS h2 -- h1 is revert edits, h2 is the edits being reverted ON h1.revision_id=h2.revision_first_identity_reverting_revision_id AND h1.wiki_db = h2.wiki_db AND h1.snapshot= h2.snapshot WHERE h1.wiki_db = 'ptwiki' AND h1.snapshot='{SNAPSHOT}' -- for bots size(event_user_is_bot_by) returns 1 or 2, IP editors return -1, registered non-bot editors return 0 AND size(h1.event_user_is_bot_by) <= 0 AND size(h2.event_user_is_bot_by) <= 0 AND substr(h1.event_timestamp,1,10) BETWEEN '{START_YYYY_MM_DD}' AND '{END_YYYY_MM_DD}' AND h1.event_entity = "revision" and h2.event_entity = "revision" AND (NOT h1.revision_is_identity_reverted OR h1.revision_seconds_to_identity_revert > 172800) AND h2.revision_is_identity_reverted AND h2.revision_seconds_to_identity_revert < 172800 AND h1.revision_is_identity_revert AND h1.event_entity = 'revision' and h2.event_entity = 'revision' GROUP BY weekofyear(from_unixtime(UNIX_timestamp(h1.event_timestamp,'yyyy-MM-dd HH:mm:ss'))) ORDER BY week_n LIMIT 10000 ) SELECT t1.week_n, t1.non_reverted_edits - t2.revert_edits AS net_non_reverted_edits FROM t1 LEFT JOIN t2 ON t1.week_n=t2.week_n ''' df_weekly_net_non_reverted_48hrs_edits=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD)) df_weekly_net_non_reverted_48hrs_edits_pre_1_year=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD)) df_weekly_net_non_reverted_48hrs_edits_pre_2_years=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD)) fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6)) fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6) ax.set_title('Ptwiki Weekly Edits Which Were Not Reverted in 48 Hours (Excluding Bot And Revert Edits)',fontweight="bold",fontsize = 16, y=1.08) ax.set_xlabel('Week', fontsize = 16) ax.set_ylabel('Net Non-reverted Edits', fontsize = 16) ax.plot(df_weekly_net_non_reverted_48hrs_edits['week_n'][:-1], df_weekly_net_non_reverted_48hrs_edits['net_non_reverted_edits'][:-1], 'o-',label='Year 2021') ax.plot(df_weekly_net_non_reverted_48hrs_edits_pre_1_year['week_n'], df_weekly_net_non_reverted_48hrs_edits_pre_1_year['net_non_reverted_edits'], 'o--',label='Year 2020') ax.plot(df_weekly_net_non_reverted_48hrs_edits_pre_2_years['week_n'], df_weekly_net_non_reverted_48hrs_edits_pre_2_years['net_non_reverted_edits'], ':',label='Year 2019') ax.set_xlim(1,52) ax.set_ylim(0,150000) ax.vlines(40, 0, 150000, colors='k', linestyles='dashdot', label='Turned off') ax.legend(loc='upper right') ax.yaxis.set_major_formatter(thousand_formatter) ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis ax2.set_xlim(0,12) ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']) ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5]) ax2.tick_params(length=0) ax2.set_xlabel('Month',fontsize=16) miloc = plt.MultipleLocator(1) ax2.xaxis.set_minor_locator(miloc) ax2.grid(axis='x', which='minor') plt.show() weekly_edits_bot_query=''' SELECT WEEKOFYEAR(rev_timestamp) AS week_n, COUNT(*) AS bot_edits FROM event_sanitized.mediawiki_revision_create WHERE year='{YEAR_YYYY}' AND (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp <= '{END_YYYY_MM_DD}') AND performer.user_id IS not null AND `database`='ptwiki' AND ( performer.user_is_bot=true OR performer.user_text regexp "^.*bot([^a-z].*$|$)") GROUP BY WEEKOFYEAR(rev_timestamp) ''' weekly_edits_nonbot_query=''' SELECT WEEKOFYEAR(rev_timestamp) AS week_n, COUNT(*) AS nonbot_edits FROM event_sanitized.mediawiki_revision_create WHERE year='{YEAR_YYYY}' AND (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp <= '{END_YYYY_MM_DD}') AND performer.user_id IS not null AND `database`='ptwiki' AND ( performer.user_is_bot=false AND performer.user_text not regexp "^.*bot([^a-z].*$|$)") GROUP BY WEEKOFYEAR(rev_timestamp) ''' df_weekly_bot_edits=hive.run(weekly_edits_bot_query.format(YEAR_YYYY=yr, START_YYYY_MM_DD=start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD)) df_weekly_nonbot_edits=hive.run(weekly_edits_nonbot_query.format(YEAR_YYYY=yr, START_YYYY_MM_DD=start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD)) weekly_edits_bot_query_2=''' SELECT weekofyear(event_timestamp) AS week_n, COUNT(revision_id) AS edits FROM wmf.mediawiki_history WHERE event_entity = 'revision' AND event_type = 'create' AND DATE(event_timestamp) >= '{START_YYYY_MM_DD}' AND DATE(event_timestamp) <= '{END_YYYY_MM_DD}' AND event_user_is_anonymous = false AND (size(event_user_is_bot_by_historical) > 0 OR size(event_user_is_bot_by) > 0 OR event_user_text regexp "^.*bot([^a-z].*$|$)" ) AND snapshot = '{SNAPSHOT}' AND wiki_db='ptwiki' GROUP BY weekofyear(event_timestamp) ''' df2_weekly_bot_edits_pre_1_year=hive.run(weekly_edits_bot_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD)) df2_weekly_bot_edits_pre_2_years=hive.run(weekly_edits_bot_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD)) weekly_edits_nonbot_query_2=''' SELECT weekofyear(event_timestamp) AS week_n, COUNT(revision_id) AS edits FROM wmf.mediawiki_history WHERE event_entity = 'revision' AND event_type = 'create' AND DATE(event_timestamp) >= '{START_YYYY_MM_DD}' AND DATE(event_timestamp) <= '{END_YYYY_MM_DD}' AND event_user_is_anonymous = false AND (size(event_user_is_bot_by_historical) = 0 AND size(event_user_is_bot_by) = 0 AND event_user_text not regexp "^.*bot([^a-z].*$|$)" ) AND snapshot = '{SNAPSHOT}' AND wiki_db='ptwiki' GROUP BY weekofyear(event_timestamp) ''' df2_weekly_nonbot_edits_pre_1_year=hive.run(weekly_edits_nonbot_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD)) df2_weekly_nonbot_edits_pre_2_years=hive.run(weekly_edits_nonbot_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD)) fig, ax = plt.subplots(nrows=2, ncols=1,figsize=(16,12)) fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6) ax[0].set_title('Weekly Edits by Registered Bot Editors',fontweight="bold",fontsize = 16,y=1.08) ax[0].set_xlabel('Week', fontsize = 16) ax[0].set_ylabel('Edits', fontsize = 16) ax[0].plot(df_weekly_bot_edits['week_n'], df_weekly_bot_edits['bot_edits'], 'o-',label='Year 2021') ax[0].plot(df2_weekly_bot_edits_pre_1_year['week_n'], df2_weekly_bot_edits_pre_1_year['edits'], 'o--',label='Year 2020') ax[0].plot(df2_weekly_bot_edits_pre_2_years['week_n'], df2_weekly_bot_edits_pre_2_years['edits'], ':',label='Year 2019') ax[0].set_xlim(1,52) ax[0].set_ylim(0,80000) ax[0].vlines(40, 0, 80000, colors='k', linestyles='dashdot', label='Turned off') ax[0].legend(loc='upper right') ax[0].yaxis.set_major_formatter(thousand_formatter) ax2 = ax[0].twiny() # instantiate a second axes that shares the same x-axis ax2.set_xlim(0,12) ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']) ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5]) ax2.tick_params(length=0) miloc = plt.MultipleLocator(1) ax2.xaxis.set_minor_locator(miloc) ax2.grid(axis='x', which='minor') ax[1].set_title('Weekly Edits by Registered Non-bot Editors',fontweight="bold",fontsize = 16,y=1.08) ax[1].set_xlabel('Week',fontsize = 16) ax[1].set_ylabel('Edits',fontsize = 16) ax[1].set_xlim(1,52) ax[1].set_ylim(0,80000) ax[1].vlines(40, 0, 80000, colors='k', linestyles='dashdot', label='Turned off') ax3=ax[1].twiny() ax3.set_xlim(0,12) ax3.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']) ax3.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5]) ax3.tick_params(length=0) #ax3.set_xlabel('Month',fontsize=16) ax3.xaxis.set_minor_locator(miloc) ax3.grid(axis='x', which='minor') ax[1].plot(df_weekly_nonbot_edits['week_n'], df_weekly_nonbot_edits['nonbot_edits'], 'o-',label='Year 2021') ax[1].plot(df2_weekly_nonbot_edits_pre_1_year['week_n'], df2_weekly_nonbot_edits_pre_1_year['edits'], 'o--',label='Year 2020') ax[1].plot(df2_weekly_nonbot_edits_pre_2_years['week_n'], df2_weekly_nonbot_edits_pre_2_years['edits'], ':',label='Year 2019') ax[1].legend(loc='upper right') ax[1].yaxis.set_major_formatter(thousand_formatter) plt.show() query_block_weekly=''' SELECT WEEKofyear(from_unixtime(UNIX_timestamp(log_timestamp,'yyyyMMddHHmmss'))) AS week_n, COUNT(1) AS blocks FROM wmf_raw.mediawiki_logging WHERE snapshot ='{SNAPSHOT}' AND log_type = 'block' AND wiki_db='ptwiki' AND log_action IN ('block','reblock') AND substr(log_timestamp,1,8) BETWEEN '{START_YYYYMMDD}' AND '{END_YYYYMMDD}' GROUP BY WEEKofyear(from_unixtime(UNIX_timestamp(log_timestamp,'yyyyMMddHHmmss'))) ORDER BY week_n LIMIT 100000 ''' df_block_weekly=hive.run(query_block_weekly.format(SNAPSHOT=snapshot, START_YYYYMMDD=start_date_YYYYMMDD, END_YYYYMMDD=end_date_YYYYMMDD)) df_block_weekly_pre_1_year=hive.run(query_block_weekly.format(SNAPSHOT=snapshot, START_YYYYMMDD=lastyear_start_date_YYYYMMDD, END_YYYYMMDD=lastyear_end_date_YYYYMMDD)) df_block_weekly_pre_2_years=hive.run(query_block_weekly.format(SNAPSHOT=snapshot, START_YYYYMMDD=last2years_start_date_YYYYMMDD, END_YYYYMMDD=last2years_end_date_YYYYMMDD)) fig, ax = plt.subplots(nrows=2, ncols=1,figsize=(16,12)) fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6) ax[0].set_title('Ptwiki Weekly Blocks',fontweight="bold",fontsize = 16,y=1.08) ax[0].set_xlabel('Week', fontsize = 16) ax[0].set_ylabel('Blocks', fontsize = 16) ax[0].plot(df_block_weekly['week_n'], df_block_weekly['blocks'], 'o-',label='Year 2021') ax[0].plot(df_block_weekly_pre_1_year['week_n'], df_block_weekly_pre_1_year['blocks'], 'o--',label='Year 2020') ax[0].plot(df_block_weekly_pre_2_years['week_n'], df_block_weekly_pre_2_years['blocks'], ':',label='Year 2019') ax[0].set_xlim(1,52) ax[0].set_ylim(0,14000) ax[0].vlines(40, 0, 140000, colors='k', linestyles='dashdot', label='Turned off') ax[0].legend(loc='upper right') ax[0].yaxis.set_major_formatter(thousand_formatter) ax2 = ax[0].twiny() # instantiate a second axes that shares the same x-axis ax2.set_xlim(0,12) ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']) ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5]) ax2.tick_params(length=0) ax2.set_xlabel('Month',fontsize=16) miloc = plt.MultipleLocator(1) ax2.xaxis.set_minor_locator(miloc) ax2.grid(axis='x', which='minor') ax[1].set_title('Ptwiki Weekly Blocks',fontweight="bold",fontsize = 16,y=1.08) ax[1].set_xlabel('Week',fontsize = 16) ax[1].set_ylabel('Blocks',fontsize = 16) ax[1].set_xlim(1,52) ax[1].set_ylim(0,4000) ax[1].vlines(40, 0, 4000, colors='k', linestyles='dashdot', label='Turned off') ax3=ax[1].twiny() ax3.set_xlim(0,12) ax3.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']) ax3.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5]) ax3.tick_params(length=0) ax3.set_xlabel('Month',fontsize=16) ax3.xaxis.set_minor_locator(miloc) ax3.grid(axis='x', which='minor') ax[1].plot(df_block_weekly['week_n'], df_block_weekly['blocks'], 'o-',label='Year 2021') ax[1].plot(df_block_weekly_pre_1_year['week_n'], df_block_weekly_pre_1_year['blocks'], 'o--',label='Year 2020') ax[1].legend(loc='upper right') plt.show() query_accounts_created=''' SELECT WEEKofyear(from_unixtime(UNIX_timestamp(user_registration,'yyyyMMddHHmmss'))) AS week_n , COUNT(DISTINCT user_id) AS new_accounts FROM wmf_raw.mediawiki_user WHERe snapshot ='{SNAPSHOT}' AND substr(user_registration, 1,8) BETWEEN '{START_YYYYMMDD}' AND '{END_YYYYMMDD}' AND wiki_db='ptwiki' AND user_registration IS NOT NULL GROUP BY WEEKofyear(from_unixtime(UNIX_timestamp(user_registration,'yyyyMMddHHmmss'))) ORDER BY week_n LIMIT 10000 ''' df_accounts_created=hive.run(query_accounts_created.format(SNAPSHOT=snapshot,START_YYYYMMDD= start_date_YYYYMMDD, END_YYYYMMDD=end_date_YYYYMMDD )) df_accounts_created_pre_1_year=hive.run(query_accounts_created.format(SNAPSHOT=snapshot, START_YYYYMMDD= lastyear_start_date_YYYYMMDD, END_YYYYMMDD=lastyear_end_date_YYYYMMDD )) df_accounts_created_pre_2_years=hive.run(query_accounts_created.format(SNAPSHOT=snapshot, START_YYYYMMDD= last2years_start_date_YYYYMMDD, END_YYYYMMDD=last2years_end_date_YYYYMMDD )) fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6)) fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6) ax.set_title('Ptwiki Created Accounts',fontweight="bold",fontsize = 16,y=1.08) ax.set_xlabel('Week', fontsize = 16) ax.set_ylabel('New Accounts', fontsize = 16) ax.plot(df_accounts_created['week_n'][:-1], df_accounts_created['new_accounts'][:-1], 'o-',label='Year 2021') ax.plot(df_accounts_created_pre_1_year['week_n'], df_accounts_created_pre_1_year['new_accounts'], 'o--',label='Year 2020') ax.plot(df_accounts_created_pre_2_years['week_n'], df_accounts_created_pre_2_years['new_accounts'], ':',label='Year 2019') ax.set_xlim(1,52) ax.set_ylim(0,6000) ax.vlines(40, 0, 6000, colors='k', linestyles='dashdot', label='Turned off') ax.legend(loc='upper right') ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis ax2.set_xlim(0,12) ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']) ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5]) ax2.tick_params(length=0) ax2.set_xlabel('Month',fontsize=16) miloc = plt.MultipleLocator(1) ax2.xaxis.set_minor_locator(miloc) ax2.grid(axis='x', which='minor') plt.show() query_retention_rate=''' -- only count non-bot user SELECT weekofyear(1st_month.user_creation_date) AS user_creation_cohort_week, ROUND(SUM(CASE WHEN 2nd_month.edits IS NOT NULL THEN 1 ELSE 0 END) /COUNT(1), 6) AS retention_rate FROM ( SELECT event_user_text AS user_name, event_user_id AS user_id, substr(event_user_creation_timestamp,1,10) AS user_creation_date, count(*) AS edits FROM wmf.mediawiki_history WHERE snapshot = "{SNAPSHOT}" AND wiki_db='ptwiki' AND event_entity = "revision" AND event_type = "create" AND NOT event_user_is_created_by_system AND size(event_user_is_bot_by) = 0 AND event_user_creation_timestamp BETWEEN "{START_YYYY_MM_DD}" AND "{END_YYYY_MM_DD}" AND unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0") < (unix_timestamp(event_user_creation_timestamp, "yyyy-MM-dd HH:mm:ss.0") + (30*24*60*60)) GROUP BY event_user_text, event_user_id, event_user_creation_timestamp ) 1st_month LEFT JOIN ( SELECT event_user_text AS user_name, event_user_id AS user_id, substr(event_user_creation_timestamp,1,10) AS user_creation_date, count(*) AS edits FROM wmf.mediawiki_history WHERE snapshot = "{SNAPSHOT}" AND wiki_db='ptwiki' AND event_entity = "revision" AND event_type = "create" AND NOT event_user_is_created_by_system AND size(event_user_is_bot_by) = 0 AND event_user_creation_timestamp BETWEEN "{START_YYYY_MM_DD}" AND "{END_YYYY_MM_DD}" AND unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0") >= (unix_timestamp(event_user_creation_timestamp, "yyyy-MM-dd HH:mm:ss.0") + (30*24*60*60)) AND unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0") < (unix_timestamp(event_user_creation_timestamp, "yyyy-MM-dd HH:mm:ss.0") + (60*24*60*60)) GROUP BY event_user_text, event_user_id, event_user_creation_timestamp ) 2nd_month ON ( 1st_month.user_id = 2nd_month.user_id AND 1st_month.user_creation_date = 2nd_month.user_creation_date) GROUP BY weekofyear(1st_month.user_creation_date) ''' df_retention_rate=hive.run(query_retention_rate.format(SNAPSHOT=snapshot,START_YYYY_MM_DD=start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD)) df_retention_rate_pre_1_year=hive.run(query_retention_rate.format(SNAPSHOT=snapshot,START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD)) df_retention_rate_pre_2_years=hive.run(query_retention_rate.format(SNAPSHOT=snapshot,START_YYYY_MM_DD=last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD)) fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6)) fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6) ax.set_title('Ptwiki Retention Rate',fontweight="bold",fontsize = 16,y=1.08) ax.set_xlabel('User Cohort (Creation Week)', fontsize = 16) ax.set_ylabel('Retention Rate', fontsize = 16) ax.plot(df_retention_rate['user_creation_cohort_week'][:-8], df_retention_rate['retention_rate'][:-8], 'o-',label='Year 2021') ax.plot(df_retention_rate_pre_1_year['user_creation_cohort_week'], df_retention_rate_pre_1_year['retention_rate'], 'o--',label='Year 2020') ax.plot(df_retention_rate_pre_2_years['user_creation_cohort_week'], df_retention_rate_pre_2_years['retention_rate'], ':',label='Year 2019') ax.set_xlim(1,52) ax.set_ylim(0,0.3) ax.vlines(40, 0, 0.3, colors='k', linestyles='dashdot', label='Turned off') ax.legend(loc='upper right') ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis ax2.set_xlim(0,12) ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']) ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5]) ax2.tick_params(length=0) ax2.set_xlabel('Month',fontsize=16) miloc = plt.MultipleLocator(1) ax2.xaxis.set_minor_locator(miloc) ax2.grid(axis='x', which='minor') plt.show() query_check_user=''' SELECT weekofyear(from_unixtime(UNIX_timestamp(cul_timestamp))) AS week_n, CASE WHEN cul_type='investigate' THEN 'investigate' ELSE 'checkuser' END AS tool, COUNT(cul_id) AS requests FROM cu_log WHERE substr(cul_timestamp,1,8) BETWEEN '{START_YYYYMMDD}' AND '{END_YYYYMMDD}' GROUP BY weekofyear(from_unixtime(UNIX_timestamp(cul_timestamp))) , CASE WHEN cul_type='investigate' THEN 'investigate' ELSE 'checkuser' END ORDER BY week_n LIMIT 10000 ''' df_check_user=mariadb.run(query_check_user.format(START_YYYYMMDD=start_date_YYYYMMDD, END_YYYYMMDD=end_date_YYYYMMDD),'ptwiki') df_check_user_pre_1_year=mariadb.run(query_check_user.format(START_YYYYMMDD=lastyear_start_date_YYYYMMDD, END_YYYYMMDD=lastyear_end_date_YYYYMMDD),'ptwiki') df_check_user_pre_2_years=mariadb.run(query_check_user.format(START_YYYYMMDD=last2years_start_date_YYYYMMDD, END_YYYYMMDD=last2years_end_date_YYYYMMDD),'ptwiki') fig, ax = plt.subplots(nrows=2, ncols=1,figsize=(16,12)) fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6) ax[0].set_title('Checkuser Requests',fontweight="bold",fontsize = 16,y=1.08) ax[0].set_xlabel('Week', fontsize = 16) ax[0].set_ylabel('Requests', fontsize = 16) ax[0].plot(df_check_user[df_check_user['tool']=='checkuser']['week_n'], df_check_user[df_check_user['tool']=='checkuser']['requests'], 'o-',label='Year 2021') ax[0].plot(df_check_user_pre_1_year[df_check_user_pre_1_year['tool']=='checkuser']['week_n'], df_check_user_pre_1_year[df_check_user_pre_1_year['tool']=='checkuser']['requests'], 'o--',label='Year 2020') ax[0].plot(df_check_user_pre_2_years[df_check_user_pre_2_years['tool']=='checkuser']['week_n'], df_check_user_pre_2_years[df_check_user_pre_2_years['tool']=='checkuser']['requests'], ':',label='Year 2019') ax[0].set_xlim(1,52) ax[0].set_ylim(0,300) ax[0].vlines(40, 0, 300, colors='k', linestyles='dashdot', label='Turned off') ax[0].legend(loc='upper right') ax2 = ax[0].twiny() # instantiate a second axes that shares the same x-axis ax2.set_xlim(0,12) ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']) ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5]) ax2.tick_params(length=0) miloc = plt.MultipleLocator(1) ax2.xaxis.set_minor_locator(miloc) ax2.grid(axis='x', which='minor') ax[1].set_title('Special Investigate Requests',fontweight="bold",fontsize = 16,y=1.08) ax[1].set_xlabel('Week',fontsize = 16) ax[1].set_ylabel('Requests',fontsize = 16) ax[1].set_xlim(1,52) ax[1].set_ylim(0,300) ax[1].vlines(40, 0, 300, colors='k', linestyles='dashdot', label='Turned off') ax3=ax[1].twiny() ax3.set_xlim(0,12) ax3.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']) ax3.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5]) ax3.tick_params(length=0) #ax3.set_xlabel('Month',fontsize=16) ax3.xaxis.set_minor_locator(miloc) ax3.grid(axis='x', which='minor') ax[1].plot(df_check_user[df_check_user['tool']=='investigate']['week_n'], df_check_user[df_check_user['tool']=='investigate']['requests'], 'o-',label='Year 2021') ax[1].plot(df_check_user_pre_1_year[df_check_user_pre_1_year['tool']=='investigate']['week_n'], df_check_user_pre_1_year[df_check_user_pre_1_year['tool']=='investigate']['requests'], 'o--',label='Year 2020') ax[1].plot(df_check_user_pre_2_years[df_check_user_pre_2_years['tool']=='investigate']['week_n'], df_check_user_pre_2_years[df_check_user_pre_2_years['tool']=='investigate']['requests'], ':',label='Year 2019') ax[1].legend(loc='upper right') plt.show() protected_pages_query=''' SELECT weekofyear(from_unixtime(UNIX_timestamp(log_timestamp,'yyyyMMddHHmmss'))) AS week_n , COUNT(DISTINCT log_page) AS protected_pages FROM wmf_raw.mediawiki_logging WHERE snapshot ='{SNAPSHOT}' AND wiki_db='ptwiki' AND log_action = 'protect' AND substr(log_timestamp,1,8) BETWEEN '{START_YYYYMMDD}' AND '{END_YYYYMMDD}' GROUP BY weekofyear(from_unixtime(UNIX_timestamp(log_timestamp,'yyyyMMddHHmmss'))) ORDER BY week_n LIMIT 100000 ''' df_protected_pages=hive.run(protected_pages_query.format(SNAPSHOT=snapshot, START_YYYYMMDD=start_date_YYYYMMDD, END_YYYYMMDD=end_date_YYYYMMDD)) df_protected_pages_pre_1_year=hive.run(protected_pages_query.format(SNAPSHOT=snapshot, START_YYYYMMDD=lastyear_start_date_YYYYMMDD, END_YYYYMMDD=lastyear_end_date_YYYYMMDD)) df_protected_pages_pre_2_years=hive.run(protected_pages_query.format(SNAPSHOT=snapshot, START_YYYYMMDD=last2years_start_date_YYYYMMDD, END_YYYYMMDD=last2years_end_date_YYYYMMDD)) fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6)) fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6) ax.set_title('Ptwiki Protected Pages',fontweight="bold",fontsize = 16,y=1.08) ax.set_xlabel('Week', fontsize = 16) ax.set_ylabel('Protected Pages', fontsize = 16) ax.plot(df_protected_pages['week_n'][:-1], df_protected_pages['protected_pages'][:-1], 'o-',label='Year 2021') ax.plot(df_protected_pages_pre_1_year['week_n'], df_protected_pages_pre_1_year['protected_pages'], 'o--',label='Year 2020') ax.plot(df_protected_pages_pre_2_years['week_n'], df_protected_pages_pre_2_years['protected_pages'], ':',label='Year 2019') ax.set_xlim(1,52) ax.set_ylim(0,600) ax.vlines(40, 0, 600, colors='k', linestyles='dashdot', label='Turned off') ax.legend(loc='upper right') ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis ax2.set_xlim(0,12) ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']) ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5]) ax2.tick_params(length=0) ax2.set_xlabel('Month',fontsize=16) miloc = plt.MultipleLocator(1) ax2.xaxis.set_minor_locator(miloc) ax2.grid(axis='x', which='minor') plt.show() query_edits_quality_damaging_model=''' SELECT WEEKOFYEAR(rev_timestamp) AS week_n, ROUND(SUM(case when scores["damaging"].prediction[0]='true' then 1 else 0 end)/count(rev_id) ,4) AS damaging_rate FROM event_sanitized.mediawiki_revision_score WHERE year='{YEAR}' AND (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp < '{END_YYYY_MM_DD}') AND `database`='ptwiki' GROUP BY WEEKOFYEAR(rev_timestamp) ORDER BY week_n LIMIT 10000 ''' df_edits_quality_damaging_rate=hive.run(query_edits_quality_damaging_model.format(YEAR=yr, START_YYYY_MM_DD=start_date_YYYY_MM_DD , END_YYYY_MM_DD=end_date_YYYY_MM_DD)) df_edits_quality_damaging_rate_pre_1_year=hive.run(query_edits_quality_damaging_model.format(YEAR=pre_yr, START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD , END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD)) fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6)) fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6) ax.set_title('Ptwiki ORES Damaging Rate (Including IP Editors)',fontweight="bold",fontsize = 16,y=1.08) ax.set_xlabel('Week', fontsize = 16) ax.set_ylabel('Damages/Edits Rate', fontsize = 16) ax.plot(df_edits_quality_damaging_rate['week_n'], df_edits_quality_damaging_rate['damaging_rate'], 'o-',label='Year 2021') ax.plot(df_edits_quality_damaging_rate_pre_1_year['week_n'], df_edits_quality_damaging_rate_pre_1_year['damaging_rate'], 'o--',label='Year 2020') ax.set_xlim(1,52) ax.set_ylim(0,0.3) ax.legend(loc='upper right') ax.vlines(40, 0, 0.3, colors='k', linestyles='dashdot', label='Turned off') ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis ax2.set_xlim(0,12) ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']) ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5]) ax2.tick_params(length=0) ax2.set_xlabel('Month',fontsize=16) miloc = plt.MultipleLocator(1) ax2.xaxis.set_minor_locator(miloc) ax2.grid(axis='x', which='minor') plt.show() query_edits_quality_damaging_model_nonbot_registered_user=''' SELECT WEEKOFYEAR(rev_timestamp) AS week_n, ROUND(SUM(case when scores["damaging"].prediction[0]='true' then 1 else 0 end)/count(rev_id) ,4) AS damaging_rate FROM event_sanitized.mediawiki_revision_score WHERE year='{YEAR}' AND (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp <= '{END_YYYY_MM_DD}') AND `database`='ptwiki' -- exclude IP editors in baseline as ORES is biased against anonymous editors AND performer.user_id IS NOT NULL -- exclude bot editors AND not performer.user_is_bot AND performer.user_text not regexp "^.*bot([^a-z].*$|$)" GROUP BY WEEKOFYEAR(rev_timestamp) ORDER BY week_n LIMIT 10000 ''' df_edits_quality_damaging_rate_nonbot_registered_user=hive.run(query_edits_quality_damaging_model_nonbot_registered_user.format(YEAR=yr, START_YYYY_MM_DD=start_date_YYYY_MM_DD , END_YYYY_MM_DD=end_date_YYYY_MM_DD)) df_edits_quality_damaging_rate_nonbot_registered_user_pre_1_year=hive.run(query_edits_quality_damaging_model_nonbot_registered_user.format(YEAR=pre_yr, START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD , END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD)) fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6)) fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6) ax.set_title('Ptwiki ORES Damaging Rate (Registered Non-bot Editors)',fontweight="bold",fontsize = 16,y=1.08) ax.set_xlabel('Week', fontsize = 16) ax.set_ylabel('Damages/Edits Rate', fontsize = 16) ax.plot(df_edits_quality_damaging_rate_nonbot_registered_user['week_n'], df_edits_quality_damaging_rate_nonbot_registered_user['damaging_rate'], 'o-',label='Year 2021') ax.plot(df_edits_quality_damaging_rate_nonbot_registered_user_pre_1_year['week_n'], df_edits_quality_damaging_rate_nonbot_registered_user_pre_1_year['damaging_rate'], 'o--',label='Year 2020') ax.set_xlim(1,52) ax.set_ylim(0,0.15) ax.vlines(40, 0, 0.15, colors='k', linestyles='dashdot', label='Turned off') ax.legend(loc='upper right') ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis ax2.set_xlim(0,12) ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']) ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5]) ax2.tick_params(length=0) ax2.set_xlabel('Month',fontsize=16) miloc = plt.MultipleLocator(1) ax2.xaxis.set_minor_locator(miloc) ax2.grid(axis='x', which='minor') plt.show() print("This dashboard was generated on {} at {}. It is maintained by Jennifer Wang, Product Analytics. If you have questions or feedback please email jwang@wikimedia.org or product-analytics@wikimedia.org.".format(today.strftime("%Y-%m-%d"), now.strftime("%H:%M %Z")))
By running queries you agree to the
Cloud Services Terms of Use
and you irrevocably agree to release your SQL under
CC0 License
.
Submit Query
Stop Query
All SQL code is licensed under
CC0 License
.
Checking query status...