List of Wikipedias by sample of articles/Source code
This is the source code of a script that is used for updating the List of Wikipedias by sample of articles. Originally written by MarsRover, later tweaked and upgraded by Boivie and Dcirovic, additional minor fixes by Yerpo.
An old and out-of-date version by Smeira is available at Source code (original).
MakeScoreTable.py
# -*- coding: utf_8 -*-
#
# Include this dependency: https://www.mediawiki.org/wiki/Manual:Pywikibot/Installation#Download_Pywikibot_with_SVN
#
import sys
sys.path.append('c:/Users/jerne/Documents/Wiki/Pywikibot/core_stable')
import pywikibot
import traceback
import os
import re
from pywikibot.data import api
import simplejson as json
from time import strftime
from datetime import date
#language information
lang_info ={'en': {'name':'English', 'localname':'English', 'weight': 1.0},
'de': {'name':'German', 'localname':'Deutsch', 'weight':1.0},
'fr': {'name':'French', 'localname':'Français', 'weight':1.0},
'pl': {'name':'Polish', 'localname':'Polski', 'weight':1.1},
'ja': {'name':'Japanese', 'localname':'日本語', 'weight':1.9},
'it': {'name':'Italian', 'localname':'Italiano', 'weight':1.1},
'nl': {'name':'Dutch', 'localname':'Nederlands', 'weight':0.9},
'pt': {'name':'Portuguese', 'localname':'Português', 'weight':1.1},
'es': {'name':'Spanish', 'localname':'Español', 'weight':1.1},
'sv': {'name':'Swedish', 'localname':'Svenska', 'weight':1.1},
'ru': {'name':'Russian', 'localname':'Русский', 'weight':1.4},
'zh': {'name':'Chinese', 'localname':'中文', 'weight':3.7},
'no': {'name':'Norwegian (Bokmål)','localname':'Norsk (Bokmål)', 'weight':1.2},
'fi': {'name':'Finnish', 'localname':'Suomi', 'weight':1.1},
'vo': {'name':'Volapük', 'localname':'Volapük'},
'ca': {'name':'Catalan', 'localname':'Català', 'weight':1.1},
'ro': {'name':'Romanian', 'localname':'Română', 'weight':1.1},
'tr': {'name':'Turkish', 'localname':'Türkçe', 'weight':1.3},
'uk': {'name':'Ukrainian', 'localname':'Українська', 'weight':1.3},
'eo': {'name':'Esperanto', 'localname':'Esperanto', 'weight':1.1},
'cs': {'name':'Czech', 'localname':'Čeština', 'weight':1.3},
'hu': {'name':'Hungarian', 'localname':'Magyar', 'weight':1.1},
'sk': {'name':'Slovak', 'localname':'Slovenčina', 'weight':1.3},
'da': {'name':'Danish', 'localname':'Dansk', 'weight':1.2},
'id': {'name':'Indonesian', 'localname':'Bahasa Indonesia', 'weight':0.9},
'he': {'name':'Hebrew', 'localname':'עברית', 'weight':1.2},
'lt': {'name':'Lithuanian', 'localname':'Lietuvių', 'weight':1.2},
'sr': {'name':'Serbian', 'localname':'Српски / Srpski', 'weight':1.4},
'sl': {'name':'Slovenian', 'localname':'Slovenščina', 'weight':1.2},
'ko': {'name':'Korean', 'localname':'한국어', 'weight':2.5},
'ar': {'name':'Arabic', 'localname':'العربية', 'weight':1.0},
'bg': {'name':'Bulgarian', 'localname':'Български', 'weight':1.1},
'et': {'name':'Estonian', 'localname':'Eesti', 'weight':1.2},
'hr': {'name':'Croatian', 'localname':'Hrvatski', 'weight':1.3},
'new':{'name':'Newar / Nepal Bhasa','localname':'नेपाल भाषा'},
'te': {'name':'Telugu', 'localname':'తెలుగు'},
'vi': {'name':'Vietnamese', 'localname':'Tiếng Việt', 'weight':1.1},
'th': {'name':'Thai', 'localname':'ไทย', 'weight':1.0},
'gl': {'name':'Galician', 'localname':'Galego', 'weight':1.1},
'fa': {'name':'Persian', 'localname':'فارسی', 'weight':1.2},
'nn': {'name':'Norwegian (Nynorsk)','localname':'Nynorsk', 'similar_lang':'no'},
'ceb':{'name':'Cebuano', 'localname':'Sinugboanong Binisaya', 'weight':0.8},
'el': {'name':'Greek', 'localname':'Ελληνικά', 'weight':1.1},
'ms': {'name':'Malay', 'localname':'Bahasa Melayu', 'weight':1.0},
'simple':{'name':'Simple English','localname':'Simple English'},
'eu': {'name':'Basque', 'localname':'Euskara', 'weight':1.1},
'bpy':{'name':'Bishnupriya Manipuri','localname':'ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী'},
'bs': {'name':'Bosnian', 'localname':'Bosanski', 'similar_lang':'hr'},
'lb': {'name':'Luxembourgish','localname':'Lëtzebuergesch'},
'is': {'name':'Icelandic', 'localname':'Íslenska', 'weight':1.1},
'ka': {'name':'Georgian', 'localname':'ქართული'},
'sq': {'name':'Albanian', 'localname':'Shqip'},
'la': {'name':'Latin', 'localname':'Latina', 'weight':1.1},
'br': {'name':'Breton', 'localname':'Brezhoneg'},
'az': {'name':'Azeri', 'localname':'Azərbaycan', 'weight':1.2},
'hi': {'name':'Hindi', 'localname':'हिन्दी', 'weight':1.0},
'bn': {'name':'Bengali', 'localname':'বাংলা'},
'ht': {'name':'Haitian', 'localname':'Krèyol ayisyen'},
'mk': {'name':'Macedonian', 'localname':'Македонски', 'weight':1.3},
'mr': {'name':'Marathi', 'localname':'मराठी'},
'sh': {'name':'Serbo-Croatian','localname':'Srpskohrvatski / Српскохрватски', 'similar_lang':'hr'},
'tl': {'name':'Tagalog', 'localname':'Tagalog'},
'io': {'name':'Ido', 'localname':'Ido'},
'cy': {'name':'Welsh', 'localname':'Cymraeg', 'weight':1.2},
'pms':{'name':'Piedmontese', 'localname':'Piemontèis'},
'lv': {'name':'Latvian', 'localname':'Latviešu', 'weight':1.1},
'su': {'name':'Sundanese', 'localname':'Basa Sunda'},
'ta': {'name':'Tamil', 'localname':'தமிழ்', 'weight':0.9},
'jv': {'name':'Javanese', 'localname':'Basa Jawa'},
'nap':{'name':'Neapolitan', 'localname':'Nnapulitano'},
'oc': {'name':'Occitan', 'localname':'Occitan'},
'nds':{'name':'Low Saxon', 'localname':'Plattdüütsch'},
'scn':{'name':'Sicilian', 'localname':'Sicilianu'},
'ast':{'name':'Asturian', 'localname':'Asturianu'},
'ku': {'name':'Kurdish', 'localname':'Kurdî / كوردی'},
'be': {'name':'Belarusian', 'localname':'Беларуская', 'similar_lang':'be-x-old'},
'wa': {'name':'Walloon', 'localname':'Walon'},
'af': {'name':'Afrikaans', 'localname':'Afrikaans', 'weight':1.0},
'be-x-old':{'name':'Belarusian (Taraškievica)','localname':'Беларуская (тарашкевіца)', 'weight':1.4},
'tg': {'name':'Tajik', 'localname':'Тоҷикӣ'},
'an': {'name':'Aragonese', 'localname':'Aragonés', 'weight':1.1},
'fy': {'name':'West Frisian','localname':'Frysk'},
'vec':{'name':'Venetian', 'localname':'Vèneto'},
'roa-tara':{'name':'Tarantino', 'localname':'Tarandíne'},
'cv': {'name':'Chuvash', 'localname':'Чăваш'},
'zh-yue':{'name':'Cantonese', 'localname':'粵語', 'similar_lang':'zh'},
'ur': {'name':'Urdu', 'localname':'اردو'},
'ksh':{'name':'Ripuarian', 'localname':'Ripoarisch'},
'sw': {'name':'Swahili', 'localname':'Kiswahili'},
'qu': {'name':'Quechua', 'localname':'Runa Simi'},
'uz': {'name':'Uzbek', 'localname':'O‘zbek'},
'mi': {'name':'Maori', 'localname':'Māori'},
'ga': {'name':'Irish', 'localname':'Gaeilge'},
'bat-smg':{'name':'Samogitian', 'localname':'Žemaitėška'},
'ml': {'name':'Malayalam', 'localname':'മലയാളം', 'weight':1.1},
'gd': {'name':'Scottish Gaelic','localname':'Gàidhlig'},
'yo': {'name':'Yoruba', 'localname':'Yorùbá'},
'co': {'name':'Corsican', 'localname':'Corsu'},
'kn': {'name':'Kannada', 'localname':'ಕನ್ನಡ', 'weight':0.9},
'pam':{'name':'Kapampangan', 'localname':'Kapampangan'},
'yi': {'name':'Yiddish', 'localname':'ייִדיש'},
'hsb':{'name':'Upper Sorbian','localname':'Hornjoserbsce'},
'nah':{'name':'Nahuatl', 'localname':'Nāhuatl'},
'ia': {'name':'Interlingua', 'localname':'Interlingua', 'weight':1.0},
'li': {'name':'Limburgian', 'localname':'Limburgs'},
'sa': {'name':'Sanskrit', 'localname':'संस्कृतम्'},
'hy': {'name':'Armenian', 'localname':'Հայերեն', 'weight':1.2},
'tt': {'name':'Tatar', 'localname':'Tatarça / Татарча'},
'als':{'name':'Alemannic', 'localname':'Alemannisch', 'weight':1.1},
'roa-rup':{'name':'Aromanian', 'localname':'Armãneashce'},
'lmo':{'name':'Lombard', 'localname':'Lumbaart'},
'map-bms':{'name':'Banyumasan', 'localname':'Basa Banyumasan'},
'am': {'name':'Amharic', 'localname':'አማርኛ'},
'nrm':{'name':'Norman', 'localname':'Nouormand/Normaund'},
'zh-min-nan':{'name':'Min Nan', 'localname':'Bân-lâm-gú', 'weight':1.2},
'pag':{'name':'Pangasinan', 'localname':'Pangasinan'},
'wuu':{'name':'Wu', 'localname':'吴语', 'similar_lang':'zh'},
'fo': {'name':'Faroese', 'localname':'Føroyskt'},
'vls':{'name':'West Flemish','localname':'West-Vlams'},
'nds-nl':{'name':'Dutch Low Saxon','localname':'Nedersaksisch'},
'se': {'name':'Northern Sami','localname':'Sámegiella'},
'rm': {'name':'Romansh', 'localname':'Rumantsch'},
'ne': {'name':'Nepali', 'localname':'नेपाली'},
'war':{'name':'Waray-Waray', 'localname':'Winaray'},
'fur':{'name':'Friulian', 'localname':'Furlan'},
'lij':{'name':'Ligurian', 'localname':'Ligure'},
'nov':{'name':'Novial', 'localname':'Novial'},
'bh': {'name':'Bihari', 'localname':'भोजपुरी'},
'sco':{'name':'Scots', 'localname':'Scots'},
'dv': {'name':'Divehi', 'localname':'ދިވެހިބަސް'},
'pi': {'name':'Pali', 'localname':'पाऴि'},
'diq':{'name':'Zazaki', 'localname':'Zazaki'},
'ilo':{'name':'Ilokano', 'localname':'Ilokano'},
'kk': {'name':'Kazakh', 'localname':'Қазақша', 'weight':1.3},
'os': {'name':'Ossetian', 'localname':'Иронау'},
'frp':{'name':'Franco-Provençal/Arpitan','localname':'Arpitan'},
'zh-classical':{'name':'Classical Chinese','localname':'古文 / 文言文', 'similar_lang':'zh'},
'mt': {'name':'Maltese', 'localname':'Malti'},
'lad':{'name':'Ladino', 'localname':'Dzhudezmo'},
'fiu-vro':{'name':'Võro', 'localname':'Võro'},
'pdc':{'name':'Pennsylvania German','localname':'Deitsch'},
'csb':{'name':'Kashubian', 'localname':'Kaszëbsczi'},
'kw': {'name':'Cornish', 'localname':'Kernewek'},
'bar':{'name':'Bavarian', 'localname':'Boarisch'},
'to': {'name':'Tongan', 'localname':'faka Tonga'},
'haw':{'name':'Hawaiian', 'localname':'Hawai`i'},
'mn': {'name':'Mongolian', 'localname':'Монгол'},
'ps': {'name':'Pashto', 'localname':'پښتو'},
'ang':{'name':'Anglo-Saxon', 'localname':'Englisc'},
'km': {'name':'Khmer', 'localname':'ភាសាខ្មែរ'},
'gv': {'name':'Manx', 'localname':'Gaelg'},
'tk': {'name':'Turkmen', 'localname':'تركمن / Туркмен'},
'ln': {'name':'Lingala', 'localname':'Lingala'},
'ie': {'name':'Interlingue', 'localname':'Interlingue'},
'tpi':{'name':'Tok Pisin', 'localname':'Tok Pisin'},
'crh':{'name':'Crimean Tatar','localname':'Qırımtatarca'},
'jbo':{'name':'Lojban', 'localname':'Lojban', 'weight':1.2},
'wo': {'name':'Wolof', 'localname':'Wolof'},
'ay': {'name':'Aymara', 'localname':'Aymar'},
'zea':{'name':'Zealandic', 'localname':'Zeêuws'},
'eml':{'name':'Emilian-Romagnol','localname':'Emiliàn e rumagnòl'},
'si': {'name':'Sinhalese', 'localname':'සිංහල'},
'sc': {'name':'Sardinian', 'localname':'Sardu'},
'or': {'name':'Oriya', 'localname':'ଓଡ଼ିଆ'},
'ig': {'name':'Igbo', 'localname':'Igbo'},
'mg': {'name':'Malagasy', 'localname':'Malagasy'},
'cbk-zam':{'name':'Zamboanga Chavacano','localname':'Chavacano de Zamboanga'},
'gu': {'name':'Gujarati', 'localname':'ગુજરાતી'},
'ky': {'name':'Kirghiz', 'localname':'Кыргызча'},
'kg': {'name':'Kongo', 'localname':'KiKongo'},
'ty': {'name':'Tahitian', 'localname':'Reo Mā`ohi'},
'glk':{'name':'Gilaki', 'localname':'گیلکی'},
'arc':{'name':'Assyrian Neo-Aramaic','localname':'ܐܪܡܝܐ'},
'gn': {'name':'Guarani', 'localname':'Avañe\'ẽ'},
'kab':{'name':'Kabyle', 'localname':'Taqbaylit'},
'so': {'name':'Somali', 'localname':'Soomaaliga'},
'ks': {'name':'Kashmiri', 'localname':'कश्मीरी / كشميري'},
'stq':{'name':'Saterland Frisian','localname':'Seeltersk'},
'mzn':{'name':'Mazandarani', 'localname':'مَزِروني'},
'cu': {'name':'Old Church Slavonic','localname':'Словѣньскъ'},
'ce': {'name':'Chechen', 'localname':'Нохчийн'},
'udm':{'name':'Udmurt', 'localname':'Удмурт кыл'},
'tet':{'name':'Tetum', 'localname':'Tetun'},
'sd': {'name':'Sindhi', 'localname':'سنڌي'},
'pap':{'name':'Papiamentu', 'localname':'Papiamentu'},
'ba': {'name':'Bashkir', 'localname':'Башҡорт', 'weight':1.4},
'pa': {'name':'Punjabi', 'localname':'ਪੰਜਾਬੀ'},
'rmy':{'name':'Romani', 'localname':'romani - रोमानी'},
'lo': {'name':'Lao', 'localname':'ລາວ'},
'bcl':{'name':'Central Bicolano','localname':'Bikol'},
'kaa':{'name':'Karakalpak', 'localname':'Qaraqalpaq tili'},
'gan':{'name':'Gan', 'localname':'贛語', 'similar_lang':'zh'},
'iu': {'name':'Inuktitut', 'localname':'ᐃᓄᒃᑎᑐᑦ'},
'myv':{'name':'Erzya', 'localname':'Эрзянь (Erzjanj Kelj)'},
'szl':{'name':'Silesian', 'localname':'Ślůnski'},
'sah':{'name':'Sakha', 'localname':'Саха тыла (Saxa Tyla)'},
'my': {'name':'Burmese', 'localname':'Burmese'},
'ext':{'name':'Extremaduran','localname':'Estremeñu'},
'hif':{'name':'Fiji Hindi', 'localname':'Fiji Hindi'},
'bo': {'name':'Tibetan', 'localname':'བོད་སྐད་'},
'srn':{'name':'Sranan', 'localname':'Sranantongo'},
'got':{'name':'Gothic', 'localname':'𐌲𐌿𐍄𐌹𐍃𐌺'},
'dsb':{'name':'Lower Sorbian','localname':'Dolnoserbšćina'},
'bm': {'name':'Bambara', 'localname':'Bamanankan'},
'sm': {'name':'Samoan', 'localname':'Gagana Samoa'},
'cdo':{'name':'Min Dong', 'localname':'Mìng-dĕ̤ng-ngṳ̄'},
'chr':{'name':'Cherokee', 'localname':'ᏣᎳᎩ ᎧᏬᏂᎯᏍᏗ'},
'mdf':{'name':'Moksha', 'localname':'Мокшень (Mokshanj Kälj)'},
'om': {'name':'Oromo', 'localname':'Oromoo'},
'ee': {'name':'Ewe', 'localname':'Eʋegbe'},
'as': {'name':'Assamese', 'localname':'অসমীয়া ভাষা আৰু লিপি'},
'ti': {'name':'Tigrinya', 'localname':'ትግርኛ_ፊደል'},
'ug': {'name':'Uyghur', 'localname':'Oyghurque'},
'kv': {'name':'Komi', 'localname':'Коми'},
'zu': {'name':'Zulu', 'localname':'IsiZulu'},
'av': {'name':'Avar', 'localname':'Авар'},
'nv': {'name':'Navajo', 'localname':'Diné bizaad'},
'ss': {'name':'Swati', 'localname':'SiSwati'},
'pih':{'name':'Norfolk', 'localname':'Norfuk'},
'ts': {'name':'Tsonga', 'localname':'Xitsonga'},
'cr': {'name':'Cree', 'localname':'Nehiyaw'},
've': {'name':'Venda', 'localname':'TshiVenda'},
'ch': {'name':'Chamorro', 'localname':'Chamoru'},
'bi': {'name':'Bislama', 'localname':'Bislama'},
'xh': {'name':'Xhosa', 'localname':'IsiXhosa'},
'rw': {'name':'Kinyarwanda', 'localname':'Kinyarwanda'},
'dz': {'name':'Dzongkha', 'localname':'རྫོང་ཁ་'},
'tn': {'name':'Tswana', 'localname':'Setswana'},
'kl': {'name':'Greenlandic', 'localname':'Kalaallisut'},
'bug':{'name':'Buginese', 'localname':'Basa Ugi'},
'ik': {'name':'Inupiak', 'localname':'Iñupiak uqautchit'},
'bxr':{'name':'Buryat (Russia)','localname':'Буряад'},
'st': {'name':'Sesotho', 'localname':'Sesotho'},
'xal':{'name':'Kalmyk', 'localname':'Хальмг келн'},
'ny': {'name':'Chichewa', 'localname':'Chicheŵa'},
'ab': {'name':'Abkhazian', 'localname':'Аҧсуа бызшәа'},
'fj': {'name':'Fijian', 'localname':'Na Vosa Vakaviti'},
'lg': {'name':'Luganda', 'localname':'Luganda'},
'tw': {'name':'Twi', 'localname':'Twi'},
'ha': {'name':'Hausa', 'localname':'هَوُسَ'},
'za': {'name':'Zhuang', 'localname':'Sawcuengh'},
'ff': {'name':'Fula', 'localname':'Fulfulde'},
'lbe':{'name':'Lak', 'localname':'Лакку маз'},
'ki': {'name':'Kikuyu', 'localname':'Gĩgĩkũyũ'},
'sn': {'name':'Shona', 'localname':'ChiShona'},
'tum':{'name':'Tumbuka', 'localname':'ChiTumbuka'},
'sg': {'name':'Sango', 'localname':'Sängö'},
'chy':{'name':'Cheyenne', 'localname':'Tsetsêhestâhese'},
'rn': {'name':'Kirundi', 'localname':'Kirundi'},
'arz':{'name':'Egyptian Arabic', 'localname':'مصرى (Maṣrī)', 'similar_lang':'ar'},
'pnt':{'name':'Pontic', 'localname':'Ποντιακά', 'similar_lang':'el'},
'mhr':{'name':'Meadow Mari', 'localname':'Олык Марий'},
'ace':{'name':'Acehnese', 'localname':'Acèh'},
'ckb':{'name':'Soranî', 'localname':'Soranî / کوردی'},
'mwl':{'name':'Mirandese', 'localname':'Mirandés'},
'pnb':{'name':'Western Panjabi', 'localname':'پنجابی'},
'pcd':{'name':'Picard', 'localname':'Picard'},
'krc':{'name':'Karachay-Balkar', 'localname':'Къарачай-Малкъар'},
'frr':{'name':'North Frisian', 'localname':'Nordfriisk'},
'bjn':{'name':'Banjar', 'localname':'Bahasa Banjar'},
'mrj':{'name':'Hill Mari', 'localname':'Кырык Мары (Kyryk Mary)'},
'koi':{'name':'Komi-Permyak', 'localname':'Перем Коми (Perem Komi)'},
'gag':{'name':'Gagauz', 'localname':'Gagauz'},
'pfl':{'name':'Palatinate German','localname':'Pfälzisch'},
'rue':{'name':'Rusyn', 'localname':'русиньскый язык'},
'ltg':{'name':'Latgalian', 'localname':'Latgaļu volūda'},
'kbd':{'name':'Kabardian', 'localname':'Aдыгэбзэ'},
'xmf':{'name':'Mingrelian', 'localname':'მარგალური'},
'nso':{'name':'Northern Sotho', 'localname':'Sesotho sa Leboa'},
'vep':{'name':'Veps', 'localname':'Vepsän kel\''},
'lez':{'name':'Lezgi', 'localname':'Лезги'},
'min':{'name':'Minangkabau', 'localname':'Minangkabau'},
'tyv':{'name':'Tuva', 'localname':'Тыва дыл'},
'hak':{'name':'Hakka', 'localname':'Hak-kâ-fa / 客家話'},
'mai':{'name':'Maithili', 'localname':'मैथिली'},
'gom':{'name':'Konkani', 'localname':'कोंकणी / Konknni'},
'ady':{'name':'Western Adyghe', 'localname':'адыгабзэ'},
'azb':{'name':'South Azerbaijani','localname':'تۆرکجه'},
'jam':{'name':'Patois', 'localname':'Jamaican Creole English'},
'olo':{'name':'Livvi-Karelian', 'localname':'Livvinkarjala'},
'tcy':{'name':'Tulu', 'localname':'ತುಳು'},
'kbp':{'name':'Kabiye', 'localname':'Kabɩyɛ'},
'atj':{'name':'Atikamekw', 'localname':'Atikamekw'},
'dty':{'name':'Doteli', 'localname':'डोटेली'},
'inh':{'name':'Ingush', 'localname':'Гӏалгӏай'},
'gor':{'name':'Gorontalo', 'localname':'Hulontalo'},
'lfn':{'name':'Lingua Franca Nova', 'localname':'Lingua Franca Nova'},
'sat':{'name':'Santali', 'localname':'ᱥᱟᱱᱛᱟᱲᱤ'},
'din':{'name':'Dinka', 'localname':'Thuɔŋjäŋ'},
'hyw':{'name':'West Armenian', 'localname':'Արեւմտահայերէն'},
'shn':{'name':'Shan', 'localname':'လိၵ်ႈတႆး'},
'szy':{'name':'Sakizaya', 'localname':'Sakizaya'},
'ban':{'name':'Balinese ', 'localname':'Bali'},
'gcr':{'name':'French Guianese Creole', 'localname':'Kriyòl Gwiyannen'},
'nqo':{'name':'N\'Ko', 'localname':'ߒߞߏ'},
'mnw':{'name':'Mon', 'localname':'မန်'},
'skr':{'name':'Saraiki', 'localname':'سرائیکی'},
'mad':{'name':'Madurese', 'localname':'Madhurâ'},
'smn':{'name':'Inari Sami', 'localname':'Anarâškielâ'},
'lld':{'name':'Ladin', 'localname':'Lingaz'},
'avk':{'name':'Kotava', 'localname':'Kotava'},
'ary':{'name':'Moroccan Arabic', 'localname':'الدارجة'},
'mni':{'name':'Meitei', 'localname':'ꯃꯤꯇꯩꯂꯣꯟ'},
'alt':{'name':'Southern Altai', 'localname':'Алтай'},
'nia':{'name':'Nias', 'localname':'Li Niha'},
'dag':{'name':'Dagbani', 'localname':'Dagbanli'},
'pcm':{'name':'Nigerian Pidgin', 'localname':'Naijá'},
'blk':{'name':'Pa\'O', 'localname':'ပအိုဝ်ႏဘာႏသာႏ'},
'kcg':{'name':'Tyap', 'localname':'Tyap'},
'guw':{'name':'Gun', 'localname':'Gungbe'},
'ami':{'name':'Amis', 'localname':'Pangcah'},
'pwn':{'name':'Paiwan', 'localname':'Pinayuanan'},
'trv':{'name':'Taroko', 'localname':'Seediq'},
'tay':{'name':'Atayal', 'localname':'Tayal'},
'gur':{'name':'Frafra', 'localname':'Farefare'},
'ti':{'name':'Tigrinya', 'localname':'ትግርኛ'},
'anp':{'name':'Angika', 'localname':'अंगिका'},
'awa':{'name':'Awadhi', 'localname':'अवधी'},
'fat':{'name':'Fanti', 'localname':'mfantse'},
'gpe':{'name':'Ghanaian Pidgin', 'localname':'Ghanaian Pidgin'},
'guc':{'name':'Wayuu', 'localname':'wayuunaiki'},
'shi':{'name':'Tachelhit', 'localname':'Taclḥit'},
'tly':{'name':'Talysh', 'localname':'tolışi'},
'fon':{'name':'Fon', 'localname':'fɔ̀ngbè'}
}
# after adding languages, update \Pywikibot\core_stable\pywikibot\families\wikipedia_family.py
# closed wikis
# 'ii': {'name':'Sichuan Yi', 'localname':'ꆇꉙ'},
# 'cho':{'name':'Choctaw', 'localname':'Chahta Anumpa'},
# 'mh': {'name':'Marshallese', 'localname':'Kajin M̧ajeļ'},
# 'mo': {'name':'Moldovan', 'localname':'Молдовеняскэ'},
# 'aa': {'name':'Afar', 'localname':'Afar'},
# 'ng': {'name':'Ndonga', 'localname':'Oshiwambo'},
# 'kj': {'name':'Kuanyama', 'localname':'Kuanyama'},
# 'ho': {'name':'Hiri Motu', 'localname':'Hiri Motu'},
# 'mus':{'name':'Muscogee', 'localname':'Muskogee'},
# 'kr': {'name':'Kanuri', 'localname':' Kanuri'},
# 'hz': {'name':'Herero', 'localname':'Otsiherero'},
# 'tokipona':{'name':'Tokipona', 'localname':'Tokipona'},
# 'lrc':{'name':'Northern Luri', 'localname':'لۊری شومالی'},
# 'na': {'name':'Nauruan', 'localname':'dorerin Naoero'},
# 'ak': {'name':'Akan', 'localname':'Akana'},
# Language codes to process, in sorted order
lang_keys = sorted(lang_info)

# Encoding used for the text and JSON files this script reads and writes
textfile_encoding = 'utf-8'

# Caches that are filled while the script runs
iw_cache = {}
en_labels = {}
item_list = []

# Debug aid: a positive value caps how many articles are evaluated per
# language; -1 disables the cap
max_words = -1

# Scores of the previous run, filled by GetPreviousScores()
prev_score = {}

# Score colors (hex RGB); NOTE(review): the numeric suffix looks like the
# lower bound each color applies to -- the code using them is not in view
color10000 = '440154'
color8000 = '472d7b'
color6000 = '3b528b'
color4000 = '2c728e'
color3000 = '21918c'
color2000 = '28ae80'
color1000 = '5ec962'
color500 = 'addc30'
color100 = 'fde725'
color0 = 'EFEFEF'

# File holding one item id per line
item_list_path = "ItemList.txt"
def ListOfArticles():
    """Return the Meta-Wiki page 'List of articles every Wikipedia should have'.

    Only the page object is returned; callers fetch the text themselves.
    """
    site = pywikibot.Site('meta', 'meta')
    return pywikibot.Page(site, 'List of articles every Wikipedia should have')
def LoadItemList():
    """Create the item list file from the Meta-Wiki list, unless it already exists.

    The list page is scanned for ``[[d:`` links. For each one, the text between
    ``[[d:`` and the first ``|`` (or the closing ``]]`` when there is no pipe
    inside the link) is written to ``item_list_path``, one item per line.
    A ``== heading ==`` found between two links is echoed to the console and
    restarts the per-section counter. Progress and a grand total are printed.
    """
    item_path = item_list_path
    if os.path.isfile(item_path):
        return

    # Fetch the page BEFORE creating the file: if the download fails, no empty
    # item file is left behind that a later run would mistake for a finished one.
    page = ListOfArticles()
    article = page.get(get_redirect=False)

    count = 0
    grand_total = 0
    name_last = 0
    with open(item_path, 'w', encoding=textfile_encoding) as f:
        name_first = article.find('[[d:', name_last)
        while name_first > -1:
            name_mid = article.find('|', name_first)

            # A section heading between the previous link and this one starts
            # a new category.
            cat_start = article.rfind('\n== ', name_last, name_first)
            if cat_start > -1:
                cat_end = article.find('==', cat_start + 3, name_first)
                if cat_end > -1:
                    cat = article[cat_start + 3:cat_end]
                    catName = ''.center(len(cat), '-')
                    pywikibot.output('\n%s' % cat)
                    pywikibot.output('\n%s' % catName)
                    count = 0

            # The item ends at the pipe or at the closing brackets, whichever
            # comes first. A missing delimiter (-1) must not be used as a slice
            # bound, otherwise the item text would be garbage.
            link_end = article.find(']]', name_first)
            candidates = [pos for pos in (name_mid, link_end) if pos > -1]
            if not candidates:
                # unterminated link at the end of the page: nothing to extract
                break
            name_last = min(candidates)

            article_item = article[name_first + 4:name_last]
            f.write(str(article_item))
            f.write("\n")
            count += 1
            grand_total += 1
            pywikibot.output('%d %s' % (count, article_item))
            name_first = article.find('[[d:', name_last)

    pywikibot.output('\nGRAND TOTAL\n-----------\n%d articles' % (grand_total))
def GetItemList():
    """Fill the global ``item_list`` from the item file.

    Calls LoadItemList() first so the file exists. An item that appears more
    than once is stored once; every repeat is reported on the console and
    appended to errorlog.txt.
    """
    LoadItemList()
    with open(item_list_path, 'r', encoding=textfile_encoding) as item_file:
        for line in item_file:
            # Strip only the line terminator: cutting the last character
            # unconditionally would truncate the final item when the file does
            # not end with a newline.
            item = line.rstrip('\n')
            if item in item_list:
                errortext = item + " twice in list\n"
                pywikibot.output(errortext)
                with open("errorlog.txt", "a", encoding=textfile_encoding) as errorlog:
                    errorlog.write(errortext)
            else:
                item_list.append(item)
def _append_error_log(errortext):
    """Append one message to errorlog.txt."""
    with open("errorlog.txt", "a", encoding=textfile_encoding) as errorlog:
        errorlog.write(errortext)


def GetManyIws(itemlist):
    """Fetch sitelinks and English labels for a batch of Wikidata items.

    One API query (with redirects resolved) is sent for all titles in
    ``itemlist``. For every page returned, the JSON content is parsed and

    * ``iw_cache[item]`` receives a mapping of sitelink key -> page title,
    * ``en_labels[item]`` receives the English label, or the item id when
      there is no English label.

    The global ``item_list`` is kept in sync: redirect targets are appended,
    redirect sources and items without content or links are removed. Each such
    event is appended to errorlog.txt.

    Args:
        itemlist: list of item titles to query.

    Returns:
        dict mapping each returned page title to its English label (or to the
        title itself when no label is available).
    """
    pipedword = '|'.join(itemlist)
    wiki = pywikibot.Site('wikidata', 'wikidata')
    params = {
        'action': 'query',
        'prop': 'revisions',
        'redirects': True,
        'titles': pipedword,
        'rvprop': 'content',
        'rvslots': 'main'
    }
    pageRequest = api.Request(parameters=params, site=wiki)
    queryresult = pageRequest.submit()
    pages = queryresult['query']['pages']
    word_text = {}
    newitemlist = set()
    for v in pages.values():
        item = v['title']
        newitemlist.add(item)
        if item not in itemlist:
            # A title we did not ask for is the target of a redirect.
            print('not in ', item)
            item_list.append(item)
            _append_error_log(item + " is redirected to.\n")
        try:
            pagetext = v['revisions'][0]['slots']['main']['*']
        except (KeyError, IndexError, TypeError):
            errortext = item + " has no wikidata item\n"
            if item in item_list:
                item_list.remove(item)
            pywikibot.output(errortext)
            _append_error_log(errortext)
            # Without content there is nothing to parse. Falling through would
            # either fail on an undefined name or silently reuse the previous
            # page's text for this item.
            continue
        data_dict = json.loads(pagetext)

        # Current entity JSON uses 'sitelinks'; 'links' is kept as a fallback
        # for the older layout the original code also accepted.
        if 'sitelinks' in data_dict:
            iw_link_info = data_dict['sitelinks']
        else:
            iw_link_info = data_dict.get('links')
        iw_links = {}
        print(item)
        try:
            for linkkey, linkvalue in iw_link_info.items():
                iw_links[linkkey] = linkvalue['title']
        except (AttributeError, KeyError, TypeError):
            # e.g. the link container is missing or is not a mapping
            errortext = item + " has no links\n"
            if item in item_list:
                item_list.remove(item)
            pywikibot.output(errortext)
            _append_error_log(errortext)

        try:
            labels = data_dict['labels']
            if 'en' in labels:
                en_labels[item] = labels['en']['value']
            else:
                en_labels[item] = item
        except (KeyError, TypeError):
            # Older layout: 'label' maps the language code straight to text.
            labels = data_dict.get('label', {})
            if 'en' in labels:
                en_labels[item] = labels['en']
            else:
                en_labels[item] = item

        iw_cache[item] = iw_links
        word_text[item] = en_labels[item]
    pywikibot.output(str(word_text.values()))

    # Requested titles that did not come back were redirected elsewhere.
    redir_items = [x for x in itemlist if x not in newitemlist]
    for redir_item in redir_items:
        # The same title can be listed twice in the item file; only remove it
        # while it is still present instead of raising ValueError.
        if redir_item in item_list:
            item_list.remove(redir_item)
        _append_error_log(redir_item + " is redirected from.\n")
    return word_text
def GetIwLinks():
    """Load the sitelink and English-label caches, building them when absent.

    When both IwLinks.json and Labels.json exist, they are read into the
    globals ``iw_cache`` and ``en_labels``. Otherwise the item file is read and
    GetManyIws() is called for batches of 10 items (which fills both globals),
    after which both caches are written to disk.

    Returns:
        None when the caches were read from disk, the string "cleared" after
        they were rebuilt.
    """
    global iw_cache
    global en_labels
    iw_link_path = "IwLinks.json"
    en_label_path = "Labels.json"

    # Require BOTH files: with only the links file present, opening the labels
    # file would fail half-way through loading.
    if os.path.isfile(iw_link_path) and os.path.isfile(en_label_path):
        with open(iw_link_path, 'r', encoding=textfile_encoding) as iwf:
            iw_cache = json.load(iwf)
        with open(en_label_path, 'r', encoding=textfile_encoding) as enf:
            en_labels = json.load(enf)
        return

    article_group = []
    with open(item_list_path, encoding=textfile_encoding) as item_file:
        for line in item_file:
            # strip only the newline so a final line without one stays intact
            article_group.append(line.rstrip('\n'))
            if len(article_group) == 10:  # batch size per API request
                GetManyIws(article_group)
                article_group = []
    if article_group:
        GetManyIws(article_group)

    with open(iw_link_path, 'w', encoding=textfile_encoding) as iwf:
        json.dump(iw_cache, iwf)
    with open(en_label_path, 'w', encoding=textfile_encoding) as enf:
        json.dump(en_labels, enf)
    return "cleared"
# format with comma as thousands separator
def FormatNumber(s):
    """Return ``s`` truncated to an integer, with commas between thousands.

    Args:
        s: anything ``int()`` accepts (int, float, numeric string).

    Returns:
        str such as ``'1,234,567'``. The sign of a negative number is kept in
        front of the first digit (``'-123'``, never ``'-,123'``).
    """
    return format(int(s), ',')
def GetPreviousScores():
    """Read PreviousScores.txt into the global ``prev_score`` dict.

    Each line is split on whitespace; the first token becomes the key and the
    second is stored as a float. Lines with fewer than two tokens (for example
    blank lines) are skipped. Nothing happens when the file does not exist.

    Raises:
        ValueError: if the second token of a line is not a number.
    """
    temp_path = "PreviousScores.txt"
    if not os.path.isfile(temp_path):
        return
    with open(temp_path, encoding=textfile_encoding) as temp_file:
        for line in temp_file:
            tokens = line.split()
            if len(tokens) < 2:
                # blank or incomplete line: nothing to record
                continue
            prev_score[tokens[0]] = float(tokens[1])
def GetArticle(item, wiki, lang):
    """Return the wikitext of the article for ``item`` on ``wiki``.

    The local title is looked up with GetArticleInterwikiName(); an empty
    title yields an empty string. When the fetched text contains '#REDIRECT'
    (case-insensitively), the first ``[[...]]`` target is fetched instead.
    """
    title = GetArticleInterwikiName(item, lang)
    if len(title) == 0:
        return ''

    text = pywikibot.Page(wiki, title).get(get_redirect=True)
    if '#REDIRECT' not in text.upper():
        return text

    # follow the redirect by hand: take whatever sits in the first link
    link_open = text.find('[[')
    link_close = text.find(']]', link_open)
    target = text[link_open + 2:link_close]
    return pywikibot.Page(wiki, target).get()
def GetArticleInterwikiName(item, lang):
    """Return the title of the article for ``item`` on the ``lang`` Wikipedia.

    Sitelinks are taken from ``iw_cache``. On a cache miss the item is fetched
    from Wikidata, and both ``iw_cache`` and ``en_labels`` are updated.

    Args:
        item: Wikidata item id.
        lang: language code; '-' is replaced by '_' and 'wiki' is appended to
            form the sitelink key.

    Returns:
        The local title, or '' when the item cannot be fetched or has no
        sitelink for that wiki.
    """
    if item in iw_cache:
        iw_links = iw_cache[item]
    else:
        wikidata = pywikibot.Site('wikidata', 'wikidata')
        try:
            # NOTE(review): DataPage may not exist in current Pywikibot
            # releases -- confirm; a failure here is reported and yields ''.
            datapage = pywikibot.DataPage(wikidata, item)
            data_dict = datapage.get()
        except Exception:
            # Exception rather than a bare except, so that Ctrl-C and
            # SystemExit still stop the run.
            print(('Where is ' + item))
            return ''
        iw_links = data_dict['links']
        labels = data_dict['label']
        iw_cache[item] = iw_links
        if 'en' in labels:
            en_labels[item] = labels['en']
        else:
            en_labels[item] = ''

    lang_wiki = lang.replace("-", "_") + 'wiki'
    if lang_wiki not in iw_links:
        return ''
    entry = iw_links[lang_wiki]
    # Entries are either plain titles (as stored by GetManyIws) or mappings
    # that carry the title under 'name'.
    if isinstance(entry, dict) and 'name' in entry:
        return entry['name']
    return entry
def GetInterwikiLength(article):
    """Return the number of characters taken up by interwiki links.

    A ``[[prefix:...]]`` link counts when ``prefix`` is a key of
    ``lang_info``. It contributes the span from its opening brackets up to and
    including the next newline, or up to its closing brackets when no newline
    follows.
    """
    total = 0
    link_start = article.find('[[', 0)
    while link_start > -1:
        link_end = article.find(']]', link_start)
        colon = article.find(':', link_start)
        # only links with a colon before the closing brackets can be interwikis
        if -1 < colon < link_end:
            prefix = article[link_start + 2:colon]
            if prefix in lang_info:
                newline = article.find('\n', link_end)
                if newline > -1:
                    total += (newline - link_start) + 1
                else:
                    total += (link_end - link_start) + 2
        link_start = article.find('[[', link_end)
    return total
def GetCommentLength(article):
    """Return the total length of the text inside HTML comments.

    Only the characters between ``<!--`` and ``-->`` are counted, not the
    delimiters themselves. An unterminated comment contributes 0.
    """
    comment_len = 0
    comment_first = article.find('<!--')
    while comment_first > -1:
        body_start = comment_first + 4
        # Look for the terminator after the opener: starting at the opener
        # lets '<!-->' match its own dashes and yields a negative length.
        comment_last = article.find('-->', body_start)
        if comment_last == -1:
            comment_last = body_start
        comment_len += comment_last - body_start
        comment_first = article.find('<!--', comment_last)
    return comment_len
# Patterns are compiled once instead of on every call. DOTALL lets '.' match
# newlines as well.
_COMMENT_RE = re.compile(r'<!--.*?-->', re.DOTALL)
_REF_RE = re.compile(r'<ref.*?</ref>', re.DOTALL)
_PUNCTUATION_RE = re.compile(r'[.?!,":;]')

# usually English is ~30% these words and non-English at most a few percent
_COMMON_ENGLISH_WORDS = frozenset([
    'the', 'of', 'on', 'a', 'is', 'in', 'his', 'have', 'by', 'but', 'that',
    'to', 'with', 'for', 'an', 'was', 'he', 'which', 'be', 'as', 'it', 'this',
    'first', 'new', 'and', 'she', 'also', 'after', 'at', 'become', 'best',
    'from', 'had', 'great', 'into', 'their', 'these', 'they', 'time', 'who',
    'her', 'not', 'one', 'or', 'made', 'would', 'are', 'between',
])


def IsArticleEnglish(article):
    """Guess whether ``article`` is written mostly in English.

    Comments and ``<ref>...</ref>`` spans are removed, the rest is lower-cased
    and split on whitespace, and a small set of punctuation marks is stripped
    from each word. The article is flagged when more than 20 words, making up
    more than 20% of all words, are common English words; the figures are
    printed in that case.

    Returns:
        True when the article is flagged as English, False otherwise
        (including for an article without any words).
    """
    article = _COMMENT_RE.sub("", article)
    article = _REF_RE.sub("", article)
    # convert article to lower case word list
    word_list = article.lower().split()
    if not word_list:
        return False
    en_word_count = sum(
        1 for word in word_list
        if _PUNCTUATION_RE.sub("", word) in _COMMON_ENGLISH_WORDS
    )
    percent_thats_common_english = 100.0 * en_word_count / len(word_list)
    # flag when over 20% of the words are in the list, which means more than
    # half the article is English
    if percent_thats_common_english > 20 and en_word_count > 20:
        print(("Percent %f, %d out of %d" % (percent_thats_common_english, en_word_count, len(word_list))))
        return True
    return False
def GetArticleType(wt_article_size):
    """Classify an article by its size.

    Returns:
        'absent' for 0, 'stubs' below 10000, 'articles' from 10000 up to but
        excluding 30000, 'longarticles' from 30000 on; None for a negative
        size.
    """
    if wt_article_size == 0:
        return 'absent'
    if wt_article_size >= 30000:
        return 'longarticles'
    if wt_article_size >= 10000:
        return 'articles'
    if wt_article_size > 0:
        return 'stubs'
    return None
def GetScoreForLang(lang):
    """Return the score of ``lang`` computed from its counters in ``lang_info``."""
    counters = lang_info[lang]
    return GetScore(
        counters['absent'],
        counters['stubs'],
        counters['articles'],
        counters['longarticles'],
    )
def GetScore(absent, stubs, articles, longarticles):
    """Return the score, in percent of the maximum, for the given article counts.

    A stub is worth 1 point, an article 4 and a long article 9; the maximum is
    9 points for every counted article (absent ones included). With nothing
    counted the score is 0.
    """
    counted = absent + stubs + articles + longarticles
    best_possible = counted * 9
    if best_possible > 0:
        earned = stubs + (articles * 4) + (longarticles * 9)
        return 100.0 * earned / best_possible
    return 0
def GetLink(subtable,lang,value):
    """Return a wiki link to the ``lang`` section of subpage ``subtable``.

    The section anchor is the language code, a space and the language's local
    name from ``lang_info``; ``value`` is the link text.
    """
    anchor = lang + ' ' + lang_info[lang]['localname']
    return ''.join(['[[/', subtable, '#', anchor, '|', value, ']]'])
def GetTableNumber(count, min_subtable_count, max_subtable_count0, subtable, lang, max_subtable_count40=0):
    """Return ``count`` formatted for a table cell, linked to a subtable when in range.

    The number is linked when it lies between ``min_subtable_count`` and the
    upper limit; an upper limit of -1 means "no limit". The upper limit is
    ``max_subtable_count40`` when that is positive and the language scores
    above 40, otherwise ``max_subtable_count0``.
    """
    text = FormatNumber(count)
    if GetScoreForLang(lang) > 40 and max_subtable_count40 > 0:
        upper_limit = max_subtable_count40
    else:
        upper_limit = max_subtable_count0
    in_range = count >= min_subtable_count and (count <= upper_limit or upper_limit == -1)
    if not in_range:
        return text
    return GetLink(subtable, lang, text)
# number of languages handled so far; incremented by CalculateStatisticsForLang
num_lang = 0


def CalculateStatistics():
    """Run CalculateStatisticsForLang for every language code in ``lang_keys``."""
    for code in lang_keys:
        CalculateStatisticsForLang(code)
def GetWeightForLang(lang):
    """Return the weight configured for ``lang`` in ``lang_info``.

    A language's own 'weight' wins; otherwise the weight of its 'similar_lang'
    is used; with neither entry the weight is 1.0.
    """
    info = lang_info[lang]
    if 'weight' in info:
        return info['weight']
    if 'similar_lang' in info:
        return lang_info[info['similar_lang']]['weight']
    return 1.0
def GetManyArticles(lang, wordlist):
    """Fetch the wikitext of several articles of one Wikipedia in one request.

    Args:
        lang: language code of the Wikipedia to query.
        wordlist: dict mapping item id -> article title on that wiki.

    Returns:
        dict mapping item id -> wikitext. The text is '' for pages without
        content and for items that still could not be fetched after retrying.

    Titles missing from the response, or the whole batch when the request
    fails, are retried: as one smaller batch when only part of the batch is
    missing, one title at a time when all of it is, and a single failing title
    is given up on with an 'Error getting' message.
    """
    pipedword = '|'.join(list(wordlist.values()))
    wiki = pywikibot.Site(lang, 'wikipedia')
    params = {
        'action': 'query',
        'prop': 'revisions',
        'redirects': True,
        'titles': pipedword,
        'rvprop': 'content',
        'rvslots': 'main'
    }
    pageRequest = api.Request(site=wiki, parameters=params)
    item_text = {}
    second_try = {}
    try:
        queryresult = pageRequest.submit()
        redirects = {}
        if 'redirects' in queryresult['query']:
            for redirpair in queryresult['query']['redirects']:
                redirects[redirpair['from']] = redirpair['to']
            pywikibot.output(str(redirects))
        pages = queryresult['query']['pages']
        word_text = {}
        for page in pages.values():
            try:
                word_text[page['title']] = page['revisions'][0]['slots']['main']['*']
            except (KeyError, IndexError, TypeError):
                # page exists in the response but carries no content
                word_text[page['title']] = ''
        for k, v in wordlist.items():
            # the response is keyed by the redirect target, if any
            word = redirects.get(v, v)
            if word in word_text:
                item_text[k] = word_text[word]
            else:
                pywikibot.output(word)
                second_try[k] = word
        pywikibot.output(str(list(item_text.keys())))
    except Exception:
        # Exception rather than a bare except: Ctrl-C must abort the run
        # instead of being turned into item-by-item retries.
        second_try = wordlist

    if len(second_try) > 0:
        if len(second_try) < len(wordlist):
            # partial failure: retry the leftovers together
            item_text.update(GetManyArticles(lang, second_try))
        elif len(second_try) > 1:
            # everything failed: isolate the culprit by asking one at a time
            for k, v in list(second_try.items()):
                one_item = {k: v}
                item_text.update(GetManyArticles(lang, one_item))
        else:
            for k, v in list(second_try.items()):
                item_text[k] = ''
                pywikibot.output('Error getting: ' + k + ' ' + v)
    return item_text
def GetArticleTexts(lang):
    """Return a dict mapping every item of ``item_list`` to its article text on ``lang``.

    Items for which GetArticleInterwikiName returns '' map to ''.  The others
    are fetched through GetManyArticles in batches of 50 titles.
    """
    batch_size = 50
    texts = {}
    pending = {}
    for item in item_list:
        title = GetArticleInterwikiName(item, lang)
        if title == '':
            texts[item] = ''
            continue
        pending[item] = title
        if len(pending) == batch_size:
            texts.update(GetManyArticles(lang, pending))
            pending.clear()
            pywikibot.output(lang + ' ' + str(len(texts)))
    # Flush the last, partially filled batch.
    if pending:
        texts.update(GetManyArticles(lang, pending))
        pending.clear()
    return texts
def _LoadCachedLangStats(lang, temp_path):
    """Fill ``lang_info[lang]`` from the per-language cache file ``temp_path``."""
    with open(temp_path, encoding=textfile_encoding) as temp_file:
        art_count = int(temp_file.readline())
        lang_info[lang]['art_count'] = art_count
        for index in range(art_count):
            entry = {}
            lang_info[lang]['art_' + str(index)] = entry
            entry['item'] = temp_file.readline().strip()
            entry['name'] = temp_file.readline().strip()
            linetext = temp_file.readline()
            try:
                entry['size'] = int(linetext)
            except ValueError:
                # Unparseable size line: report it and count the article as empty.
                print((index, entry['item'], entry['name']))
                entry['size'] = 0
            entry['error'] = temp_file.readline().strip()
    print(('..using previous %s result...' % (lang)))


def _CollectLangStats(lang):
    """Download the articles of ``lang`` and record each one's size or error in ``lang_info``."""
    # The site object itself is unused here.
    # NOTE(review): kept because constructing it may validate the language
    # code early - confirm before removing.
    wiki = pywikibot.Site(lang, 'wikipedia')
    textdict = GetArticleTexts(lang)
    word_count = 0
    for item, article in textdict.items():
        word_count += 1
        if word_count > max_words > 0:
            break
        article_size = 0
        error = ''
        try:
            raw_article_size = len(article)
            interwiki_len = GetInterwikiLength(article)
            comment_len = GetCommentLength(article)
            article_size = (raw_article_size - interwiki_len - comment_len)
            if lang not in ('en', 'simple', 'sco') and IsArticleEnglish(article):
                raise TypeError ("Wrong language, [[%s:%s]] has too much untranslated English." % (lang, GetArticleInterwikiName(item, lang)))
            lang_weight = GetWeightForLang(lang)
            print((str(lang).ljust(3), str(word_count).rjust(3), item.ljust(30)), end=' ' )
            print(("%.1f" % (article_size * lang_weight)).rjust(11), str(lang_weight).rjust(5), str(interwiki_len).rjust(9), str(comment_len).rjust(9))
        except KeyboardInterrupt:
            sys.exit(1)
        except Exception as e:
            sys.stderr.write('\n')
            traceback.print_exc()
            sys.stderr.write('\n')
            try:
                error = CookString(str(e))
            except Exception:
                error = "Error."
        # Entries are keyed by the item's position in item_list, not by the
        # iteration order of textdict.
        artKey = 'art_' + str(item_list.index(item))
        lang_info[lang][artKey] = {
            'item': item,
            'name': en_labels[item] if item in en_labels else item,
            'size': article_size,
            'error': error,
        }
        lang_info[lang]['art_count'] = lang_info[lang]['art_count'] + 1


def _SaveCachedLangStats(lang, temp_path):
    """Write the per-article results of ``lang`` to the cache file ``temp_path``."""
    with open(temp_path, 'w', encoding=textfile_encoding) as temp_file:
        temp_file.write(str(lang_info[lang]['art_count']) + '\n')
        for index in range(lang_info[lang]['art_count']):
            entry = lang_info[lang]['art_' + str(index)]
            temp_file.write(entry['item'] + '\n')
            temp_file.write(entry['name'] + '\n')
            temp_file.write(str(entry['size']) + '\n')
            temp_file.write(entry['error'] + '\n')


def CalculateStatisticsForLang(lang):
    """Compute the per-article and summary statistics of one language.

    Resets the counters in ``lang_info[lang]``, then either reloads the
    results from the cache file ``~<lang>_output.txt`` (when it exists) or
    downloads the articles and writes that cache file.  Finally every article
    without an error is counted under its GetArticleType category and added
    to ``total_size``.

    Ordinary errors are printed to stderr and swallowed so that the remaining
    languages are still processed.  KeyboardInterrupt and SystemExit are not
    swallowed, so Ctrl-C really stops the run.
    """
    global num_lang
    num_lang += 1
    print((('=['+lang+' '+str(num_lang)+ '/' + str(len(lang_keys)) + ']').ljust(76,'=')))
    try:
        info = lang_info[lang]
        for counter in ('total_size', 'absent', 'stubs', 'articles', 'longarticles', 'art_count'):
            info[counter] = 0
        temp_path = "~%s_output.txt" % (lang)
        if os.path.isfile(temp_path):
            _LoadCachedLangStats(lang, temp_path)
        else:
            _CollectLangStats(lang)
            _SaveCachedLangStats(lang, temp_path)
        for index in range(info['art_count']):
            entry = info['art_' + str(index)]
            article_size = entry['size']
            wt_article_size = article_size * GetWeightForLang(lang)
            article_type = GetArticleType(wt_article_size)
            if not entry['error']:
                info[article_type] = info[article_type] + 1
                info['total_size'] = info['total_size'] + article_size
    except Exception:
        sys.stderr.write('\n')
        traceback.print_exc()
        sys.stderr.write('\n')
def GetGrowthNumber(lang, score):
    """Return ``score`` minus the previous score of ``lang``.

    Returns None when ``prev_score`` has no entry for ``lang``.
    """
    if lang not in prev_score:
        return None
    return score - prev_score[lang]
def GetGrowth(lang, score):
    """Return the growth of ``lang`` as a signed two-decimal string.

    Languages without a previous score yield "n/a"; a rounded negative zero
    is shown as '+0.00'.
    """
    if lang not in prev_score:
        return "n/a"
    text = "%+2.2f" % round(GetGrowthNumber(lang, score), 2)
    return '+0.00' if text == '-0.00' else text
def GetAverageSize(lang, article_count):
    """Return the weighted mean article size of ``lang`` as an int.

    The mean is ``total_size / article_count`` rounded to an integer (0 when
    ``article_count`` is not positive) and then multiplied by the language
    weight.
    """
    mean_size = 0
    if article_count > 0:
        mean_size = int(round(lang_info[lang]['total_size'] / article_count))
    return int(mean_size * GetWeightForLang(lang))
def GetMedianSize(lang):
    """Return the weighted median size of the non-empty articles of ``lang``.

    Only articles with a size above zero take part; with none the result is 0.
    """
    info = lang_info[lang]
    all_sizes = (info['art_' + str(index)]['size'] for index in range(info['art_count']))
    sizes = sorted(size for size in all_sizes if size > 0)
    if not sizes:
        median_size = 0
    else:
        mid = len(sizes) // 2
        if len(sizes) % 2:
            median_size = sizes[mid]
        else:
            # Even count: average the two middle values.
            median_size = (sizes[mid - 1] + sizes[mid]) / 2
    return int(median_size * GetWeightForLang(lang))
def PrintResults():
    """Print a plain-text results table, best score first, to stdout.

    Sorts ``lang_keys`` in place by descending GetScoreForLang.
    """
    lang_keys.sort(key=GetScoreForLang, reverse=True)
    print('\n')
    print('RESULTS\n----------------------------------------------------------------------')
    # Print the header as one space-separated line (it used to be printed as
    # the repr of a tuple).
    print(' '.join(('Lang:',' AvgSize','Median','Absent',' <10k ','10-30k',' >30k ', 'Score', 'Growth')))
    for lang in lang_keys:
        info = lang_info[lang]
        absent = info['absent']
        stubs = info['stubs']
        articles = info['articles']
        longarticles = info['longarticles']
        article_count = stubs + articles + longarticles
        score = GetScore(absent, stubs, articles, longarticles)
        growth = GetGrowth(lang, score)
        avg_size = GetAverageSize(lang, article_count)
        med_size = GetMedianSize(lang)
        cells = (
            lang.ljust(6),
            str(avg_size).rjust(7),
            str(med_size).rjust(7),
            str(absent).rjust(5),
            str(stubs).rjust(6),
            str(articles).rjust(6),
            str(longarticles).rjust(6),
            ("%6.2f" % score).rjust(6),
            growth.rjust(6),
        )
        print(' '.join(cells))
def GetWikiTableResults(awards):
    """Return the wikitext of the main results table, best score first.

    Sorts ``lang_keys`` in place by descending GetScoreForLang.  Languages
    for which HasAwards(awards, lang) is true get their growth value wrapped
    by GetLink('Growth', ...).  A language with no counted articles at all is
    marked with a dagger and treated as having every article absent (the
    count is taken from ``lang_info['en']['art_count']``).
    """
    lang_keys.sort(key=GetScoreForLang, reverse=True)
    parts = [
        'Last Update: ' + date.today().strftime('%d %b %Y') + '\n\n',
        '{|class="wikitable sortable" border="1" cellpadding="2" cellspacing="0" style="width:100%; background: #f9f9f9; border: 1px solid #aaaaaa; border-collapse: collapse; white-space: nowrap; text-align: center"',
        '\n|-\n',
        u'!width = 45 | No. !! width = 55 | Wiki !! width = 220 | Language !! width = 55 | [[Talk:List of Wikipedias by sample of articles/Archives/2007#Proposed weighting of characters for formula_.28Option.232_using_Babel_text.29|Weight]] !! width = 120 | Mean Article<br>Size !! width = 120 | [[Talk:List_of_Wikipedias_by_sample_of_articles#average_or_median.3F|Median Article<br>Size]] !! width = 80 | [[/Absent Articles|Absent]]<br>(0k) !! width=80| Stubs<br>(< 10k)!! width = 80 | Articles<br>(10-30k) !! width = 80 | Long Art.<br>(> 30k) !! width = 80 | [[Talk:List of Wikipedias by sample of articles/Archives/2008#Other possibility of maximum score|Score]]',
        '!! width = 50 | [[Talk:List of Wikipedias by sample of articles/Archives/2008#Script_extension|Growth]]',
        '\n|-\n',
    ]
    dagger = '†'
    for rank, lang in enumerate(lang_keys, 1):
        info = lang_info[lang]
        absent = info['absent']
        stubs = info['stubs']
        articles = info['articles']
        longarticles = info['longarticles']
        article_count = stubs + articles + longarticles
        if absent + article_count == 0:
            lang_footnote = dagger
            absent = lang_info['en']['art_count']
        else:
            lang_footnote = ''
        parts.append('|' + str(rank) + '\n')
        parts.append('| [[:' + lang + ':|' + lang + ']]' + lang_footnote + '\n')
        parts.append('| style = "text-align: left" | [[:w:' + info['name'] + ' language|' + info['localname'] + ']]\n')
        # '**' marks a weight borrowed from a similar language, '*' the default weight.
        if 'weight' in info:
            weight = str(info['weight'])
        elif 'similar_lang' in info:
            weight = str(lang_info[info['similar_lang']]['weight']) + '**'
        else:
            weight = '1.0*'
        score = GetScore(absent, stubs, articles, longarticles)
        growth = GetGrowth(lang, score)
        avg_size = GetAverageSize(lang, article_count)
        med_size = GetMedianSize(lang)
        if HasAwards(awards, lang):
            growth = GetLink('Growth',lang, growth)
        parts.append('| ' + weight + '\n')
        parts.append('| ' + GetTableNumber(avg_size, 1, -1,'Neglected', lang) + '\n')
        parts.append('| ' + FormatNumber(med_size) + '\n')
        parts.append('| ' + GetTableNumber(absent, 1, 250,'Absent Articles', lang) + '\n')
        parts.append('| ' + GetTableNumber(stubs, 1, 100,'Stubs', lang, 250) + '\n')
        parts.append('| ' + GetTableNumber(articles, 1, 100,'Articles', lang, 250) + '\n')
        parts.append('| ' + GetTableNumber(longarticles, 1, 100,'Long Articles', lang) + '\n')
        #color code score
        if score >= 100.00:
            background, foreground = color10000, 'ffffff'
        elif score >= 80.00:
            background, foreground = color8000, 'ffffff'
        elif score >= 60.00:
            background, foreground = color6000, 'ffffff'
        elif score >= 40.00:
            background, foreground = color4000, 'ffffff'
        elif score >= 30.00:
            background, foreground = color3000, '000000'
        elif score >= 20.00:
            background, foreground = color2000, '000000'
        elif score >= 10.00:
            background, foreground = color1000, '000000'
        elif score >= 5.00:
            background, foreground = color500, '000000'
        elif score >= 1.00:
            background, foreground = color100, '000000'
        else:
            background, foreground = color0, '000000'
        # The literal backslash before ';' is part of the emitted text and is
        # written as an explicit '\\' (the bare "\;" is an invalid escape).
        # NOTE(review): the fixed offsets in SavePreviousScore appear to rely
        # on this exact cell layout - confirm before changing it.
        color = '|style = "background: #' + background + '\\; color:#' + foreground + '"'
        parts.append(color + '| ' + ("%.2f" % score) + '\n')
        parts.append('| ' + growth + '\n')
        parts.append('|-\n')
    table = ''.join(parts)
    # Turn the trailing '|-\n' row separator into the closing '|}'.
    table = table[:-2] + '}'
    return table
def GetWikiTableArticles(article_type, min_articles, max_articles_0, max_articles_40=0):
    """Return wikitext listing, per language, the articles of one category.

    A language's section is included only when its number of matching
    articles lies within ``min_articles``..max (inclusive).  The maximum is
    ``max_articles_0``, or ``max_articles_40`` for languages scoring above 40
    when that argument is positive.  Sorts ``lang_keys`` in place.
    """
    lang_keys.sort()
    sections = []
    for lang in lang_keys:
        if GetScoreForLang(lang) > 40 and max_articles_40 > 0:
            max_articles = max_articles_40
        else:
            max_articles = max_articles_0
        info = lang_info[lang]
        lines = ['==='+lang+' [[:w:' + info['name'] + ' language|' + info['localname'] + ']]===\n']
        for index in range(info['art_count']):
            artKey = 'art_' + str(index)
            if GetArticleType(GetArticleSize(lang, artKey)) == article_type:
                entry = info[artKey]
                lines.append('#[[d:' + entry['item'] + '|' + entry['name'] + ']] ' + entry['error'] + '\n')
        # The first element is the heading; the rest are matching articles.
        count = len(lines) - 1
        if min_articles <= count <= max_articles:
            sections.extend(lines)
    return ''.join(sections)
def GetArticleName(lang, artKey):
    """Return the stored name of article ``artKey`` for ``lang``, or 0 if it has no entry."""
    if artKey not in lang_info[lang]:
        return 0
    return lang_info[lang][artKey]['name']
def GetArticleSize(lang, artKey):
    """Return the weighted size of article ``artKey`` in ``lang``.

    The result is 0 when the article has no entry or its entry carries an
    error message.
    """
    info = lang_info[lang]
    if artKey not in info or info[artKey]['error']:
        return 0
    return info[artKey]['size'] * GetWeightForLang(lang)
def GetEdgeFactor(lang, artKey):
    """Return a bonus for articles that are close below a size threshold.

    * size 0 (absent or errored article): 1
    * 7000 < size < 10000: (size - 7000) / 1000
    * 24000 < size < 30000: (size - 24000) / 1000
    * anything else: 0

    The first range used to read ``7000 < size < 1000``, which can never be
    true, so articles approaching the 10000 threshold never got their bonus.
    """
    size = GetArticleSize(lang, artKey)
    if size == 0:
        return 1
    if 7000 < size < 10000:
        return (size - 7000) / 1000
    if 24000 < size < 30000:
        return (size - 24000) / 1000
    return 0
def GetRuntFactor(lang, artKey):
    """Return 4 when ``artKey`` is the smallest non-empty article of ``lang``, else 0.

    An article of size 0 always yields 0.  The other articles checked are
    'art_0' .. 'art_<n-1>' with n taken from ``lang_info['en']['art_count']``.
    """
    size = GetArticleSize(lang, artKey)
    if not size > 0:
        return 0
    for index in range(lang_info['en']['art_count']):
        other_key = 'art_' + str(index)
        if other_key == artKey:
            continue
        if 0 < GetArticleSize(lang, other_key) < size:
            return 0 #you are not the runt
    return 4
def GetArticlePoints(lang, artKey):
    """Return the points of one article: 0 (empty), 1, 4 or 9 by weighted size.

    * 0 < size < 10000: 1
    * 10000 <= size < 30000: 4
    * size >= 30000: 9

    Sizes of exactly 10000 or 30000 used to match no branch and scored 0, as
    if the article were absent.
    NOTE(review): the boundaries are assumed to match GetArticleType - confirm.
    """
    size = GetArticleSize(lang, artKey)
    if size >= 30000:
        return 9
    if size >= 10000:
        return 4
    if size > 0:
        return 1
    return 0
def GetAverageArticlePoints(artKey):
    """Return the mean of GetArticlePoints for ``artKey`` over all of ``lang_keys``."""
    total = 0
    for lang in lang_keys:
        total += GetArticlePoints(lang, artKey)
    return float(total) / len(lang_keys)
def GetAverageArticleSize(artKey):
    """Return the mean weighted size of ``artKey`` over all of ``lang_keys``, truncated to int."""
    total = 0
    for lang in lang_keys:
        total += GetArticleSize(lang, artKey)
    return int(float(total) / len(lang_keys))
def GetNeglectForArticle(lang, artInfo):
    """Return the neglect score of the article ``artInfo['artKey']`` for ``lang``.

    The score is the cross-language average of the article's points minus the
    points it earns in ``lang``, plus its edge factor and runt factor.
    """
    artKey = artInfo['artKey']
    neglect = GetAverageArticlePoints(artKey)
    neglect = neglect - GetArticlePoints(lang, artKey)
    neglect = neglect + GetEdgeFactor(lang, artKey)
    neglect = neglect + GetRuntFactor(lang, artKey)
    return neglect
def GetArticlesSortedByNeglect(lang):
    """Return one info dict per article, most neglected in ``lang`` first.

    Each dict has the keys 'artKey', 'popularity' (GetAverageArticleSize) and
    'neglect' (GetNeglectForArticle).  Ties in neglect are ordered by
    popularity, highest first.  The article count comes from
    ``lang_info['en']['art_count']``; without it the list is empty.
    """
    artInfos = []
    for index in range(lang_info['en'].get('art_count', 0)):
        artKey = 'art_' + str(index)
        artInfo = {'artKey': artKey}
        artInfo['popularity'] = GetAverageArticleSize(artKey)
        artInfo['neglect'] = GetNeglectForArticle(lang, artInfo)
        artInfos.append(artInfo)
    return sorted(artInfos, key=lambda x: (x['neglect'], x['popularity']), reverse=True)
def GetLargestArticles(artKey, maxLangs):
    """Return wiki links to the ``maxLangs`` largest versions of one article.

    All languages in ``lang_info`` are ranked by the weighted size of
    ``artKey``; the links are joined with ' -- '.
    """
    ranked = sorted(lang_info.keys(), key=lambda lang: GetArticleSize(lang, artKey), reverse=True)
    item = lang_info['en'][artKey]['item']
    links = [
        '[['+lang+':'+GetArticleInterwikiName(item, lang)+'|'+lang+':'+FormatNumber(GetArticleSize(lang, artKey))+']]'
        for lang in ranked[0:maxLangs]
    ]
    return ' -- '.join(links)
def GetArticleTypeCount(artKey,points):
    """Return how many languages in ``lang_keys`` give ``artKey`` exactly ``points`` points."""
    return sum(1 for lang in lang_keys if GetArticlePoints(lang, artKey) == points)
def GetNeglectedArticles(lang, max_articles):
    """Return a wikitext numbered list of the most neglected articles of ``lang``.

    Articles come in GetArticlesSortedByNeglect order; articles without an
    entry for ``lang`` are skipped, and the list stops once ``max_articles``
    lines have been written.  A non-empty article also shows its weighted
    size, linked to the local article when an interwiki name is known.
    """
    lines = []
    for artInfo in GetArticlesSortedByNeglect(lang):
        artKey = artInfo['artKey']
        if artKey not in lang_info[lang]:
            continue
        entry = lang_info[lang][artKey]
        item = entry['item']
        line = '#[[d:' + item + '|' + entry['name'] + ']]'
        size = int(GetArticleSize(lang, artKey))
        if size > 0:
            iw_name = GetArticleInterwikiName(item, lang)
            if iw_name == '':
                line += ' (' + str(size) + ')'
            else:
                line += ' ([[' + lang + ':' + iw_name + '|' + str(size) + ']])'
        lines.append(line + '\n')
        if len(lines) >= max_articles:
            break
    return ''.join(lines)
def GetPopularArticles(max_articles):
    """Return a wikitable of articles ordered by average size, largest first.

    Every row shows the rank, the average size, the article, the number of
    languages in each points class (0, 1, 4, 9) and the four largest language
    versions.  At most ``max_articles`` rows are written when that value is
    positive; otherwise every article is listed.
    """
    artInfos = GetArticlesSortedByNeglect('en')
    artInfos.sort(key=lambda x: x['popularity'], reverse=True)
    parts = [
        '{|class="wikitable sortable" border="1" cellpadding="2" cellspacing="0" style="width:100%; background: #f9f9f9; border: 1px solid #aaaaaa; border-collapse: collapse; white-space: nowrap; text-align: center"',
        '\n|-\n',
        '!width = 45 | ? !! width = 90 | Average Size !! width = 150 | Article Name !! width = 80 | [[Talk:List of Wikipedias by sample of articles#Article metric|Absent<br>(0k)]] !! width=80| Stubs<br>(< 10k)!! width = 80 | Articles<br>(10-30k) !! width = 80 | Long Art.<br>(> 30k) !! width = 150 | Largest Articles\n',
    ]
    for rank, artInfo in enumerate(artInfos, 1):
        artKey = artInfo['artKey']
        en_entry = lang_info['en'][artKey]
        parts.append('|-\n')
        parts.append('|' + str(rank))
        parts.append('||' + FormatNumber(artInfo['popularity']))
        parts.append('||style="text-align:left"|[[d:' + en_entry['item'] + '|' + en_entry['name'] + ']]')
        for points in (0, 1, 4, 9):
            parts.append('||' + str(GetArticleTypeCount(artKey, points)))
        parts.append('||' + GetLargestArticles(artKey, 4) + '\n')
        if rank >= max_articles > 0:
            break
    parts.append('|}\n')
    return ''.join(parts)
def GetWikiNeglectedArticles():
    """Return the wikitext of the popular and neglected articles page.

    The page starts with the popular-articles table.  Then every language
    with at least one article gets a section holding its ten most neglected
    articles and, when any article carries an error message, an "Errors"
    subsection.  Sorts ``lang_keys`` in place and prints progress to stdout.
    """
    lang_keys.sort()
    print('writing Popular Articles...')
    chunks = ['==Popular Articles==\n', GetPopularArticles(-1)]
    print('writing Neglected Articles...')
    chunks.append('==Neglected Articles==\n')
    for lang in lang_keys:
        print((' '+lang))
        info = lang_info[lang]
        if not info['art_count'] > 0:
            continue
        chunks.append('==='+lang+' [[:w:' + info['name'] + ' language|' + info['localname'] + ']]===\n')
        chunks.append(GetNeglectedArticles(lang, 10))
        error_lines = []
        for index in range(info['art_count']):
            artKey = 'art_' + str(index)
            if info[artKey]['error']:
                # The display name is taken from the 'en' entry of the same key.
                error_lines.append('#[[d:' + info[artKey]['item'] + '|' + lang_info['en'][artKey]['name'] + ']] ' + info[artKey]['error'] + '\n')
        if error_lines:
            chunks.append('====Errors====\n')
            chunks.extend(error_lines)
    return ''.join(chunks)
def SaveWikiTableResults(awards):
    """Write all generated wikitext pages to text files in the working directory.

    ``awards`` is passed through to GetWikiTableResults.  Every file is
    opened with a context manager so that it is closed even when generating
    its content raises.
    """
    reports = (
        ('writing Results ...', 'results.txt', lambda: GetWikiTableResults(awards)),
        ('writing Absent...', '_absent.txt', lambda: GetWikiTableArticles('absent',1, 250)),
        ('writing Stubs...', '_stub.txt', lambda: GetWikiTableArticles('stubs',1, 100, 250)),
        ('writing Articles...', '_articles.txt', lambda: GetWikiTableArticles('articles',1, 100, 250)),
        ('writing Long Articles...', '_longarticles.txt', lambda: GetWikiTableArticles('longarticles',1,100)),
        # The awards page is currently disabled:
        #('writing Awards...', '_growth.txt', lambda: GetWikiAwards(awards)),
        ('writing Suggestions...', '_neglectedarticles.txt', lambda: GetWikiNeglectedArticles()),
    )
    for message, path, make_text in reports:
        print(message)
        with open(path, 'w', encoding=textfile_encoding) as f:
            f.write(make_text())
def CookString(rawString):
    """Return ``rawString`` with backslash escape sequences decoded.

    The text is cut at every ``'`` and ``|``.  Each non-empty piece is
    decoded as the body of a Python string literal, and each empty piece
    becomes a single ``'``.  As a result a ``'`` survives unchanged, a lone
    ``|`` is dropped, and an empty input yields ``'``.

    Raises SyntaxError or ValueError when a piece is not a valid string
    literal body (for example when it ends in a backslash).
    """
    # ast.literal_eval only accepts literals, so text taken from error
    # messages can never be executed as code (eval was used here before).
    import ast

    cooked = []
    for part in rawString.replace("'","||").split("|"):
        if part:
            cooked.append(ast.literal_eval("u'"+part+"'"))
        else:
            cooked.append("'")
    return ''.join(cooked)
def GetGrowths(article):
    """Parse a results page and return a dict of language code -> number.

    For every ``[[:`` link the language code is the text between the marker
    and the character before the next ``|``.  The number is read from the
    text that follows the next ``style = "background:`` cell, up to the end
    of that line.  A value wrapped in a piped link and a trailing `` ‡``
    marker are unwrapped first.  Text that is not a number yields 0.
    Scanning stops at the first row whose markers cannot be found.
    """
    growths = {}
    lang_first = article.find('[[:')
    while lang_first > -1:
        lang_last = article.find('|', lang_first)
        if lang_last == -1:
            break
        score_first = article.find('style = "background:', lang_last)
        if score_first == -1:
            break
        # Start 32 characters in so the search skips the style attribute text.
        score_last = article.find('|', score_first + 32)
        if score_last == -1:
            break
        lang = article[lang_first + 3:lang_last - 1]
        growth_end = article.find('\n', score_last)
        growth_str = article[score_last + 2:growth_end]
        growth_pipe = growth_str.find('|')
        if growth_pipe > -1:
            # Piped link: keep the label, drop the closing ']]'.
            growth_str = growth_str[growth_pipe + 1:-2]
        if ' ‡' in growth_str:
            growth_str = growth_str[0:-2]
        try:
            growth = float(growth_str)
        except ValueError:
            growth = 0
        growths[lang] = growth
        lang_first = article.find('[[:', score_last)
    return growths
def GetLastUpdated(article):
    """Extract the date that follows 'Last Update' as 'DD Mon YYYY'-style text.

    The date text ends at the first '(' (when it comes before the '<br/>') or
    else at the '<br/>'.  It starts after a '-' found in between, or
    otherwise 12 characters after the start of 'Last Update'.  The day is
    zero-padded to two characters and the month cut to three letters.
    Returns None when 'Last Update' or the end marker is missing.
    """
    start = article.find('Last Update')
    if start == -1:
        return None
    paren = article.find('(', start)
    br = article.find('<br/>', start)
    end = paren if -1 < paren < br else br
    if end == -1:
        return None
    hyphen = article.find('-', start, end)
    start = hyphen + 1 if hyphen > -1 else start + 12
    parts = article[start:end].strip().split(' ')
    day = parts[0]
    if len(day) == 1:
        day = '0' + day
    if day[0] == ':':
        # A leading ':' was picked up together with a one-digit day.
        day = '0' + day[1]
    parts[0] = day
    parts[1] = parts[1][0:3]
    return ' '.join(parts)
# Module-level reference to the growth mapping most recently passed to
# CalculatePlacing, which rebinds it on every call.
growthsG = {}
def CalculatePlacing(growths,oldid,update):
    """Rank languages by growth and return their award placings.

    ``growths`` maps a language code to its growth; languages whose growth is
    None are ignored.  The three largest growths get 1st to 3rd place, and
    every further language with a growth above 1 gets an honorable mention.
    Returns a list of dicts with the keys 'lang', 'growth', 'oldid',
    'update', 'placestr' and 'ribbonimg', best first.  Also prints the
    placings and rebinds the module-level ``growthsG`` to ``growths``.
    """
    global growthsG
    growthsG = growths
    # Drop None growths before sorting: comparing None with a number raises
    # TypeError on Python 3.
    ranked = sorted(
        (lang for lang in growths if growths[lang] is not None),
        key=lambda lang: growths[lang],
        reverse=True,
    )
    medals = (
        ('1st Place', 'Article blue.svg'),
        ('2nd Place', 'Article red.svg'),
        ('3rd Place', 'Article yellow.svg'),
    )
    honorable = ('Honorable Mention', 'Article green.svg')
    print(update)
    placing = []
    for lang in ranked:
        growth = growths[lang]
        if len(placing) >= 3 and not growth > 1:
            continue
        placeNo = len(placing) + 1
        placestr, ribbonimg = medals[placeNo - 1] if placeNo <= 3 else honorable
        print((" %d %-3s %+2.2f" % (placeNo, lang, growth)))
        placing.append({'lang':lang,'growth':growth,'oldid':oldid,'update':update,'placestr':placestr,'ribbonimg':ribbonimg})
    return placing
def GetPreviousAwards():
    """Rebuild past award placings from the revision history of the results page on Meta.

    Only revisions from 2009 on whose edit summary contains "updat" or
    "correct" are considered, and a few revision ids are skipped explicitly.
    A revision yields an entry only when its table has an 'en' row, its
    update date differs from the last one accepted, and its 'en' value
    changed (or its revision id is above 3807780).  Returns a dict mapping
    each update date to its CalculatePlacing result.
    """
    skipped_oldids = (2228213, 2264612, 3122655, 3359817)
    article_name = 'List of Wikipedias by sample of articles'
    meta_wiki = pywikibot.Site('meta', 'meta')
    meta_page = pywikibot.Page(meta_wiki, article_name)
    awards = {}
    prevUpdate = ''
    prevGrowth = -999
    for rev in meta_page.revisions():
        oldid, datetime, username, comments = rev.hist_entry()
        if not datetime.year >= 2009:
            continue
        if not ("updat" in comments.lower() or 'correct' in comments.lower()):
            continue
        if oldid in skipped_oldids:
            continue
        article = meta_page.getOldVersion(get_redirect=False, oldid=oldid)
        growths = GetGrowths(article)
        if 'en' not in growths:
            continue
        update = GetLastUpdated(article)
        growth = growths['en']
        if update != prevUpdate and (prevGrowth != growth or oldid > 3807780):
            prevUpdate = update
            prevGrowth = growth
            awards[update] = CalculatePlacing(growths, oldid, update)
    return awards
def HasAwards(awards, lang):
    """Return True when any placing in ``awards`` belongs to ``lang``.

    ``awards`` maps an update date to a list of placing dicts, each with a
    'lang' key.
    """
    return any(
        lang == place['lang']
        for placings in awards.values()
        for place in placings
    )
def GetWikiAwards(awards):
    """Return the wikitext of the improvement awards page.

    Every language in ``lang_keys`` with at least one placing in ``awards``
    gets a section with one table row per placing.  Rows are ordered by place
    name and, within the same place, by descending growth.  The table is
    sortable only when it has more than one row.
    """
    table = """<!--
Only the current awards are given.
The original implementation utilizes edit history of the page “List of Wikipedias
by sample of articles” for regeneration of the entire awards page. Since that
source page has over the years accumulated numerous irregular edits, which
are not easily distinguishable from monthly updates, such an approach has
become impractical.
-->"""
    table += '\n\n==2009-2020 Improvement Awards==\n'
    for lang in lang_keys:
        heading = '==='+lang+' [[:w:' + lang_info[lang]['name'] + ' language|' + lang_info[lang]['localname'] + ']]===\n'
        rows = []
        for update, placings in awards.items():
            for place in placings:
                if lang != place['lang']:
                    continue
                cells = ['|-\n']
                cells.append('|width = 150 | [[Image:%s|20px]] %s\n' % (place['ribbonimg'],place['placestr']))
                if place['oldid'] == -1:
                    # oldid -1 marks the current update: link to the live page.
                    cells.append('|width = 120 align=center| [[:m:List of Wikipedias by sample of articles|%s]]\n' % (place['update']))
                else:
                    cells.append('|width = 120 align=center| <span class="plainlinks">[http://meta.wikimedia.org/w/index.php?title=List_of_Wikipedias_by_sample_of_articles&oldid=%s %s]</span>\n' % (place['oldid'],place['update']))
                cells.append('|width = 80 align=center| %+2.2f\n' % round(place['growth'],2))
                rows.append({'place':place,'mid_section':''.join(cells)})
        if not rows:
            continue
        # Two stable sorts: growth descending first, then place name.
        rows.sort(key=lambda row: row['place']['growth'], reverse=True)
        rows.sort(key=lambda row: row['place']['placestr'])
        if len(rows) > 1:
            opening = '{|class="wikitable sortable" cellpadding="6" cellspacing="0"\n' + '! !! !!\n'
        else:
            opening = '{|class="wikitable" cellpadding="6" cellspacing="0"\n'
        body = ''.join(row['mid_section'] for row in rows)
        table += heading + opening + body + '|}\n'
    return table
def CalculateAwards():
    """Return the award placings of the current run, keyed by today's date.

    The growth of every language in ``lang_keys`` is computed from its
    current counters and ranked with CalculatePlacing (oldid -1).  Awards of
    earlier updates are not included (GetPreviousAwards is not called).
    """
    print("calculating awards...")
    todays = {}
    for lang in lang_keys:
        info = lang_info[lang]
        score = GetScore(info['absent'], info['stubs'], info['articles'], info['longarticles'])
        todays[lang] = GetGrowthNumber(lang, score)
    update = strftime("%d %b %Y")
    return {update: CalculatePlacing(todays, -1, update)}
def SavePreviousScore():
    """Save the scores shown on the live Meta results page to PreviousScores.txt.

    Each row is written as one '<lang> <score>' line and echoed to stdout
    with a running count.  Scanning stops at the first ``[[:`` link that is
    not followed by a complete score cell.  Previously a missing marker made
    ``find()`` return -1, which could restart the scan from the top of the
    page and loop forever.
    """
    article_name = 'List of Wikipedias by sample of articles'
    meta_wiki = pywikibot.Site('meta', 'meta')
    meta_page = pywikibot.Page(meta_wiki, article_name)
    article = meta_page.get(get_redirect=False)
    with open('PreviousScores.txt', 'w', encoding=textfile_encoding) as f:
        count = 0
        lang_first = article.find('[[:')
        while lang_first > -1:
            lang_last = article.find('|', lang_first)
            if lang_last == -1:
                break
            score_first = article.find('style = "background:', lang_last)
            if score_first == -1:
                break
            # Fixed offsets skip the style attribute up to the score text.
            # NOTE(review): they assume the cell layout written by
            # GetWikiTableResults with six-character colour values - confirm.
            score_last = article.find('|', score_first + 48)
            if score_last == -1:
                break
            lang = article[lang_first + 3:lang_last - 1]
            score = article[score_first + 47:score_last - 1]
            f.write(lang + ' ' + score + '\n')
            count += 1
            print((count, lang, score))
            lang_first = article.find('[[:', score_last)
#support dividing up work
# Usage: <script> PART NUMPARTS - keep only every NUMPARTS-th language,
# starting with the PART-th one (PART is 1-based).
if len(sys.argv) == 3:
    part = int(sys.argv[1])-1
    numparts = int(sys.argv[2])
    # enumerate() supplies the position directly; lang_keys.index() cost one
    # list scan per language and returned the first match for duplicates.
    lang_keys = [lang for position, lang in enumerate(lang_keys) if position % numparts == part]
def main():
    """Run the whole update.

    Calls, in order: SavePreviousScore, GetPreviousScores, GetItemList,
    GetIwLinks and CalculateStatistics, then prints the results and writes
    the output files.
    """
    steps = (
        SavePreviousScore,
        GetPreviousScores,
        GetItemList,
        GetIwLinks,
        CalculateStatistics,
    )
    for step in steps:
        step()
    # Awards are disabled (CalculateAwards() is not called), so an empty
    # mapping is handed to the writer.
    awards = {}
    PrintResults()
    SaveWikiTableResults(awards)
if __name__ == '__main__':
    # Call pywikibot.stopme() whether main() finishes or raises.
    try:
        main()
    finally:
        pywikibot.stopme()