# -*- coding: utf-8 -*-
"""Score the meta-wiki 'List of articles every Wikipedia should have/Expanded'
lists across all language Wikipedias (pywikibot-based bot script).

NOTE(review): the original file was mangled by a wiki/markdown export --
import statements were collapsed onto single lines and '#' comment markers
were rendered as numbered-list items.  This header restores them.
"""

import sys
sys.path.append('./core')  # local pywikibot checkout takes precedence

import os
import re
import collections
import traceback
from time import strftime
from datetime import date

import pywikibot
from pywikibot import pagegenerators
from pywikibot.data import api

import simplejson as json

# language information

# Per-language metadata, keyed by Wikipedia language code.
# Fields:
#   name         -- English name of the language
#   localname    -- native name, shown in the output tables
#   weight       -- optional article-size weight relative to English
#                   (scripts differ in information density; TODO confirm
#                   exact semantics against the scoring code)
#   similar_lang -- optional code of a closely related wiki
# BUGFIX: the 'vep' localname was a broken string literal in the mangled
# source ('Vepsän kel\},); restored to "Vepsän kel'".  The stray trailing
# space in the 'ban' name ('Balinese ') was also removed.
lang_info = {
    'en': {'name': 'English', 'localname': 'English', 'weight': 1.000},
    'ab': {'name': 'Abkhazian', 'localname': 'Аҧсуа бызшәа'},
    'ace': {'name': 'Acehnese', 'localname': 'Acèh'},
    'ady': {'name': 'Western Adyghe', 'localname': 'адыгабзэ'},
    'af': {'name': 'Afrikaans', 'localname': 'Afrikaans', 'weight': 1.025},
    'ak': {'name': 'Akan', 'localname': 'Akana'},
    'als': {'name': 'Alemannic', 'localname': 'Alemannisch', 'weight': 1.1},
    'alt': {'name': 'Southern Altai', 'localname': 'алтай тил', 'similar_lang': 'ru'},
    'am': {'name': 'Amharic', 'localname': 'አማርኛ'},
    'an': {'name': 'Aragonese', 'localname': 'Aragonés', 'weight': 1.1},
    'ang': {'name': 'Anglo-Saxon', 'localname': 'Englisc'},
    'ar': {'name': 'Arabic', 'localname': 'العربية', 'weight': 1.408},
    'arc': {'name': 'Assyrian Neo-Aramaic', 'localname': 'ܐܪܡܝܐ'},
    'ary': {'name': 'Moroccan Arabic', 'localname': 'الدارجة', 'similar_lang': 'ar'},
    'arz': {'name': 'Egyptian Arabic', 'localname': 'مصرى (Maṣrī)', 'similar_lang': 'ar'},
    'as': {'name': 'Assamese', 'localname': 'অসমীয়া ভাষা আৰু লিপি'},
    'ast': {'name': 'Asturian', 'localname': 'Asturianu'},
    'atj': {'name': 'Atikamekw', 'localname': 'Atikamekw'},
    'av': {'name': 'Avar', 'localname': 'Авар'},
    'avk': {'name': 'Kotava', 'localname': 'Kotava'},
    'awa': {'name': 'Awadhi', 'localname': 'अवधी'},
    'ay': {'name': 'Aymara', 'localname': 'Aymar'},
    'az': {'name': 'Azeri', 'localname': 'Azərbaycan', 'weight': 1.2},
    'azb': {'name': 'South Azerbaijani', 'localname': 'تۆرکجه'},
    'ba': {'name': 'Bashkir', 'localname': 'Башҡорт', 'similar_lang': 'kk'},
    'ban': {'name': 'Balinese', 'localname': 'Bali'},
    'bar': {'name': 'Bavarian', 'localname': 'Boarisch'},
    'bat-smg': {'name': 'Samogitian', 'localname': 'Žemaitėška'},
    'bcl': {'name': 'Central Bicolano', 'localname': 'Bikol'},
    'be': {'name': 'Belarusian', 'localname': 'Беларуская', 'weight': 0.937},
    'be-x-old': {'name': 'Belarusian (Taraškievica)', 'localname': 'Беларуская (тарашкевіца)', 'weight': 1.4},
    'bg': {'name': 'Bulgarian', 'localname': 'Български', 'weight': 0.935},
    'bh': {'name': 'Bihari', 'localname': 'भोजपुरी'},
    'bi': {'name': 'Bislama', 'localname': 'Bislama'},
    'bjn': {'name': 'Banjar', 'localname': 'Bahasa Banjar'},
    'bm': {'name': 'Bambara', 'localname': 'Bamanankan'},
    'bn': {'name': 'Bengali', 'localname': 'বাংলা'},
    'bo': {'name': 'Tibetan', 'localname': 'བོད་སྐད་'},
    'bpy': {'name': 'Bishnupriya Manipuri', 'localname': 'ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী'},
    'br': {'name': 'Breton', 'localname': 'Brezhoneg'},
    'bs': {'name': 'Bosnian', 'localname': 'Bosanski', 'similar_lang': 'hr'},
    'bug': {'name': 'Buginese', 'localname': 'Basa Ugi'},
    'bxr': {'name': 'Buryat (Russia)', 'localname': 'Буряад'},
    'ca': {'name': 'Catalan', 'localname': 'Català', 'weight': 0.971},
    'cbk-zam': {'name': 'Zamboanga Chavacano', 'localname': 'Chavacano de Zamboanga'},
    'cdo': {'name': 'Min Dong', 'localname': 'Mìng-dĕ̤ng-ngṳ̄'},
    'ce': {'name': 'Chechen', 'localname': 'Нохчийн'},
    'ceb': {'name': 'Cebuano', 'localname': 'Sinugboanong Binisaya', 'weight': 0.873},
    'ch': {'name': 'Chamorro', 'localname': 'Chamoru'},
    'chr': {'name': 'Cherokee', 'localname': 'ᏣᎳᎩ ᎧᏬᏂᎯᏍᏗ'},
    'chy': {'name': 'Cheyenne', 'localname': 'Tsetsêhestâhese'},
    'ckb': {'name': 'Soranî', 'localname': 'Soranî / کوردی'},
    'co': {'name': 'Corsican', 'localname': 'Corsu'},
    'cr': {'name': 'Cree', 'localname': 'Nehiyaw'},
    'crh': {'name': 'Crimean Tatar', 'localname': 'Qırımtatarca'},
    'cs': {'name': 'Czech', 'localname': 'Čeština', 'weight': 1.083},
    'csb': {'name': 'Kashubian', 'localname': 'Kaszëbsczi'},
    'cu': {'name': 'Old Church Slavonic', 'localname': 'Словѣньскъ'},
    'cv': {'name': 'Chuvash', 'localname': 'Чăваш'},
    'cy': {'name': 'Welsh', 'localname': 'Cymraeg', 'weight': 1.050},
    'da': {'name': 'Danish', 'localname': 'Dansk', 'weight': 0.978},
    'de': {'name': 'German', 'localname': 'Deutsch', 'weight': 0.894},
    'din': {'name': 'Dinka', 'localname': 'Thuɔŋjäŋ'},
    'diq': {'name': 'Zazaki', 'localname': 'Zazaki'},
    'dsb': {'name': 'Lower Sorbian', 'localname': 'Dolnoserbšćina'},
    'dty': {'name': 'Doteli', 'localname': 'डोटेली'},
    'dv': {'name': 'Divehi', 'localname': 'ދިވެހިބަސް'},
    'dz': {'name': 'Dzongkha', 'localname': 'རྫོང་ཁ་'},
    'ee': {'name': 'Ewe', 'localname': 'Eʋegbe'},
    'el': {'name': 'Greek', 'localname': 'Ελληνικά', 'weight': 0.857},
    'eml': {'name': 'Emilian-Romagnol', 'localname': 'Emiliàn e rumagnòl'},
    'eo': {'name': 'Esperanto', 'localname': 'Esperanto', 'weight': 1.074},
    'es': {'name': 'Spanish', 'localname': 'Español', 'weight': 0.897},
    'et': {'name': 'Estonian', 'localname': 'Eesti', 'weight': 0.986},
    'eu': {'name': 'Basque', 'localname': 'Euskara', 'weight': 0.979},
    'ext': {'name': 'Extremaduran', 'localname': 'Estremeñu'},
    'fa': {'name': 'Persian', 'localname': 'فارسی', 'weight': 1.167},
    'ff': {'name': 'Fula', 'localname': 'Fulfulde'},
    'fi': {'name': 'Finnish', 'localname': 'Suomi', 'weight': 0.958},
    'fiu-vro': {'name': 'Võro', 'localname': 'Võro'},
    'fj': {'name': 'Fijian', 'localname': 'Na Vosa Vakaviti'},
    'fo': {'name': 'Faroese', 'localname': 'Føroyskt'},
    'fr': {'name': 'French', 'localname': 'Français', 'weight': 0.894},
    'frp': {'name': 'Franco-Provençal/Arpitan', 'localname': 'Arpitan'},
    'frr': {'name': 'North Frisian', 'localname': 'Nordfriisk'},
    'fur': {'name': 'Friulian', 'localname': 'Furlan'},
    'fy': {'name': 'West Frisian', 'localname': 'Frysk'},
    'ga': {'name': 'Irish', 'localname': 'Gaeilge'},
    'gag': {'name': 'Gagauz', 'localname': 'Gagauz'},
    'gan': {'name': 'Gan', 'localname': '贛語', 'similar_lang': 'zh'},
    'gcr': {'name': 'French Guianese Creole', 'localname': 'Kriyòl Gwiyannen'},
    'gd': {'name': 'Scottish Gaelic', 'localname': 'Gàidhlig'},
    'gl': {'name': 'Galician', 'localname': 'Galego', 'weight': 0.947},
    'glk': {'name': 'Gilaki', 'localname': 'گیلکی'},
    'gn': {'name': 'Guarani', 'localname': "Avañe'ẽ"},
    'gom': {'name': 'Konkani', 'localname': 'कोंकणी / Konknni'},
    'gor': {'name': 'Gorontalo', 'localname': 'Hulontalo'},
    'got': {'name': 'Gothic', 'localname': '𐌲𐌿𐍄𐌹𐍃𐌺'},
    'gu': {'name': 'Gujarati', 'localname': 'ગુજરાતી'},
    'gv': {'name': 'Manx', 'localname': 'Gaelg'},
    'ha': {'name': 'Hausa', 'localname': 'هَوُسَ'},
    'hak': {'name': 'Hakka', 'localname': 'Hak-kâ-fa / 客家話'},
    'haw': {'name': 'Hawaiian', 'localname': 'Hawai`i'},
    'he': {'name': 'Hebrew', 'localname': 'עברית', 'weight': 1.466},
    'hi': {'name': 'Hindi', 'localname': 'हिन्दी', 'weight': 0.978},
    'hif': {'name': 'Fiji Hindi', 'localname': 'Fiji Hindi'},
    'hr': {'name': 'Croatian', 'localname': 'Hrvatski', 'weight': 1.078},
    'hsb': {'name': 'Upper Sorbian', 'localname': 'Hornjoserbsce'},
    'ht': {'name': 'Haitian', 'localname': 'Krèyol ayisyen'},
    'hu': {'name': 'Hungarian', 'localname': 'Magyar', 'weight': 0.884},
    'hy': {'name': 'Armenian', 'localname': 'Հայերեն', 'weight': 0.904},
    'hyw': {'name': 'West Armenian', 'localname': 'Արեւմտահայերէն'},
    'ia': {'name': 'Interlingua', 'localname': 'Interlingua', 'weight': 1.0},
    'id': {'name': 'Indonesian', 'localname': 'Bahasa Indonesia', 'weight': 0.851},
    'ie': {'name': 'Interlingue', 'localname': 'Interlingue'},
    'ig': {'name': 'Igbo', 'localname': 'Igbo'},
    'ik': {'name': 'Inupiak', 'localname': 'Iñupiak uqautchit'},
    'ilo': {'name': 'Ilokano', 'localname': 'Ilokano'},
    'inh': {'name': 'Ingush', 'localname': 'Гӏалгӏай'},
    'io': {'name': 'Ido', 'localname': 'Ido'},
    'is': {'name': 'Icelandic', 'localname': 'Íslenska', 'weight': 1.041},
    'it': {'name': 'Italian', 'localname': 'Italiano', 'weight': 0.891},
    'iu': {'name': 'Inuktitut', 'localname': 'ᐃᓄᒃᑎᑐᑦ'},
    'ja': {'name': 'Japanese', 'localname': '日本語', 'weight': 2.551},
    'jam': {'name': 'Patois', 'localname': 'Jamaican Creole English'},
    'jbo': {'name': 'Lojban', 'localname': 'Lojban', 'weight': 1.2},
    'jv': {'name': 'Javanese', 'localname': 'Basa Jawa'},
    'ka': {'name': 'Georgian', 'localname': 'ქართული'},
    'kaa': {'name': 'Karakalpak', 'localname': 'Qaraqalpaq tili'},
    'kab': {'name': 'Kabyle', 'localname': 'Taqbaylit'},
    'kbd': {'name': 'Kabardian', 'localname': 'Aдыгэбзэ'},
    'kbp': {'name': 'Kabiye', 'localname': 'Kabɩyɛ'},
    'kg': {'name': 'Kongo', 'localname': 'KiKongo'},
    'ki': {'name': 'Kikuyu', 'localname': 'Gĩgĩkũyũ'},
    'kk': {'name': 'Kazakh', 'localname': 'Қазақша', 'weight': 1.3},
    'kl': {'name': 'Greenlandic', 'localname': 'Kalaallisut'},
    'km': {'name': 'Khmer', 'localname': 'ភាសាខ្មែរ'},
    'kn': {'name': 'Kannada', 'localname': 'ಕನ್ನಡ', 'weight': 0.999},
    'ko': {'name': 'Korean', 'localname': '한국어', 'weight': 2.252},
    'koi': {'name': 'Komi-Permyak', 'localname': 'Перем Коми (Perem Komi)'},
    'krc': {'name': 'Karachay-Balkar', 'localname': 'Къарачай-Малкъар'},
    'ks': {'name': 'Kashmiri', 'localname': 'कश्मीरी / كشميري'},
    'ksh': {'name': 'Ripuarian', 'localname': 'Ripoarisch'},
    'ku': {'name': 'Kurdish', 'localname': 'Kurdî / كوردی'},
    'kv': {'name': 'Komi', 'localname': 'Коми'},
    'kw': {'name': 'Cornish', 'localname': 'Kernewek'},
    'ky': {'name': 'Kirghiz', 'localname': 'Кыргызча'},
    'la': {'name': 'Latin', 'localname': 'Latina', 'weight': 1.070},
    'lad': {'name': 'Ladino', 'localname': 'Dzhudezmo'},
    'lb': {'name': 'Luxembourgish', 'localname': 'Lëtzebuergesch'},
    'lbe': {'name': 'Lak', 'localname': 'Лакку маз'},
    'lez': {'name': 'Lezgi', 'localname': 'Лезги'},
    'lfn': {'name': 'Lingua Franca Nova', 'localname': 'Lingua Franca Nova'},
    'lg': {'name': 'Luganda', 'localname': 'Luganda'},
    'li': {'name': 'Limburgian', 'localname': 'Limburgs'},
    'lij': {'name': 'Ligurian', 'localname': 'Líguru'},
    'lld': {'name': 'Ladin', 'localname': 'Ladin'},
    'lmo': {'name': 'Lombard', 'localname': 'Lumbaart'},
    'ln': {'name': 'Lingala', 'localname': 'Lingala'},
    'lo': {'name': 'Lao', 'localname': 'ລາວ'},
    'lt': {'name': 'Lithuanian', 'localname': 'Lietuvių', 'weight': 0.977},
    'ltg': {'name': 'Latgalian', 'localname': 'Latgaļu volūda'},
    'lv': {'name': 'Latvian', 'localname': 'Latviešu', 'weight': 1.017},
    'mad': {'name': 'Madurese', 'localname': 'Madhurâ'},
    'mai': {'name': 'Maithili', 'localname': 'मैथिली'},
    'map-bms': {'name': 'Banyumasan', 'localname': 'Basa Banyumasan'},
    'mdf': {'name': 'Moksha', 'localname': 'Мокшень (Mokshanj Kälj)'},
    'mg': {'name': 'Malagasy', 'localname': 'Malagasy'},
    'mhr': {'name': 'Meadow Mari', 'localname': 'Олык Марий'},
    'mi': {'name': 'Maori', 'localname': 'Māori'},
    'min': {'name': 'Minangkabau', 'localname': 'Minangkabau'},
    'mk': {'name': 'Macedonian', 'localname': 'Македонски', 'weight': 0.995},
    'ml': {'name': 'Malayalam', 'localname': 'മലയാളം', 'weight': 1.004},
    'mn': {'name': 'Mongolian', 'localname': 'Монгол'},
    'mni': {'name': 'Meitei', 'localname': 'ꯃꯤꯇꯩ ꯂꯣꯟ'},
    'mnw': {'name': 'Mon', 'localname': 'ဘာသာ မန်'},
    'mr': {'name': 'Marathi', 'localname': 'मराठी'},
    'mrj': {'name': 'Hill Mari', 'localname': 'Кырык Мары (Kyryk Mary)'},
    'ms': {'name': 'Malay', 'localname': 'Bahasa Melayu', 'weight': 0.845},
    'mt': {'name': 'Maltese', 'localname': 'Malti'},
    'mwl': {'name': 'Mirandese', 'localname': 'Mirandés'},
    'my': {'name': 'Burmese', 'localname': 'Burmese'},
    'myv': {'name': 'Erzya', 'localname': 'Эрзянь (Erzjanj Kelj)'},
    'mzn': {'name': 'Mazandarani', 'localname': 'مَزِروني'},
    'na': {'name': 'Nauruan', 'localname': 'dorerin Naoero'},
    'nah': {'name': 'Nahuatl', 'localname': 'Nāhuatl'},
    'nap': {'name': 'Neapolitan', 'localname': 'Nnapulitano'},
    'nds': {'name': 'Low Saxon', 'localname': 'Plattdüütsch'},
    'nds-nl': {'name': 'Dutch Low Saxon', 'localname': 'Nedersaksisch'},
    'ne': {'name': 'Nepali', 'localname': 'नेपाली'},
    'new': {'name': 'Newar / Nepal Bhasa', 'localname': 'नेपाल भाषा'},
    'nia': {'name': 'Nias', 'localname': 'Li Niha'},
    'nl': {'name': 'Dutch', 'localname': 'Nederlands', 'weight': 0.833},
    'nn': {'name': 'Norwegian (Nynorsk)', 'localname': 'Nynorsk', 'similar_lang': 'no'},
    'no': {'name': 'Norwegian (Bokmål)', 'localname': 'Norsk (Bokmål)', 'weight': 1.042},
    'nov': {'name': 'Novial', 'localname': 'Novial'},
    'nqo': {'name': "N'Ko", 'localname': 'ߒߞߏ'},
    'nrm': {'name': 'Norman', 'localname': 'Nouormand/Normaund'},
    'nso': {'name': 'Northern Sotho', 'localname': 'Sesotho sa Leboa'},
    'nv': {'name': 'Navajo', 'localname': 'Diné bizaad'},
    'ny': {'name': 'Chichewa', 'localname': 'Chicheŵa'},
    'oc': {'name': 'Occitan', 'localname': 'Occitan'},
    'olo': {'name': 'Livvi-Karelian', 'localname': 'Livvinkarjala'},
    'om': {'name': 'Oromo', 'localname': 'Oromoo'},
    'or': {'name': 'Oriya', 'localname': 'ଓଡ଼ିଆ'},
    'os': {'name': 'Ossetian', 'localname': 'Иронау'},
    'pa': {'name': 'Punjabi', 'localname': 'ਪੰਜਾਬੀ'},
    'pag': {'name': 'Pangasinan', 'localname': 'Pangasinan'},
    'pam': {'name': 'Kapampangan', 'localname': 'Kapampangan'},
    'pap': {'name': 'Papiamentu', 'localname': 'Papiamentu'},
    'pcd': {'name': 'Picard', 'localname': 'Picard'},
    'pdc': {'name': 'Pennsylvania German', 'localname': 'Deitsch'},
    'pfl': {'name': 'Palatinate German', 'localname': 'Pfälzisch'},
    'pi': {'name': 'Pali', 'localname': 'पाऴि'},
    'pih': {'name': 'Norfolk', 'localname': 'Norfuk'},
    'pl': {'name': 'Polish', 'localname': 'Polski', 'weight': 0.956},
    'pms': {'name': 'Piedmontese', 'localname': 'Piemontèis'},
    'pnb': {'name': 'Western Panjabi', 'localname': 'پنجابی'},
    'pnt': {'name': 'Pontic', 'localname': 'Ποντιακά', 'similar_lang': 'el'},
    'ps': {'name': 'Pashto', 'localname': 'پښتو'},
    'pt': {'name': 'Portuguese', 'localname': 'Português', 'weight': 0.937},
    'qu': {'name': 'Quechua', 'localname': 'Runa Simi'},
    'rm': {'name': 'Romansh', 'localname': 'Rumantsch'},
    'rmy': {'name': 'Romani', 'localname': 'romani - रोमानी'},
    'rn': {'name': 'Kirundi', 'localname': 'Kirundi'},
    'ro': {'name': 'Romanian', 'localname': 'Română', 'weight': 0.894},
    'roa-rup': {'name': 'Aromanian', 'localname': 'Armãneashce'},
    'roa-tara': {'name': 'Tarantino', 'localname': 'Tarandíne'},
    'ru': {'name': 'Russian', 'localname': 'Русский', 'weight': 0.908},
    'rue': {'name': 'Rusyn', 'localname': 'русиньскый язык'},
    'rw': {'name': 'Kinyarwanda', 'localname': 'Kinyarwanda'},
    'sa': {'name': 'Sanskrit', 'localname': 'संस्कृतम्'},
    'sah': {'name': 'Sakha', 'localname': 'Саха тыла (Saxa Tyla)'},
    'sat': {'name': 'Santali', 'localname': 'ᱥᱟᱱᱛᱟᱲᱤ'},
    'sc': {'name': 'Sardinian', 'localname': 'Sardu'},
    'scn': {'name': 'Sicilian', 'localname': 'Sicilianu'},
    'sco': {'name': 'Scots', 'localname': 'Scots'},
    'sd': {'name': 'Sindhi', 'localname': 'سنڌي، سندھی ، सिन्ध'},
    'se': {'name': 'Northern Sami', 'localname': 'Sámegiella'},
    'sg': {'name': 'Sango', 'localname': 'Sängö'},
    'sh': {'name': 'Serbo-Croatian', 'localname': 'Srpskohrvatski / Српскохрватски', 'similar_lang': 'hr'},
    'shn': {'name': 'Shan', 'localname': 'လိၵ်ႈတႆး'},
    'si': {'name': 'Sinhalese', 'localname': 'සිංහල'},
    'simple': {'name': 'Simple English', 'localname': 'Simple English'},
    'sk': {'name': 'Slovak', 'localname': 'Slovenčina', 'weight': 1.054},
    'skr': {'name': 'Saraiki', 'localname': 'سرائیکی'},
    'sl': {'name': 'Slovenian', 'localname': 'Slovenščina', 'weight': 1.026},
    'sm': {'name': 'Samoan', 'localname': 'Gagana Samoa'},
    'smn': {'name': 'Inari Sámi', 'localname': 'anarâškielâ'},
    'sn': {'name': 'Shona', 'localname': 'ChiShona'},
    'so': {'name': 'Somali', 'localname': 'Soomaaliga'},
    'sq': {'name': 'Albanian', 'localname': 'Shqip'},
    'sr': {'name': 'Serbian', 'localname': 'Српски / Srpski', 'weight': 1.121},
    'srn': {'name': 'Sranan', 'localname': 'Sranantongo'},
    'ss': {'name': 'Swati', 'localname': 'SiSwati'},
    'st': {'name': 'Sesotho', 'localname': 'Sesotho'},
    'stq': {'name': 'Saterland Frisian', 'localname': 'Seeltersk'},
    'su': {'name': 'Sundanese', 'localname': 'Basa Sunda'},
    'sv': {'name': 'Swedish', 'localname': 'Svenska', 'weight': 1.004},
    'sw': {'name': 'Swahili', 'localname': 'Kiswahili'},
    'szl': {'name': 'Silesian', 'localname': 'Ślůnski'},
    'szy': {'name': 'Sakizaya', 'localname': 'Sakizaya'},
    'ta': {'name': 'Tamil', 'localname': 'தமிழ்', 'weight': 0.800},
    'tay': {'name': 'Atayal', 'localname': 'Tayal', 'similar_lang': 'zh'},
    'tcy': {'name': 'Tulu', 'localname': 'ತುಳು'},
    'te': {'name': 'Telugu', 'localname': 'తెలుగు'},
    'tet': {'name': 'Tetum', 'localname': 'Tetun'},
    'tg': {'name': 'Tajik', 'localname': 'Тоҷикӣ'},
    'th': {'name': 'Thai', 'localname': 'ไทย', 'weight': 1.143},
    'ti': {'name': 'Tigrinya', 'localname': 'ትግርኛ_ፊደል'},
    'tk': {'name': 'Turkmen', 'localname': 'تركمن / Туркмен'},
    'tl': {'name': 'Tagalog', 'localname': 'Tagalog'},
    'tn': {'name': 'Tswana', 'localname': 'Setswana'},
    'to': {'name': 'Tongan', 'localname': 'faka Tonga'},
    'tpi': {'name': 'Tok Pisin', 'localname': 'Tok Pisin'},
    'tr': {'name': 'Turkish', 'localname': 'Türkçe', 'weight': 1.034},
    'trv': {'name': 'Taroko', 'localname': 'Seediq'},
    'ts': {'name': 'Tsonga', 'localname': 'Xitsonga'},
    'tt': {'name': 'Tatar', 'localname': 'Tatarça / Татарча'},
    'tum': {'name': 'Tumbuka', 'localname': 'ChiTumbuka'},
    'tw': {'name': 'Twi', 'localname': 'Twi'},
    'ty': {'name': 'Tahitian', 'localname': 'Reo Mā`ohi'},
    'tyv': {'name': 'Tuva', 'localname': 'Тыва дыл'},
    'udm': {'name': 'Udmurt', 'localname': 'Удмурт кыл'},
    'ug': {'name': 'Uyghur', 'localname': 'Oyghurque'},
    'uk': {'name': 'Ukrainian', 'localname': 'Українська', 'weight': 0.994},
    'ur': {'name': 'Urdu', 'localname': 'اردو'},
    'uz': {'name': 'Uzbek', 'localname': 'O‘zbek'},
    've': {'name': 'Venda', 'localname': 'TshiVenda'},
    'vec': {'name': 'Venetian', 'localname': 'Vèneto'},
    'vep': {'name': 'Veps', 'localname': "Vepsän kel'"},
    'vi': {'name': 'Vietnamese', 'localname': 'Tiếng Việt', 'weight': 0.827},
    'vls': {'name': 'West Flemish', 'localname': 'West-Vlams'},
    'vo': {'name': 'Volapük', 'localname': 'Volapük'},
    'wa': {'name': 'Walloon', 'localname': 'Walon'},
    'war': {'name': 'Waray-Waray', 'localname': 'Winaray'},
    'wo': {'name': 'Wolof', 'localname': 'Wolof'},
    'wuu': {'name': 'Wu', 'localname': '吴语', 'similar_lang': 'zh'},
    'xal': {'name': 'Kalmyk', 'localname': 'Хальмг келн'},
    'xh': {'name': 'Xhosa', 'localname': 'IsiXhosa'},
    'xmf': {'name': 'Mingrelian', 'localname': 'მარგალური'},
    'yi': {'name': 'Yiddish', 'localname': 'ייִדיש'},
    'yo': {'name': 'Yoruba', 'localname': 'Yorùbá'},
    'za': {'name': 'Zhuang', 'localname': 'Sawcuengh'},
    'zea': {'name': 'Zealandic', 'localname': 'Zeêuws'},
    'zh': {'name': 'Chinese', 'localname': '中文', 'weight': 3.786},
    'zh-classical': {'name': 'Classical Chinese', 'localname': '古文 / 文言文', 'similar_lang': 'zh'},
    'zh-min-nan': {'name': 'Min Nan', 'localname': 'Bân-lâm-gú', 'weight': 1.2},
    'zh-yue': {'name': 'Cantonese', 'localname': '粵語', 'similar_lang': 'zh'},
    'zu': {'name': 'Zulu', 'localname': 'IsiZulu'}
    }
# closed wikis
           # 'aa': {'name':'Afar',          'localname':'Afar'},
           # 'cho':{'name':'Choctaw',       'localname':'Chahta Anumpa'},
           # 'ho': {'name':'Hiri Motu',     'localname':'Hiri Motu'},
           # 'hz': {'name':'Herero',        'localname':'Otsiherero'},
           # 'ii': {'name':'Sichuan Yi',    'localname':'ꆇꉙ'},
           # 'kj': {'name':'Kuanyama',      'localname':'Kuanyama'},
           # 'kr': {'name':'Kanuri',        'localname':' Kanuri'},
           # 'lrc':{'name':'Northern Luri', 'localname':'لۊری شومالی'},
           # 'mh': {'name':'Marshallese',   'localname':'Kajin M̧ajeļ'},
           # 'mus':{'name':'Muscogee',      'localname':'Muskogee'},
           # 'ng': {'name':'Ndonga',        'localname':'Oshiwambo'},
  1. languages to process

lang_keys = list(lang_info.keys()) lang_keys.sort() textfile_encoding = 'utf-8'

  1. optimize by caching stuff

iw_cache = {} en_labels = {} item_list = [] disambigs = []

  1. debug

max_words = -1

prev_score = {}

  1. score colors

color10000 = 'BF5FFF' color4000 = 'FF7F00' color2000 = 'FFBE00' color1000 = 'FFFF00' color500 = 'BEFF00' color250 = '40FF00' color100 = '00FF7D' color0 = 'EFEFEF'

item_list_path = "ExpandedItemList.txt"

def ListOfArticlesExpanded():
    """Return the meta-wiki Page objects making up the expanded
    'List of articles every Wikipedia should have' (one per topic section)."""
    article_prefix = 'List of articles every Wikipedia should have/Expanded/'
    article_suffixes = ['People','History','Geography','Arts','Philosophy and religion','Anthropology, psychology and everyday life','Society and social sciences','Biology and health sciences','Physical sciences','Technology','Mathematics']
    meta_wiki = pywikibot.Site('meta', 'meta')
    # One Page per topic sub-list, in the canonical order above.
    return [pywikibot.Page(meta_wiki, article_prefix + suffix)
            for suffix in article_suffixes]

def LoadItemList():
    """Extract every ``[[d:Qxxx|...]]`` wikidata link from the meta-wiki list
    pages and write the item ids, one per line, to ``item_list_path``.

    Does nothing if the file already exists (cached from a previous run).
    Side effects: writes the file and prints progress via pywikibot.output.
    """
    item_path = item_list_path
    if os.path.isfile(item_path):
        return  # already built on a previous run
    count = 0
    grand_total = 0
    # BUGFIX: the mangled source had "catName = .center(...)" -- the ''
    # literal was stripped.  ''.center(n, '-') is just n dashes, written
    # below as '-' * len(cat) (a section-header underline for the console).
    with open(item_path, 'w', encoding=textfile_encoding) as f:
        for meta_page in ListOfArticlesExpanded():
            article = meta_page.get(get_redirect=False)
            name_last = 0
            name_first = article.find('[[d:', name_last)
            while name_first > -1:
                name_mid = article.find('|', name_first)
                # A '\n== ' between the previous link and this one means we
                # crossed into a new section; echo its heading.
                cat_start = article.rfind('\n== ', name_last, name_first)
                if cat_start > -1:
                    cat_end = article.find('==', cat_start + 3, name_first)
                    if cat_end > -1:
                        cat = article[cat_start + 3:cat_end]
                        catName = '-' * len(cat)
                        pywikibot.output('\n%s' % cat)
                        pywikibot.output('\n%s' % catName)
                        count = 0  # per-section counter restarts
                name_last = article.find(']]', name_first)
                if name_last > name_mid:
                    name_last = name_mid  # stop at the '|' of a piped link
                article_item = article[name_first + 4:name_last]
                f.write(article_item)
                f.write('\n')
                count += 1
                grand_total += 1
                pywikibot.output('%d %s' % (count, article_item))
                name_first = article.find('[[d:', name_last)
    pywikibot.output('\nGRAND TOTAL\n-----------\n%d articles' % (grand_total))

def GetItemList():
    """Fill the global item_list from the cached item file, logging any
    duplicate entries to errorlog.txt instead of adding them twice."""
    LoadItemList()  # make sure the cache file exists
    with open(item_list_path, 'r', encoding=textfile_encoding) as item_file:
        for line in item_file:
            entry = line[:-1]  # strip the trailing newline
            if entry not in item_list:
                item_list.append(entry)
                continue
            errortext = entry + " twice in list\n"
            pywikibot.output(errortext)
            with open("errorlog.txt", "a", encoding=textfile_encoding) as errorlog:
                errorlog.write(errortext)

def GetManyIws(itemlist):
    """Fetch sitelinks, English labels and disambiguation status for a batch
    of wikidata item ids with one API query.

    itemlist: iterable of item ids (e.g. 'Q42').
    Returns a dict {canonical item id: English label (or the id itself)}.
    Side effects: populates iw_cache / en_labels / disambigs, amends the
    global item_list for redirected or missing items, and appends every
    problem found to errorlog.txt.
    """
    pipedword = '|'.join(itemlist)
    wiki = pywikibot.Site('wikidata', 'wikidata')
    params = {
        'action'    :'query',
        'prop'      :'revisions',
        'redirects' :True,
        'titles'    :pipedword,
        'rvprop'    :'content',
        'rvslots'   :'main'
        }
    pageRequest = api.Request(parameters=params, site=wiki)
    queryresult = pageRequest.submit()
    pages = queryresult['query']['pages']
    word_text = {}
    newitemlist = set()
    for v in pages.values():
        item = v['title']
        newitemlist.add(item)
        if item not in itemlist:
            # Result title not among the requested ids: it is a redirect
            # target, so track it in place of the id we asked for.
            print('not in ', item)
            item_list.append(item)
            errortext = item + " is redirected to.\n"
            with open("errorlog.txt", "a", encoding=textfile_encoding) as errorlog:
                errorlog.write(errortext)
        try:
            pagetext = v['revisions'][0]['slots']['main']['*']
        except (KeyError, IndexError):
            errortext = item + " has no wikidata item\n"
            if item in item_list:
                item_list.remove(item)
            pywikibot.output(errortext)
            with open("errorlog.txt", "a", encoding=textfile_encoding) as errorlog:
                errorlog.write(errortext)
            # BUGFIX: the original fell through here and ran json.loads() on
            # an undefined (or stale, previous-iteration) pagetext.
            continue
        data_dict = json.loads(pagetext)
        try:
            iw_link_info = data_dict['sitelinks']
        except KeyError:
            iw_link_info = data_dict['links']  # legacy item JSON layout
        iw_links = {}
        print(item)
        try:
            for linkkey, linkvalue in list(iw_link_info.items()):
                iw_links[linkkey] = linkvalue['title']
        except (KeyError, TypeError, AttributeError):
            errortext = item + " has no links\n"
            if item in item_list:
                item_list.remove(item)
            pywikibot.output(errortext)
            with open("errorlog.txt", "a", encoding=textfile_encoding) as errorlog:
                errorlog.write(errortext)
        try:
            labels = data_dict['labels']
            if 'en' in labels:
                en_labels[item] = labels['en']['value']
            else:
                en_labels[item] = item
        except KeyError:
            labels = data_dict['label']  # legacy item JSON layout
            if 'en' in labels:
                en_labels[item] = labels['en']
            else:
                en_labels[item] = item
        iw_cache[item] = iw_links
        word_text[v['title']] = en_labels.get(item, item)
        # Q4167410 = Wikimedia disambiguation page; .get() guards against
        # items with no claims and against novalue/somevalue snaks.
        for claim in data_dict.get('claims', {}).get('P31', []):
            datavalue = claim['mainsnak'].get('datavalue')
            if datavalue and datavalue['value'].get('numeric-id') == 4167410:
                disambigs.append(item)
    pywikibot.output(str(list(word_text.values())))
    # Requested ids that never came back were normalized away by the API:
    # they are redirect sources, so drop them from the working list.
    redir_items = [x for x in itemlist if x not in newitemlist]
    for redir_item in redir_items:
        item_list.remove(redir_item)
        errortext = redir_item + " is redirected from.\n"
        with open("errorlog.txt", "a", encoding=textfile_encoding) as errorlog:
            errorlog.write(errortext)
    return word_text

def GetIwLinks():
    """Populate the global iw_cache and en_labels.

    Loads them from the cached JSON files when present (returning None);
    otherwise fetches everything from wikidata in batches, writes the two
    cache files, and returns the string "cleared".
    """
    iw_link_path = "IwLinks.json"
    en_label_path = "Labels.json"
    global iw_cache
    global en_labels
    if os.path.isfile(iw_link_path):
        # Fast path: reuse the caches written by a previous run.
        with open(iw_link_path, 'r', encoding=textfile_encoding) as iwf:
            iw_cache = json.load(iwf)
        with open(en_label_path, 'r', encoding=textfile_encoding) as enf:
            en_labels = json.load(enf)
        return
    textdict = {}
    batch = []
    with open(item_list_path, encoding=textfile_encoding) as item_file:
        for line in item_file:
            batch.append(line[:-1])
            if len(batch) == 10: # 50
                textdict.update(GetManyIws(batch))
                batch = []
    if batch:
        textdict.update(GetManyIws(batch))
    with open('IwLinks.json', 'w', encoding=textfile_encoding) as iwf:
        json.dump(iw_cache, iwf)
    with open('Labels.json', 'w', encoding=textfile_encoding) as enf:
        json.dump(en_labels, enf)
    return "cleared"
# format with separators

def FormatNumber(s):
    """Return int(s) as a string with comma thousands separators.

    Accepts anything int() accepts (int, float, numeric string).
    Fixes two defects of the original hand-rolled loop: the final join lost
    its '' literal in the mangled source (a syntax error), and a negative
    number with a multiple-of-three digit count grew a comma right after the
    sign (e.g. '-,123,456').  format() handles both correctly.
    """
    return format(int(s), ',')

def GetPreviousScores():
    """Load PreviousScores.txt ("<lang> <score>" per line) into the global
    prev_score dict.  Silently a no-op when the file does not exist."""
    score_path = "PreviousScores.txt"
    if not os.path.isfile(score_path):
        return
    with open(score_path, encoding=textfile_encoding) as score_file:
        for row in score_file:
            fields = row.split()
            prev_score[fields[0]] = float(fields[1])

def GetArticle(item, wiki, lang):
    """Return the wikitext of *item*'s article on *wiki* in language *lang*,
    or '' when the item has no sitelink there.

    Redirects are followed by page.get(get_redirect=True), which is why the
    old manual #REDIRECT-parsing block was removed.
    """
    word = GetArticleInterwikiName(item, lang)
    if len(word) > 0:
        page = pywikibot.Page(wiki, word)
        article = page.get(get_redirect=True)
    else:
        # BUGFIX: the mangled source dropped the '' literal here, leaving
        # "article = " (a syntax error).
        article = ''
    return article

def GetArticleInterwikiName(item, lang):
    """Return the article title linked from wikidata *item* on the *lang*
    Wikipedia, or '' when the item or the sitelink is missing.

    Uses iw_cache when possible; on a cache miss the item is fetched from
    wikidata and cached (en_labels is filled as a side effect).
    BUGFIX: the mangled source dropped three '' literals (both bare
    ``return`` values and the en_labels fallback), restored here -- callers
    such as GetArticle test len() of the result, so '' is required.
    """
    if item in iw_cache:
        iw_links = iw_cache[item]
    else:
        wikidata = pywikibot.Site('wikidata', 'wikidata')
        try:
            # NOTE(review): DataPage is a legacy compat-layer API -- confirm
            # the installed pywikibot still provides it.
            datapage = pywikibot.DataPage(wikidata, item)
            data_dict = datapage.get()
        except Exception:
            print(('Where is ' + item))
            return ''
        iw_links = data_dict['links']
        labels = data_dict['label']
        iw_cache[item] = iw_links
        if 'en' in labels:
            en_labels[item] = labels['en']
        else:
            en_labels[item] = ''
    lang_wiki = lang.replace("-", "_") + 'wiki'
    if lang_wiki in iw_links:
        try:
            # newer cache entries map straight to the title string, older
            # ones to a {'name': title} dict
            local_name = iw_links[lang_wiki]['name']
        except (TypeError, KeyError):
            local_name = iw_links[lang_wiki]
        return local_name
    else:
        return ''

def GetInterwikiLength(article):

   """Return the total number of characters occupied by interwiki links
   ([[xx:Title]] where xx is a code present in lang_info), so they can be
   subtracted from the raw article size.

   When a newline follows the link, the count runs through that newline;
   otherwise it covers just the [[...]] span.
   """
   #calculate len of all interwiki links
   interwiki_len   = 0
   interwiki_last  = 0
   interwiki_colon = 0
   interwiki_nl    = 0
   # scan every '[[' occurrence left to right
   interwiki_first = article.find('[[', interwiki_last)
   while interwiki_first > -1:
       interwiki_last  = article.find(']]', interwiki_first)
       interwiki_colon = article.find(':',  interwiki_first)
       # a ':' inside the brackets marks a potential language prefix
       if interwiki_colon > -1 and interwiki_colon < interwiki_last:
          curlang = article[interwiki_first+2:interwiki_colon]
          if curlang in lang_info:
              # prefer counting through the end of the line (+1 for '\n');
              # otherwise count the bracketed span itself (+2 for ']]')
              interwiki_nl = article.find('\n', interwiki_last)
              if interwiki_nl > -1:
                 interwiki_len += (interwiki_nl - interwiki_first) + 1
              else:
                 interwiki_len += (interwiki_last - interwiki_first) + 2
       interwiki_first = article.find('[[', interwiki_last)
   return interwiki_len

def GetCommentLength(article):
    """Return the number of characters inside HTML comments
    (<!-- ... -->) in *article*, excluding the comment markers.

    NOTE(review): this function was reconstructed — the '<!--'/'-->'
    literals were stripped from the mangled source; confirm against
    version history.
    """
    comment_len   = 0
    comment_last  = 0
    comment_first = article.find('<!--', comment_last)
    while comment_first > -1:
        comment_last = article.find('-->', comment_first)
        if comment_last == -1:
            # unterminated comment: treat it as empty
            comment_last = comment_first + 4
        # '<!--' is 4 chars, so this counts the comment body only
        comment_len += (comment_last - comment_first) - 4
        comment_first = article.find('<!--', comment_last)
    return comment_len

def IsArticleEnglish(article):
    """Heuristic: True when common English stop-words form more than 20%
    of the article's words (and more than 20 occurrences total) —
    i.e. the page is probably untranslated English.

    NOTE(review): the 'def' header and comment-stripping regex were
    reconstructed from the mangled source.
    """
    # remove comments
    comments = re.compile(r'<!--(.|\n|\r)*?-->')
    article = comments.sub("", article)
    #remove references
    refs = re.compile(r'<ref(.|\n|\r)*?</ref>')
    article = refs.sub("", article)
    # convert article to lower case word list
    word_list = article.lower().split()
    if len(word_list) == 0:
        return False
    # create dictionary of word:frequency pairs
    freq_dic = {}
    # punctuation marks to be removed
    punctuation = re.compile(r'[.?!,":;]')
    for word in word_list:
        word = punctuation.sub("", word)
        if word in freq_dic:
            freq_dic[word] += 1
        else:
            freq_dic[word] = 1
    # usually English is ~30% these words and non-English at most a few percent
    # NOTE(review): 'fromare' looks like a merge of 'from' and 'are' (both
    # also listed separately); kept byte-for-byte since it is harmless.
    common_english_words = ['the','of','on','a','is','in','his','have','by','but','that','to','with','for',
                            'an','fromare','was','he','which','be','as','it','this','first', 'new', 'and',
                            'she','also','after','at','become','best','from','had','great', 'into','their',
                            'these','they','time','who','her','not','one','or', 'made', 'would','are','between']
    en_word_count = 0
    for word in common_english_words:
        if word in freq_dic:
            en_word_count += freq_dic[word]
    percent_thats_common_english = 100.0 * en_word_count / len(word_list)
    # flag if 20% or more in the list which means more than half the article is English
    if percent_thats_common_english > 20 and  en_word_count > 20:
        print("Percent %f, %d out of %d" % (percent_thats_common_english, en_word_count, len(word_list)))
        return True
    return False

def GetArticleType(wt_article_size):
    """Map a weighted article size (characters) to its bucket name:
    'absent', 'stubs' (<8k), 'articles' (8-16k) or 'longarticles'."""
    if wt_article_size < 0:
        pywikibot.output('negative size!')
        return 'stubs'
    if wt_article_size == 0:
        return 'absent'
    if wt_article_size < 8000:
        return 'stubs'
    if wt_article_size < 16000:
        return 'articles'
    return 'longarticles'

def GetScoreForLang(lang):
    """Normalized score for *lang*, computed from its cached bucket counts."""
    info = lang_info[lang]
    return GetScore(info['absent'], info['stubs'],
                    info['articles'], info['longarticles'])

def GetScore(absent, stubs, articles, longarticles):
    """Normalized 0-100 quality score.

    Points: stub=2, article=3, long article=4; the total is divided by
    the maximum possible (every item worth 4).  Returns 0 for an empty
    sample.
    """
    total_items = absent + stubs + articles + longarticles
    if total_items <= 0:
        return 0
    raw = 2 * stubs + 3 * articles + 4 * longarticles
    return 100.0 * raw / (total_items * 4)

def GetLink(subtable, lang, value):
    """Wikitext link showing *value*, targeting the *lang* section of the
    given subtable subpage."""
    anchor = lang + ' ' + lang_info[lang]['localname']
    return '[[/' + subtable + '#' + anchor + '|' + value + ']]'

def GetTableNumber(count, min_subtable_count, max_subtable_count0, subtable, lang, max_subtable_count40=0):
    """Format *count*; when within the configured range, link it into the
    subtable (a higher cap applies to wikis scoring above 40; a cap of -1
    means unlimited)."""
    value = FormatNumber(count)
    limit = max_subtable_count0
    if max_subtable_count40 > 0 and GetScoreForLang(lang) > 40:
        limit = max_subtable_count40
    in_range = count >= min_subtable_count and (limit == -1 or count <= limit)
    return GetLink(subtable, lang, value) if in_range else value

# number of languages processed so far (progress display in
# CalculateStatisticsForLang); was joined onto the def line in the
# mangled source, which is a syntax error
num_lang = 0

def CalculateStatistics():
    """Compute statistics for every language in lang_keys."""
    for lang in lang_keys:
        CalculateStatisticsForLang(lang)

def GetWeightForLang(lang):
    """Size weight for *lang*: its own 'weight', else the weight of its
    'similar_lang', else 1.0."""
    info = lang_info[lang]
    if 'weight' in info:
        return info['weight']
    if 'similar_lang' in info:
        return lang_info[info['similar_lang']]['weight']
    return 1.0

def GetManyArticles(lang, wordlist):
    """Fetch article wikitext for a batch of titles from the *lang* wiki.

    *wordlist* maps Wikidata item -> local title.  Returns a dict mapping
    item -> wikitext ('' when the page has no retrievable revision).
    Titles that fail are retried in smaller batches, down to one at a
    time; a title that still fails maps to ''.
    """
    pipedword = '|'.join(list(wordlist.values()))
    wiki = pywikibot.Site(lang, 'wikipedia')
    params = {
        'action'    : 'query',
        'prop'      : 'revisions',
        'redirects' : True,
        'titles'    : pipedword,
        'rvprop'    : 'content',
        'rvslots'   : 'main'
        }
    pageRequest = api.Request(site=wiki, parameters=params)
    item_text = {}
    second_try = {}
    try:
        queryresult = pageRequest.submit()
        # map requested titles through any redirects the API reports
        redirects = {}
        if 'redirects' in queryresult['query']:
            for redirpair in queryresult['query']['redirects']:
                redirects[redirpair['from']] = redirpair['to']
            pywikibot.output(str(redirects))
        pages = queryresult['query']['pages']
        word_text = {}
        for k, v in list(pages.items()):
            try:
                word_text[v['title']] = v['revisions'][0]['slots']['main']['*']
            except (KeyError, IndexError):
                word_text[v['title']] = ''   # missing page / empty revision
        for k, v in list(wordlist.items()):
            word = redirects.get(v, v)
            try:
                item_text[k] = word_text[word]
            except KeyError:
                pywikibot.output(word)
                second_try[k] = word
        pywikibot.output(str(list(item_text.keys())))
    except Exception:
        # whole batch failed (API/network error): retry everything
        second_try = wordlist
    if len(second_try) > 0:
        if len(second_try) < len(wordlist):
            # part of the batch succeeded; retry only the failures
            item_text.update(GetManyArticles(lang, second_try))
        elif len(second_try) > 1:
            # nothing succeeded: fall back to one title per request
            for k, v in list(second_try.items()):
                item_text.update(GetManyArticles(lang, {k: v}))
        else:
            for k, v in list(second_try.items()):
                item_text[k] = ''
                pywikibot.output('Error getting: ' + k + ' ' + v)
    return item_text

def GetArticleTexts(lang):
    """Return {item: wikitext} for every item in item_list on the *lang*
    wiki, fetching in batches of 50; items without a local article map
    to ''."""
    textdict = {}
    article_group = {}
    for item in item_list:
        word = GetArticleInterwikiName(item, lang)
        if word == '':   # the '' was lost in the mangled source
            textdict[item] = ''
        else:
            article_group[item] = word
            if len(article_group) == 50:
                textdict.update(GetManyArticles(lang, article_group))
                article_group.clear()
                pywikibot.output(lang + ' ' + str(len(textdict)))
    # flush the final partial batch
    if len(article_group) > 0:
        textdict.update(GetManyArticles(lang, article_group))
        article_group.clear()
    return textdict

def CalculateStatisticsForLang(lang):
    """Gather per-article sizes for *lang* and update lang_info[lang]
    in place: per-article records ('art_N' dicts), bucket counts
    (absent/stubs/articles/longarticles) and total_size.

    Results are cached in ~<lang>_output.txt; when that file exists it is
    loaded instead of refetching.  Any error is reported and swallowed so
    remaining languages are still processed.
    """
    global num_lang
    num_lang += 1
    print(('=[' + lang + ' ' + str(num_lang) + '/' + str(len(lang_keys)) + ']').ljust(76, '='))
    try:
        lang_info[lang]['total_size']   = 0
        lang_info[lang]['absent']       = 0
        lang_info[lang]['stubs']        = 0
        lang_info[lang]['articles']     = 0
        lang_info[lang]['longarticles'] = 0
        lang_info[lang]['art_count']    = 0
        temp_path = "~%s_output.txt" % (lang)
        if os.path.isfile(temp_path):
            # cache layout: art_count, then (item, name, size, error) per article
            with open(temp_path, encoding=textfile_encoding) as temp_file:
                art_count = int(temp_file.readline())
                lang_info[lang]['art_count'] = art_count
                for index in range(art_count):
                    artKey = 'art_' + str(index)
                    lang_info[lang][artKey] = {}
                    lang_info[lang][artKey]['item'] = temp_file.readline().strip()
                    lang_info[lang][artKey]['name'] = temp_file.readline().strip()
                    linetext = temp_file.readline()
                    try:
                        lang_info[lang][artKey]['size'] = int(linetext)
                    except ValueError:
                        print(index, lang_info[lang][artKey]['item'], lang_info[lang][artKey]['name'])
                        lang_info[lang][artKey]['size'] = 0
                    lang_info[lang][artKey]['error'] = temp_file.readline().strip()
            print('..using previous %s result...' % (lang))
        else:
            wiki = pywikibot.Site(lang, 'wikipedia')
            textdict = GetArticleTexts(lang)
            word_count = 0
            for item, article in textdict.items():
                word_count += 1
                if word_count > max_words > 0:
                    break
                article_size = 0
                error = ''   # the '' was lost in the mangled source
                try:
                    raw_article_size = len(article)
                    interwiki_len = GetInterwikiLength(article)
                    comment_len   = GetCommentLength(article)
                    article_size  = (raw_article_size - interwiki_len - comment_len)
                    if lang != "en" and lang != 'simple' and lang != 'sco' and IsArticleEnglish(article):
                        raise TypeError ("Wrong language, %s:%s has too much untranslated English." % (lang, GetArticleInterwikiName(item, lang)))
                    lang_weight = GetWeightForLang(lang)
                    print(str(lang).ljust(3), str(word_count).rjust(3), item.ljust(30), end=' ')
                    print(("%.1f" % (article_size * lang_weight)).rjust(11), str(lang_weight).rjust(5), str(interwiki_len).rjust(9), str(comment_len).rjust(9))
                except KeyboardInterrupt:
                    sys.exit(1)
                except Exception:
                    # record the failure on the article and carry on
                    e = sys.exc_info()[1]
                    sys.stderr.write('\n')
                    traceback.print_exc()
                    sys.stderr.write('\n')
                    try:
                        error = CookString(str(e))
                    except Exception:
                        error = "Error."
                # index records by the item's position in the master list
                art_index = item_list.index(item)
                artKey = 'art_' + str(art_index)
                lang_info[lang][artKey] = {}
                lang_info[lang][artKey]['item'] = item
                if item in en_labels:
                    lang_info[lang][artKey]['name'] = en_labels[item]
                else:
                    lang_info[lang][artKey]['name'] = item
                lang_info[lang][artKey]['size'] = article_size
                lang_info[lang][artKey]['error'] = error
                lang_info[lang]['art_count'] = lang_info[lang]['art_count'] + 1
            # write the cache for the next run
            with open(temp_path, 'w', encoding=textfile_encoding) as temp_file:
                temp_file.write(str(lang_info[lang]['art_count']) + '\n')
                for index in range(lang_info[lang]['art_count']):
                    artKey = 'art_' + str(index)
                    temp_file.write(lang_info[lang][artKey]['item'] + '\n')
                    temp_file.write(lang_info[lang][artKey]['name'] + '\n')
                    temp_file.write(str(lang_info[lang][artKey]['size']) + '\n')
                    temp_file.write(lang_info[lang][artKey]['error'] + '\n')
        # tally buckets and total size from the per-article records
        for index in range(lang_info[lang]['art_count']):
            artKey = 'art_' + str(index)
            article_size    = lang_info[lang][artKey]['size']
            wt_article_size = article_size * GetWeightForLang(lang)
            article_type    = GetArticleType(wt_article_size)
            lang_info[lang][article_type] = lang_info[lang][article_type] + 1
            if not lang_info[lang][artKey]['error']:
                lang_info[lang]['total_size'] = lang_info[lang]['total_size'] + article_size
    except Exception:
        # report and continue with the next language
        sys.stderr.write('\n')
        traceback.print_exc()
        sys.stderr.write('\n')

def GetGrowthNumber(lang, score):
    """Score delta versus the previous run, or None when *lang* has no
    recorded previous score."""
    if lang not in prev_score:
        return None
    return score - prev_score[lang]

def GetGrowth(lang, score):
    """Growth as a signed display string ('n/a' when no previous score)."""
    if lang in prev_score:
        growth = "%+2.2f" % round(GetGrowthNumber(lang, score), 2)
    else:
        growth = "n/a"
    # never display negative zero
    return '+0.00' if growth == '-0.00' else growth

def GetAverageSize(lang, article_count):
    """Weighted mean size of the language's existing articles (int);
    0 when there are none."""
    if article_count <= 0:
        return 0
    mean = int(round(lang_info[lang]['total_size'] / article_count))
    return int(mean * GetWeightForLang(lang))

def GetMedianSize(lang):
    """Weighted median size over the language's non-empty articles (int);
    0 when every article is absent."""
    info = lang_info[lang]
    sizes = sorted(
        info['art_' + str(i)]['size']
        for i in range(info['art_count'])
        if info['art_' + str(i)]['size'] > 0
    )
    if not sizes:
        return 0
    mid = len(sizes) // 2
    if len(sizes) % 2:
        median = sizes[mid]
    else:
        median = (sizes[mid - 1] + sizes[mid]) / 2
    return int(median * GetWeightForLang(lang))

def PrintResults():
    """Print the per-language summary table to stdout, best score first.

    Side effect: sorts the module-level lang_keys by descending score.
    """
    lang_keys.sort(key=GetScoreForLang, reverse=True)
    print('\n')
    print('RESULTS\n----------------------------------------------------------------------')
    print('Lang:',' AvgSize','Median','Absent','   <8k ','8-16k','  >16k ', 'Score', 'Growth')
    for lang in lang_keys:
        info          = lang_info[lang]
        absent        = info['absent']
        stubs         = info['stubs']
        articles      = info['articles']
        longarticles  = info['longarticles']
        article_count = stubs + articles + longarticles
        score         = GetScore(absent, stubs, articles, longarticles)
        columns = [
            lang.ljust(6),
            str(GetAverageSize(lang, article_count)).rjust(7),
            str(GetMedianSize(lang)).rjust(7),
            str(absent).rjust(5),
            str(stubs).rjust(6),
            str(articles).rjust(6),
            str(longarticles).rjust(6),
            ("%6.2f" % score).rjust(6),
            GetGrowth(lang, score).rjust(6),
        ]
        print(' '.join(columns))

def GetWikiTableResults(awards=None):
    """Build the main wikitext results table, languages sorted by
    descending score.

    *awards* is accepted (the caller passes the dict from
    GetPreviousAwards(); it is only used by the currently commented-out
    growth-link code).  The default keeps zero-argument calls working.
    """
    lang_keys.sort(key=GetScoreForLang, reverse=True)
    table = 'This list of Wikipedias is based on the List of articles every Wikipedia should have/Expanded as a sample. The list currently has 10000 articles.  For every Wikipedia, the articles in this sample list are retrieved (based on interwiki links from Wikidata) and the number of characters is calculated (minus "comments" and the "interwiki" text at the bottom of the article). The size of each article is then adjusted for each language by multiplying it by the language weight. The articles are divided in four classes:'
    table += '\n'
    table += '* "absent" (i.e. non-existing; size = 0),'
    table += '\n'
    table += '* "stubs" (weighted \'\'size in characters\'\' less than 8 000),'
    table += '\n'
    table += '* "articles" (size between 8 000 and 16 000),'
    table += '\n'
    table += '* "long articles" (size more than 16 000).'
    table += '\n'
    table += 'The average and mean weighted sizes of all the non-absent articles in the sample are also calculated. Finally, a score is computed, based on the following formula:'
    table += '\n\n'
    table += ' rawscore = stubs*2 + articles*3 + long_articles*4.'
    table += '\n\n'
    table += 'In order to have a consistent scale, the raw score is normalized by dividing by the maximum score and multiplying by 100. The maximum score is  maxscore = total_items*4, where total_items is 10000. The final score is then'
    table += '\n\n'
    table += 'score = rawscore / (total_items * 0.04).'
    table += '\n\n'
    table += 'The language editions are then listed in order of decreasing score. The shortest articles for major Wikipedias are in List of Wikipedias by expanded sample of articles/Shortest.'
    table += '\n\n'
    # NOTE(review): the original string here was unterminated in the
    # mangled source and ended with scraped page chrome ('[ edit ]');
    # the original link markup may have been lost — confirm.
    table += 'More lists of Wikipedias by various criteria :'
    table += '\n\n'
    table += '== List =='
    table += '\n'
    table += 'Last Update: ' + date.today().strftime('%d. %b %Y.') + ' '
    table += '\n'
    table += '{|class="wikitable sortable" border="1" cellpadding="2" cellspacing="0" style="width:100%; background: #f9f9f9; border: 1px solid #aaaaaa; border-collapse: collapse; white-space: nowrap; text-align: center"'
    table += '\n|-\n'
    # NOTE(review): the <br/> tags below were reconstructed; they were
    # stripped from the mangled source, which also joined three statements
    # onto one line.
    table += ('!width = 45 | № !! width = 55 | Wiki !! width = 220 | Language !! '
              'width = 55 | Weight !! width = 120 | Mean Article<br/>Size !! '
              'width = 120 | Median Article<br/>Size !! width = 80 | Absent<br/>(0k) !! '
              'width=80| Stubs<br/>(< 8k)!! width = 80 | Articles<br/>(8-16k) !! '
              'width = 80 | Long Art.<br/>(> 16k) !! width = 80 | Score')
    table += '!! width = 50 | Growth'
    table += '\n|-\n'
    i = 0
    for lang in lang_keys:
        i += 1
        absent        = lang_info[lang]['absent']
        stubs         = lang_info[lang]['stubs']
        articles      = lang_info[lang]['articles']
        longarticles  = lang_info[lang]['longarticles']
        article_count = stubs + articles + longarticles
        dagger = '†'
        # a wiki with no data at all is footnoted and scored as all-absent
        if absent + article_count == 0:
            lang_footnote = dagger
            absent = lang_info['en']['art_count']
        else:
            lang_footnote = ''   # the '' was lost in the mangled source
        table += '|' + str(i) + '\n'
        table += '| ' + lang + '' + lang_footnote + '\n'
        table += '| style = "text-align: left" | [[:w:' + lang_info[lang]['name'] + ' language|' + lang_info[lang]['localname'] + ']]\n'
        if 'weight' in lang_info[lang]:
            weight = str(lang_info[lang]['weight'])
        elif 'similar_lang' in lang_info[lang]:
            weight = str(lang_info[lang_info[lang]['similar_lang']]['weight']) + '**'
        else:
            weight = '1.0*'
        score         = GetScore(absent, stubs, articles, longarticles)
        growth        = GetGrowth(lang, score)
        avg_size      = GetAverageSize(lang, article_count)
        med_size      = GetMedianSize(lang)
        #if HasAwards(awards, lang):
            #growth = GetLink('Growth',lang, growth)
        table += '| ' + weight + '\n'
        table += '| ' + FormatNumber(avg_size) + '\n'
        table += '| ' + FormatNumber(med_size) + '\n'
        table += '| ' + GetTableNumber(absent,       1, 1000,'Shortest', lang, 3000) + '\n'
        table += '| ' + GetTableNumber(stubs,        1, 0,'Shortest',           lang, 1000) + '\n'
        table += '| ' + GetTableNumber(articles,     1, 0,'Articles',        lang, 0) + '\n'
        table += '| ' + GetTableNumber(longarticles, 1, 0,'Long Articles',   lang, 0) + '\n'
        #color code score
        if score >= 100.00:
            color = "|style = \"background: "+'\u0023'+color10000+"\""
        elif score >= 40.00:
            color = "|style = \"background: "+'\u0023'+color4000+"\""
        elif score >= 20.00:
            color = "|style = \"background: "+'\u0023'+color2000+"\""
        elif score >= 10.00:
            color = "|style = \"background: "+'\u0023'+color1000+"\""
        elif score >= 5.00:
            color = "|style = \"background: "+'\u0023'+color500+"\""
        elif score >= 2.50:
            color = "|style = \"background: "+'\u0023'+color250+"\""
        elif score >= 1.00:
            color = "|style = \"background: "+'\u0023'+color100+"\""
        else:
            color = "|style = \"background: "+'\u0023'+color0+"\""
        table += color + '| ' + ("%.2f" % score) + '\n'
        table += '| ' + growth + '\n'
        table += '|-\n'
    # drop the trailing '|-' and close the table
    table = table[:-2] + '}\n'
    table += '\n'
    return table

def GetWikiTableArticles(article_type, min_articles, max_articles_0, max_articles_40=0):
    """Wikitext listing, per language, of the articles in the given
    bucket ('absent'/'stubs'/'articles'/'longarticles').

    A language's section is emitted only when its count lies within
    [min_articles, max_articles]; wikis scoring above 40 use the higher
    max_articles_40 cap when it is set.
    """
    lang_keys.sort()
    table = ''   # the '' was lost in the mangled source
    i = 0
    for lang in lang_keys:
        i += 1
        count = 0
        max_articles = max_articles_0
        score = GetScoreForLang(lang)
        if score > 40 and max_articles_40 > 0:
            max_articles = max_articles_40
        section = '===' + lang + ' [[:w:' + lang_info[lang]['name'] + ' language|' + lang_info[lang]['localname'] + ']]===\n'
        for index in range(lang_info[lang]['art_count']):
            artKey = 'art_' + str(index)
            artWtSize = GetArticleSize(lang, artKey)
            artType = GetArticleType(artWtSize)
            if artType == article_type:
                section += '#[[d:' + lang_info[lang][artKey]['item'] + '|' + lang_info[lang][artKey]['name'] + ']] ' + lang_info[lang][artKey]['error'] + '\n'
                count += 1
        if min_articles <= count <= max_articles:
            table += section
    return table

def GetArticleName(lang, artKey):
    """English display name recorded for the article, or 0 when *lang*
    has no record under *artKey*."""
    records = lang_info[lang]
    if artKey not in records:
        return 0
    return records[artKey]['name']

def GetArticleSize(lang, artKey):
    """Weighted article size; 0 when the record is missing or its fetch
    ended in an error."""
    records = lang_info[lang]
    if artKey not in records:
        return 0
    if records[artKey]['error']:
        return 0
    return records[artKey]['size'] * GetWeightForLang(lang)

def GetEdgeFactor(lang, artKey):
    """Neglect bonus for articles hovering just above a points boundary.

    Returns 1 for absent articles, a 0-3 ramp for sizes just above 7000,
    a 0-6 ramp for sizes just above 24000, else 0.
    """
    size = GetArticleSize(lang, artKey)
    if size == 0:
        return 1
    # BUG FIX: was "7000 < size < 1000" — an impossible range; the upper
    # bound was clearly meant to be 10000 (mirrors the 24000-30000 band).
    if 7000 < size < 10000:
        return (size - 7000) / 1000
    if 24000 < size < 30000:
        return (size - 24000) / 1000
    return 0

def GetRuntFactor(lang, artKey):
    """Return 4 when this is the smallest non-absent article in *lang*
    (its 'runt'), else 0.  Absent articles score 0."""
    size = GetArticleSize(lang, artKey)
    if size <= 0:
        return 0
    for index in range(lang_info['en']['art_count']):
        other_key = 'art_' + str(index)
        if other_key == artKey:
            continue
        if 0 < GetArticleSize(lang, other_key) < size:
            return 0   # a smaller article exists; not the runt
    return 4

def GetArticlePoints(lang, artKey):
    """Quality points for one article: 0 absent, 1 stub (< 10k),
    4 article (10k-30k), 9 long article (>= 30k).

    BUG FIX: the original used strict comparisons on both sides, so
    sizes of exactly 10000 or 30000 fell through to 0 points; the
    boundaries are now inclusive on the upper bucket.
    """
    size = GetArticleSize(lang, artKey)
    if size <= 0:
        return 0
    if size < 10000:
        return 1
    if size < 30000:
        return 4
    return 9

def GetAverageArticlePoints(artKey):
    """Mean quality points for this article across all languages."""
    points = [GetArticlePoints(lg, artKey) for lg in lang_keys]
    return float(sum(points)) / len(lang_keys)

def GetAverageArticleSize(artKey):
    """Mean weighted size of this article across all languages (int)."""
    total = 0
    for lg in lang_keys:
        total += GetArticleSize(lg, artKey)
    return int(float(total) / len(lang_keys))

def GetNeglectForArticle(lang, artInfo):
    """Neglect score for one article on one wiki: how far its points lag
    the cross-wiki average, boosted when it sits near a scoring boundary
    (edge factor) or is the wiki's smallest article (runt factor)."""
    key = artInfo['artKey']
    deficit = GetAverageArticlePoints(key) - GetArticlePoints(lang, key)
    return deficit + GetEdgeFactor(lang, key) + GetRuntFactor(lang, key)

def GetArticlesSortedByNeglect(lang):
    """All articles (indexed off the English records), sorted most
    neglected first with popularity (average size) breaking ties."""
    infos = []
    if 'art_count' in lang_info['en']:
        for idx in range(lang_info['en']['art_count']):
            key = 'art_' + str(idx)
            entry = {'artKey': key, 'popularity': GetAverageArticleSize(key)}
            entry['neglect'] = GetNeglectForArticle(lang, entry)
            infos.append(entry)
    infos.sort(key=lambda e: (e['neglect'], e['popularity']), reverse=True)
    return infos

def GetLargestArticles(artKey, maxLangs):
    """' -- '-joined 'lang:size' list of the *maxLangs* wikis with the
    largest weighted version of this article.

    The original also looked up lang_info['en'][artKey]['item'] into an
    unused local; that dead lookup is removed.
    """
    ranked = sorted(lang_info.keys(),
                    key=lambda lg: GetArticleSize(lg, artKey),
                    reverse=True)
    parts = [lg + ':' + FormatNumber(GetArticleSize(lg, artKey))
             for lg in ranked[0:maxLangs]]
    return ' -- '.join(parts)

def GetArticleTypeCount(artKey, points):
    """Number of languages whose version of this article earns exactly
    *points* quality points."""
    return sum(1 for lg in lang_keys if GetArticlePoints(lg, artKey) == points)

def GetNeglectedArticles(lang, max_articles):
    """Wikitext list of the *max_articles* most neglected articles for
    *lang*, with the local (unweighted-int) size when one exists."""
    artInfos = GetArticlesSortedByNeglect(lang)
    i = 0
    table = ''
    for artInfo in artInfos:
        if artInfo['artKey'] in lang_info[lang]:
            item = lang_info[lang][artInfo['artKey']]['item']
            name = lang_info[lang][artInfo['artKey']]['name']
            table += '#' + name
            size = int(GetArticleSize(lang, artInfo['artKey']))
            if size > 0:
                iw_name = GetArticleInterwikiName(item, lang)
                if iw_name == '':   # the '' was lost in the mangled source
                    table += ' (' + str(size) + ')'
                else:
                    # NOTE(review): iw_link is built but never emitted; the
                    # interwiki-link markup around it appears to have been
                    # lost upstream — confirm against history.
                    iw_link = lang + ':' + iw_name
                    table += ' (' + str(size) + ')'
            table += '\n'
        i += 1
        if i >= max_articles:
            break
    return table

def GetPopularArticles(max_articles):
    """Wikitext table of articles sorted by popularity (average size
    across all wikis); max_articles <= 0 means no limit."""
    artInfos = GetArticlesSortedByNeglect('en')
    artInfos.sort(key=lambda x: x['popularity'], reverse=True)
    i = 0
    table = '{|class="wikitable sortable" border="1" cellpadding="2" cellspacing="0" style="width:100%; background: #f9f9f9; border: 1px solid #aaaaaa; border-collapse: collapse; white-space: nowrap; text-align: center"'
    table += '\n|-\n'
    # NOTE(review): the <br/> tags below were reconstructed; they were
    # stripped from the mangled source.
    table += ('!width = 45 | № !! width = 90 | Average Size !! '
              'width = 150 | Article Name !! width = 80 | Absent<br/>(0k) !! '
              'width=80| Stubs<br/>(< 10k)!! width = 80 | Articles<br/>(10-30k) !! '
              'width = 80 | Long Art.<br/>(> 30k) !! width = 150 | Largest Articles\n')
    for artInfo in artInfos:
        i += 1
        artKey = artInfo['artKey']
        table += '|-\n'
        table += '|' + str(i)
        table += '||' + FormatNumber(artInfo['popularity'])
        table += '||style="text-align:left"|[[d:' + lang_info['en'][artKey]['item'] + '|' + lang_info['en'][artKey]['name'] + ']]'
        table += '||' + str(GetArticleTypeCount(artKey, 0))
        table += '||' + str(GetArticleTypeCount(artKey, 1))
        table += '||' + str(GetArticleTypeCount(artKey, 4))
        table += '||' + str(GetArticleTypeCount(artKey, 9))
        table += '||' + GetLargestArticles(artKey, 4) + '\n'
        if i >= max_articles > 0:
            break
    table += '|}\n'
    return table

def GetWikiNeglectedArticles():
    """Build the 'Popular Articles' table plus a per-language 'Neglected
    Articles' report (with an Errors subsection per wiki that had fetch
    failures)."""
    lang_keys.sort()
    table = ''   # the '' was lost in the mangled source
    print('writing Popular Articles...')
    table += '==Popular Articles==\n'
    table += GetPopularArticles(-1)
    print('writing Neglected Articles...')
    table += '==Neglected Articles==\n'
    for lang in lang_keys:
        print(' ' + lang)
        if lang_info[lang]['art_count'] > 0:
            table += '===' + lang + ' [[:w:' + lang_info[lang]['name'] + ' language|' + lang_info[lang]['localname'] + ']]===\n'
            table += GetNeglectedArticles(lang, 10)
        has_errors = False
        section = '====Errors====\n'
        for index in range(lang_info[lang]['art_count']):
            artKey = 'art_' + str(index)
            if lang_info[lang][artKey]['error']:
                section = section + '#[[d:' + lang_info[lang][artKey]['item'] + '|' + lang_info['en'][artKey]['name'] + ']] ' + lang_info[lang][artKey]['error'] + '\n'
                has_errors = True
        if has_errors:
            table = table + section
    return table

def SaveWikiTableResults(awards):
    """Write every report file (results table, per-bucket article lists,
    awards, neglected-article suggestions) to the working directory.

    BUG FIX: three calls passed encoding= to print() (a TypeError) while
    opening the corresponding file without it; the encoding now goes to
    open() everywhere, and 'with' guarantees the files are closed.
    """
    print('writing Results...')
    with open('results.txt', 'w', encoding=textfile_encoding) as f:
        f.write(GetWikiTableResults(awards))
    print('writing Absent...')
    with open('_absent.txt', 'w', encoding=textfile_encoding) as f:
        f.write(GetWikiTableArticles('absent', 1, 250))
    print('writing Stubs...')
    with open('_stub.txt', 'w', encoding=textfile_encoding) as f:
        f.write(GetWikiTableArticles('stubs', 1, 100, 250))
    print('writing Articles...')
    with open('_articles.txt', 'w', encoding=textfile_encoding) as f:
        f.write(GetWikiTableArticles('articles', 1, 100, 250))
    print('writing Long Articles...')
    with open('_longarticles.txt', 'w', encoding=textfile_encoding) as f:
        f.write(GetWikiTableArticles('longarticles', 1, 100))
    print('writing Awards...')
    with open('_growth.txt', 'w', encoding=textfile_encoding) as f:
        f.write(GetWikiAwards(awards))
    print('writing Suggestions...')
    with open('_neglectedarticles.txt', 'w', encoding=textfile_encoding) as f:
        f.write(GetWikiNeglectedArticles())

def CookString(rawString):

   cookString = 
   for part in rawString.replace("'","||").split("|"):
       if len(part)==0:
           cookString += "'"
       else:
           cookString += eval("u'"+part+"'")
   return cookString

def GetGrowths(article):

   """Parse a previously published results-table revision and return
   {lang: growth_float} scraped from its rows.

   Relies on the exact wikitext layout emitted by GetWikiTableResults
   (the '[[:' language links and 'style = "background:' score cells);
   rows that stop matching end the scan.  Unparseable growth cells
   default to 0.
   """
   growths = {}
   lang_last  = 0
   # each table row's language link starts with '[[:'
   lang_first = article.find('[[:', lang_last)
   while lang_first > -1:
       lang_last  = article.find('|',  lang_first)
       if lang_last == -1:
           break
       lang = article[lang_first+3:lang_last-1]
       # the score cell carries an inline background style; the growth
       # cell follows it on the next line
       score_first = article.find('style = "background:',lang_last)
       if score_first == -1:
           break
       score_last  = article.find('|', score_first+32)
       if score_last == -1:
           break
       growth_end = article.find('\n', score_last)
       growth_str = article[score_last+2:growth_end]
       try:
           # strip any linking/footnote markup around the number
           growth_pipe = growth_str.find('|')
           if growth_pipe > -1:
               growth_str = growth_str[growth_pipe+1:-2]
           if growth_str.find(' ‡') > -1:
               growth_str = growth_str[0:-2]
           growth = float(growth_str)
       except:
           growth = 0
       growths[lang]=growth
       lang_first = article.find('[[:', score_last)
   return growths

def GetLastUpdated(article):
    """Extract the 'Last Update' date from a results-page revision and
    return it normalized (zero-padded day, month truncated to three
    letters, space-joined); returns None when no date is found.
    """
    date_first = article.find('Last Update')
    if date_first > -1:
        date_last_paren = article.find('(', date_first)
        # NOTE(review): reconstructed — the original search literal (an
        # HTML tag, presumably '<br') was stripped from the mangled
        # source, which also joined several statements onto one line.
        date_last_br = article.find('<br', date_first)
        if date_last_paren > -1 and date_last_paren < date_last_br:
            date_last = date_last_paren
        else:
            date_last = date_last_br
        if date_last > -1:
            # some revisions wrote 'Last Update - <date>'
            hyphen = article.find('-', date_first, date_last)
            if hyphen > -1:
                date_first = hyphen + 1
            else:
                date_first += 12   # skip past 'Last Update:'
            parts = article[date_first:date_last].strip().split(' ')
            if len(parts[0]) == 1:
                parts[0] = '0' + parts[0]
            if parts[0][0] == ':':
                parts[0] = '0' + parts[0][1]
            parts[1] = parts[1][0:3]
            return ' '.join(parts)

# module-level copy of the growth dict used by CalculatePlacing's sort
# key; was joined onto the def line in the mangled source (syntax error)
growthsG = {}

def CalculatePlacing(growths, oldid, update):
    """Rank languages by growth for one update and return the award list.

    The top three always place; languages beyond third get an Honorable
    Mention only while their growth exceeds 1.  Each placing dict carries
    lang, growth, oldid, update, placestr and ribbonimg.
    """
    global growthsG
    growthsG = growths
    ranked = sorted(growths.keys(), key=lambda lg: growthsG[lg], reverse=True)
    placeNo = 0
    print(update)
    placing = []
    for lang in ranked:
        if placeNo < 3 or growths[lang] > 1:
            placeNo += 1
            if placeNo == 1:
                placestr, ribbonimg = '1st Place', 'Article blue.svg'
            elif placeNo == 2:
                placestr, ribbonimg = '2nd Place', 'Article red.svg'
            elif placeNo == 3:
                placestr, ribbonimg = '3rd Place', 'Article yellow.svg'
            else:
                placestr, ribbonimg = 'Honorable Mention', 'Article green.svg'
            print(" %d  %-3s %+2.2f" % (placeNo, lang, growths[lang]))
            placing.append({'lang': lang, 'growth': growths[lang],
                            'oldid': oldid, 'update': update,
                            'placestr': placestr, 'ribbonimg': ribbonimg})
    return placing

def GetPreviousAwards():
    """Scan the meta results page's 2009-2013 revision history and
    compute award placings for each distinct published update.

    Returns {update_date_string: placing list from CalculatePlacing}.
    """
    article_name = 'List of Wikipedias by sample of articles'
    meta_wiki = pywikibot.Site('meta', 'meta')
    meta_page = pywikibot.Page(meta_wiki, article_name)
    awards = {}
    prevUpdate = ''   # the '' was lost in the mangled source
    prevGrowth = -999
    for rev in meta_page.revisions():
        oldid, datetime, username, comments = rev.hist_entry()
        # only update/correction edits from 2009-2013, skipping a few
        # known-bad revision ids
        if ('2009' in datetime or '2010' in datetime or '2011' in datetime or '2012' in datetime or '2013' in datetime) and ("updat" in comments.lower() or 'correct' in comments.lower()) and oldid != 2228213 and oldid != 2264612 and oldid != 3122655 and oldid != 3359817:
            article = meta_page.getOldVersion(get_redirect=False, oldid=oldid)
            growths = GetGrowths(article)
            if 'en' in growths:
                update = GetLastUpdated(article)
                growth = growths['en']
                # skip consecutive revisions of the same published update
                if update != prevUpdate and (prevGrowth != growth or oldid > 3807780):
                    prevUpdate = update
                    prevGrowth = growth
                    awards[update] = CalculatePlacing(growths, oldid, update)
    return awards

def HasAwards(awards, lang):
    """True when *lang* appears in any update's placing list."""
    return any(place['lang'] == lang
               for placings in awards.values()
               for place in placings)

def GetWikiAwards(awards):
    """Render the per-language improvement-award tables as wikitext.

    *awards* maps an update-date string to a list of place dicts (keys: lang,
    growth, oldid, update, placestr, ribbonimg) as built by CalculatePlacing.
    Returns one wikitext section per language that has at least one award.
    """
    table = '==2009-2019 Improvement Awards==\n'
    for lang in lang_keys:
        section = '===' + lang + ' [[:w:' + lang_info[lang]['name'] + ' language|' + lang_info[lang]['localname'] + ']]===\n'
        rows = []
        for update, placings in list(awards.items()):
            for place in placings:
                if lang == place['lang']:
                    mid_section = '|-\n'
                    # NOTE(review): 'File:%s' renders as plain text, not an
                    # embedded image; the original markup may have been
                    # [[File:...]] and lost its brackets — confirm against
                    # the live meta page before relying on this output.
                    mid_section += '|width = 150 | File:%s %s\n' % (place['ribbonimg'], place['placestr'])
                    if place['oldid'] == -1:
                        # Today's (not yet saved) placing has no revision id.
                        mid_section += '|width = 120 align=center| %s\n' % (place['update'])
                    else:
                        # Fixed: original passed a 2-tuple to a single %s,
                        # which raised TypeError.  Link the update date to
                        # the revision it was computed from.
                        mid_section += '|width = 120 align=center| [[Special:PermanentLink/%s|%s]]\n' % (place['oldid'], place['update'])
                    mid_section += '|width = 80 align=center| %+2.2f\n' % round(place['growth'], 2)
                    rows.append({'place': place, 'mid_section': mid_section})
        if len(rows) > 0:
            # Stable double sort: primary key placestr, ties broken by growth
            # descending (inner sort survives because sorted() is stable).
            rows = sorted(sorted(rows, key=lambda row: row['place']['growth'], reverse=True),
                          key=lambda row: row['place']['placestr'])
            if len(rows) > 1:
                section += '{|class="wikitable sortable" cellpadding="6" cellspacing="0"\n'
                section += '! !! !!\n'
            else:
                section += '{|class="wikitable" cellpadding="6" cellspacing="0"\n'
            for row in rows:
                section += row['mid_section']
            section += '|}\n'
            table += section
    return table

def CalculateAwards():
    """Compute today's growth placings and merge them into the historical awards dict."""
    print("calculating awards...")
    growth_by_lang = {}
    for code in lang_keys:
        info = lang_info[code]
        score = GetScore(info['absent'], info['stubs'],
                         info['articles'], info['longarticles'])
        growth_by_lang[code] = GetGrowthNumber(code, score)
    today = strftime("%d %b %Y")
    # oldid -1 marks "today's run" (no saved revision yet).
    todays_placing = CalculatePlacing(growth_by_lang, -1, today)
    awards = GetPreviousAwards()
    awards[today] = todays_placing
    return awards

def GetArticleExists(lang, artKey):
    """Return 1 if the article has a nonzero size in this language, else 0."""
    return 1 if GetArticleSize(lang, artKey) > 0 else 0

def GetNumberExists(artKey):
    """Count in how many languages the given article exists."""
    flags = (GetArticleExists(code, artKey) for code in lang_keys)
    return sum(flags)

def GetMostCommonList():
    """Return art keys sorted by (#wikis that have the article, average size), descending."""
    infos = []
    for idx in range(len(item_list)):
        key = 'art_' + str(idx)
        infos.append({
            'artKey': key,
            'numberExists': GetNumberExists(key),
            'averageSize': GetAverageArticleSize(key),
        })
    infos.sort(key=lambda info: (info['numberExists'], info['averageSize']),
               reverse=True)
    return [info['artKey'] for info in infos]

def WriteShortestSection(lang, most_common_list):
    """Build the wikitext section listing up to 200 shortest articles for *lang*."""
    entries = [{'artKey': key, 'length': GetArticleSize(lang, key)}
               for key in most_common_list]
    entries.sort(key=lambda entry: entry['length'])
    section = '\n===' + lang + ' [[:w:' + lang_info[lang]['name'] + ' language|' + lang_info[lang]['localname'] + ']]===\n'
    for position, entry in enumerate(entries):
        info = lang_info[lang][entry['artKey']]
        section += ('#[[d:' + info['item'] + '|' + info['name'] + ']] '
                    + '%d' % entry['length'] + ' ' + info['error'] + '\n')
        # Cut off after the 200th entry (positions 0..199).
        if position >= 199:
            return section
    return section

def GetWikiShortestArticles():
    """Build the wikitext report of the shortest articles per qualifying wiki.

    Languages scoring <= 40 are skipped.  Sorts lang_keys in place as a
    side effect (the original did the same).
    """
    lang_keys.sort()
    table = 'The 200 shortest articles found when generating the wikipedia\'s score. Wikipedias are excluded from this list if their score is < 40.'
    # Fixed: the original string literal here was broken across physical
    # source lines (a syntax error from whitespace mangling); reconstructed
    # as the blank-line separator its characters spell out.
    table += '\n\n\n'
    table += '\n'
    most_common_list = GetMostCommonList()
    for lang in lang_keys:
        if GetScoreForLang(lang) > 40:
            table += WriteShortestSection(lang, most_common_list)
    table += '\n'
    return table

def SavePreviousScore(): article_name = 'List of Wikipedias by expanded sample of articles' meta_wiki = pywikibot.Site('meta', 'meta') meta_page = pywikibot.Page(meta_wiki, article_name) article = meta_page.get(get_redirect=False) f = open('PreviousScores.txt', 'w', encoding=textfile_encoding) count = 0 lang_last = 0 lang_first = article.find('[[:', lang_last) while lang_first > -1: lang_last = article.find('|', lang_first) lang = article[lang_first+3:lang_last-1] score_first = article.find('style = "background:',lang_last) score_last = article.find('|', score_first+32) score = article[score_first+31:score_last-1] f.write(lang + ' ' + score + '\n') count += 1 print(count, lang, score) lang_first = article.find('[[:', score_last) f.close()

def WriteResultsToFile():
    """Write the main results table and the shortest-articles report to disk.

    Uses context managers so the files are closed even if rendering raises
    (the original leaked the handle on an exception in GetWikiTableResults /
    GetWikiShortestArticles).
    """
    print('writing Results...')
    with open('results.txt', 'w', encoding=textfile_encoding) as f:
        f.write(GetWikiTableResults())
    print('writing Shortest...')
    with open('_shortest.txt', 'w', encoding=textfile_encoding) as f:
        f.write(GetWikiShortestArticles())
# support dividing up work

# Invoked as "script.py PART NUMPARTS": keep only every NUMPARTS-th
# language, starting at 1-based PART, so several processes can share the work.
if len(sys.argv) == 3:

    part      = int(sys.argv[1])-1
    numparts  = int(sys.argv[2])
    # enumerate replaces the per-element lang_keys.index() call, which made
    # the filter accidentally O(n^2); the result is identical because
    # lang_keys entries (dict keys) are unique.
    lang_keys = [lang for i, lang in enumerate(lang_keys) if i % numparts == part]

def oldMain():

   """Legacy pipeline: load cached scores, compute stats/awards, save the table.

   Kept for reference; main() is the current entry point.  Call order
   matters — each step consumes module-level state set by the previous one.
   """
   GetPreviousScores()
   CalculateStatistics()
   awards = CalculateAwards()
   PrintResults()
   SaveWikiTableResults(awards)

def main():

   """Run the full pipeline: fetch previous scores, compute stats, write reports.

   Steps are order-dependent: the Get*/Calculate* calls populate module-level
   state that PrintResults() and WriteResultsToFile() read.
   """
   SavePreviousScore()
   GetPreviousScores()
   GetItemList()
   GetIwLinks()
   CalculateStatistics()
   PrintResults()
   WriteResultsToFile()

if __name__ == '__main__':

   try:
       main()
   finally:
       # Always release pywikibot's throttle file/locks, even when main()
       # raises or is interrupted.
       pywikibot.stopme()