Utilisateur:RimBot/count-quotes.py
Apparence
#!/usr/bin/env python
# coding: utf-8
"""
This script counts the number of quotes.
# TODO :
# - Check for different quotes with the same 'original' field
# - Check not only for equal, but also for similar quotes (edit distance)
# - Check for same quote with different refs
This script understands various command-line arguments:
-cat Work on all pages which are in a specific category.
Argument can also be given as "-cat:categoryname".
-ref Work on all pages that link to a certain page.
Argument can also be given as "-ref:referredpagetitle".
-links Work on all pages that are linked from a certain page.
Argument can also be given as "-links:linkingpagetitle".
-new Work on the most recent new pages on the wiki
-subcat When the pages to work on have been chosen by -cat, pages in
subcategories of the selected category are also included.
When -cat has not been selected, this has no effect.
-file: used as -file:filename, read a list of pages to treat
from the named file
-start: used as -start:title, specifies that the robot should
go alphabetically through all pages on the home wiki,
starting at the named page.
-output: used as -output:pagename, generate report to pagename.
-outputprefix: used as -outputprefix:pagename, create NUMBEROFQUOTES and
NUMBEROFARTICLES as subpages of pagename
-outputquotes: used as -outputquotes:pagename, put the number of quotes
on pagename
-outputarticles: used as -outputarticles:pagename, put the number of
articles containing quotes on pagename
"""
import pywikibot
from pywikibot import *
from pywikibot import pagegenerators
import sys, re, string, time, locale, operator
# Localized report fragments, keyed by site language code.
# Every language maps to a 10-tuple used by index:
#   0  page heading, formatted with (number of quotes, number of articles)
#   1  note explaining how articles and duplicates are counted
#   2  heading of the per-article section
#   3  opening of the per-article table
#   4  "mean" row of the per-article table, formatted with a float
#   5  note about the mean
#   6  heading of the duplicates section
#   7  note explaining the duplicates table
#   8  opening of the duplicates table ('%%' is kept doubled because the
#      whole report is %-formatted once more to insert the timestamp)
#   9  edit summary used when saving pages
msg = {
    'fr': (u'<h1 style="border-bottom:none;margin-top:0;margin-bottom:0.8em;text-align:center;"> %d citations dans %d articles ',
           u'<small>(Le nombre d\'articles n\'inclut que les articles qui possèdent au moins une citation. Les citations en plusieurs exemplaires ne sont comptées qu\'une fois.)</small>',
           u'== Répartition par article ==\n\n',
           u'{| class="wikitable"\n|+\'\'\'Nombre de citations par article\'\'\'\n! align="center" | Article\n! align="center" | Nombre de citations\n',
           u'|- align="center" \n| \'\'\'Moyenne\'\'\' || %0.2f\n',
           u'<small>La moyenne exclut les articles ne contenant aucune citation.</small>\n\n',
           u'== Citations en plusieurs exemplaires ==\n',
           u'<small>(Ce tableau recense le nombre de citations qui figurent dans plusieurs articles. Ainsi, si le premier champ vaut N et le second M, cela signifie que M citations figurent en N exemplaires (dans N articles))</small>\n\n',
           u'{| class="wikitable" style="width: 40%%"\n|+\'\'\'Nombre d\'exemplaires de la citation\'\'\'\n! align="center" | Article\n! align="center" | Nombre de citations\n',
           u'Mise à jour des statistiques'),
    # NOTE(review): the three "í" below were restored from a mis-encoded
    # "à" (letter and following space lost) -- confirm with a speaker.
    'is': (u'<h1 style="border-bottom:none;margin-top:0;margin-bottom:0.8em;text-align:center;"> %d tilvitnanir í %d greinum ',
           u'<small>(Í fjölda greina eru greinar með engum tilvitnunum ekki taldar með. Tvöfaldaðar tilvitnanir eru taldar sem ein.)</small>',
           u'== Eftir greinum ==\n\n',
           u'{| class="wikitable"\n|+\'\'\'Fjöldi tilvitnana eftir greinum\'\'\'\n! align="center" | Grein\n! align="center" | Fjöldi tilvitna\n',
           u'|- align="center" \n| \'\'\'Meðaltal\'\'\' || %0.2f\n',
           u'<small>Meðaltalið telur ekki með greinar sem innihalda engar tilvitnanir.</small>\n\n',
           u'== Tvöfaldar tilvitnanir ==\n',
           u'<small>(Þessi tafla sýnir fjölda tilvitnana sem koma fram í fleirum en einni grein. If the first field is N and the second M, it means M quotes appear N times (in N articles))</small>\n\n',
           u'{| class="wikitable" style="width: 40%%"\n|+\'\'\'Fjöldi greina sem tilvitnunin kemur fyrir\'\'\'\n! align="center" | Greinar\n! align="center" | Fjöldi tilvitnana\n',
           u'Updated statistics'),
    'en': (u'<h1 style="border-bottom:none;margin-top:0;margin-bottom:0.8em;text-align:center;"> %d quotes in %d articles ',
           u'<small>(The number of articles does not include articles without quotes. Duplicated quotes are counted only once.)</small>',
           '== By article ==\n\n',
           '{| class="wikitable"\n|+\'\'\'Number of quotes by article\'\'\'\n! align="center" | Article\n! align="center" | Number of quotes\n',
           '|- align="center" \n| \'\'\'Mean\'\'\' || %0.2f\n',
           '<small>Mean excludes articles without quotes.</small>\n\n',
           '== Duplicated quotes ==\n',
           '<small>(This table gives the number of quotes that appear in more than one article. If the first field is N and the second M, it means M quotes appear N times (in N articles))</small>\n\n',
           '{| class="wikitable" style="width: 40%%"\n|+\'\'\'Number of articles where the quote appears\'\'\'\n! align="center" | Articles\n! align="center" | Number of quotes\n',
           'Updated statistics')
}
def trim(s):
    """Return ``s`` without leading/trailing blanks, NUL or vertical tabs."""
    # Space, tab, newline, carriage return, NUL and vertical tab.
    strippable = " \t\n\r\0\x0B"
    return s.strip(strippable)
def replace_citation(piece):
    """Return the quote text carried by a parsed quote template.

    ``piece['parts']`` is a list of ``[argname, value]`` pairs.  The value
    of the first argument whose trimmed name is the configured quote
    argument (compared lower-cased), ``'1'`` or empty (positional) is
    returned.  ``None`` is returned when no argument qualifies.
    """
    for part in piece['parts']:
        name = part[0]
        stripped = trim(name)
        if trim(name.lower()) == globalvar.quote_arg or stripped in ('1', ''):
            return part[1]
    return None
def replace_lang(piece):
    """Substitute a ``lang`` template by the value of its second argument."""
    arguments = piece['parts']
    second = arguments[1]
    # Each argument is an [argname, value] pair; keep the value.
    return second[1]
def replace_template(piece):
    """Default substitution for templates without a dedicated handler.

    A template with arguments is replaced by the value of its first
    argument; a template without any argument is replaced by its title.
    """
    arguments = piece['parts']
    if not arguments:
        return piece['title']
    return arguments[0][1]
class Global(object):
    """Container class for global settings.

    Use of globals outside of this is to be avoided."""
    # Title of the wiki's main page; that page is never processed.
    mainpagename = None
    # Language code used to index ``msg``; overwritten at start-up.
    lang = 'en'
    # Quotes collected so far (filled by workon()).
    quotes = {}
    # List of articles without quotes.
    noquotes = []
    # Name of the quote template and of its argument holding the quote.
    quote_template = 'citation'
    quote_arg = 'citation'
    quiet = False
    debug = False

    @property
    def known_templates(self):
        """Map lower-cased template titles to their substitution callbacks.

        Built on access rather than once at class-definition time, so that
        a quote template name changed after start-up (``-template:``) is
        still routed to ``replace_citation``.
        """
        return {
            self.quote_template.lower(): replace_citation,
            'lang': replace_lang,
        }
def outputq(s, newline = True):
    """Print ``s`` through pywikibot unless quiet mode is enabled."""
    if globalvar.quiet:
        return
    pywikibot.output(u'%s' % s, newline = newline)
def debug(s):
    """Print ``s`` through pywikibot, but only in debug mode."""
    if not globalvar.debug:
        return
    pywikibot.output(u'%s' % s)
def error(s):
    """Report an error message, prefixed with ``ERROR: ``."""
    # Errors go to ... stdout, because pywikipedia uses stderr for "normal"
    # output. *sigh*
    message = u'ERROR: %s' % s
    pywikibot.output(message, toStdout = True)
def parse_templates(text):
    """Substitute the first template found in ``text``, nested ones included.

    ``text`` should start with a quote template.  Each template is replaced
    by the result of its callback from ``globalvar.known_templates``, or of
    ``replace_template`` when its lower-cased title is not listed there.
    Parsing stops as soon as the first (outermost) template is closed.

    Returns ``[new_text, end]``: ``new_text`` is ``text`` with that template
    substituted and ``new_text[:end]`` ends right after the substituted
    content.  Returns ``None`` (after reporting an error) when an expected
    delimiter is missing or a template is still open at the end of ``text``.
    """
    # Mostly adapted from <includes/Parser.php>
    # Compiled once: only "{{" matters outside a template; inside one,
    # "}}" and "|" delimit its parts as well.
    opening = re.compile(r'({{)')
    delimiter = re.compile(r'({{|}}|\|)')
    # One dict per currently open template, innermost last.
    stack = []
    debug(u'Parsing templates')
    i = 0
    # End when the first found template ends, i.e. when stack is empty and
    # we're not at the beginning
    while (stack or i == 0) and i < len(text):
        pattern = delimiter if stack else opening
        debug(u"searching for %s" % pattern.pattern)
        # Search from position i instead of copying the tail of the text.
        m = pattern.search(text, i)
        if not m:
            error(u'pattern not found: %s' % pattern.pattern)
            return None
        i = m.start()
        token = m.group()
        debug(u'%s (%d, %d)' % (token, m.start(), m.end()))
        if token == "{{":
            debug(u"new template starting from %d" % i)
            start = i
            i = m.end()
            stack.append({'start': start, 'lastPartStart': i})
            continue
        # "|" or "}}": end of the current part of the innermost template.
        current = stack[-1]
        debug(u"part ending at %d" % i)
        part = text[current['lastPartStart']:i]
        if 'parts' not in current:
            # First part : template title.
            current['title'] = trim(part)
            current['parts'] = []
        else:
            # Argument, either "name=value" or positional.
            sep = part.find('=')
            if sep != -1:
                argname = part[:sep]
                arg = part[sep + 1:]
            else:
                argname = ''
                arg = part
            debug(u"part : (%s, %s)" % (argname, arg))
            current['parts'].append([argname, arg])
        i = m.end()
        current['lastPartStart'] = i
        if token == "}}":
            # Replace template.
            title = current['title'].lower()
            debug(u"template: %s" % title)
            callback = globalvar.known_templates.get(title, replace_template)
            piece = callback(current)
            debug("template %s replaced by %s" % (current['title'], piece))
            if piece is None:
                error("Found a null piece: probably unterminated %s template!" % title)
                # Substitute an empty string of the same type as text, so
                # the concatenation below cannot fail.
                piece = text[:0]
            start = current['start']
            text = text[:start] + piece + text[i:]
            # Resume right after the substituted content.
            i = start + len(piece)
            stack.pop()
    if stack:
        error('quote template not finished at end of text!')
        return None
    debug(u'Quote : %s' % text[:i])
    return [text, i]
def prepare_text(text):
    """Return ``text`` with wikilinks flattened and tags removed.

    ``[[target]]`` becomes ``target`` and ``[[target|label]]`` becomes
    ``label``.  A ``[[`` that does not start a well-formed link is left
    untouched.  Finally everything of the form ``<...>`` is removed.
    """
    # The link target may contain any character except control characters
    # and "<>[]{|}".  Non-ASCII characters are accepted, so that links
    # such as [[Molière]] are flattened too.
    wikilinksR = re.compile(
        r"^([^\x00-\x1f\x7f<>\[\]{|}]+)(?:\|(.+?))?\]\](.*)", re.DOTALL)
    tagsR = re.compile(r'<[^>]*>')
    chunks = text.split('[[')
    # All text before the first link is kept as is.
    result = [chunks[0]]
    for chunk in chunks[1:]:
        m = wikilinksR.match(chunk)
        if m:
            # [[1|2]]3 : keep the label if any, else the target, then
            # whatever follows the link.
            result.append(m.group(2) or m.group(1))
            result.append(m.group(3))
        else:
            result.append('[[' + chunk)
    # Remove tags.
    return tagsR.sub('', ''.join(result))
def parse_quote(text):
    """Parse the quote template at the start of ``text``.

    Template substitution is the only step; the result of
    ``parse_templates`` is returned unchanged.
    """
    substituted = parse_templates(text)
    return substituted
def workon(page):
    """Collect every quote of ``page`` into ``globalvar.quotes``.

    The main page is skipped, and so is any page whose text cannot be
    fetched.  Each occurrence of the quote template is parsed, and its
    text is stored under a key made of the quote reduced to its
    alphanumeric characters, as ``[key, [[quote, page], ...]]``.  Pages
    without any quote template are appended to ``globalvar.noquotes``.
    """
    if page.title() == globalvar.mainpagename:
        return
    outputq(u'handling : %s' % page.title())
    try:
        text = page.get()
    except Exception:
        # Best effort: pages that cannot be read are skipped, but no
        # longer at the price of swallowing KeyboardInterrupt/SystemExit.
        outputq(u'skipping %s: could not get the page text' % page.title())
        return
    text = prepare_text(text)
    name = globalvar.quote_template
    if not pywikibot.getSite().nocapitalize:
        # The first letter of a template name is case-insensitive.
        pattern = '[' + re.escape(name[0].upper()) + re.escape(name[0].lower()) + ']' + re.escape(name[1:])
    else:
        pattern = re.escape(name)
    r = re.compile(r'\{\{ *(?:[Tt]emplate:|[mM][sS][gG]:)?' + pattern)
    # re.UNICODE keeps accented letters: without it, Python 2 treats them
    # as non-alphanumeric and strips them from the comparison key.
    alnum = re.compile(r'\W', re.UNICODE)
    # Matches are located in the text as it is now; offset tracks how much
    # the text has shrunk since, through the substitutions below.
    offset = 0
    nquotes = 0
    for m in r.finditer(text):
        debug(u'match (200) : %s' % text[m.start():m.start() + 200])
        debug(u'offset: %d' % offset)
        nquotes += 1
        # Real start of the match.
        start = m.start() - offset
        tail = text[start:]
        parsed = parse_quote(tail)
        if parsed is None:
            # Already reported by the parser; the text is unchanged, so the
            # following matches can still be processed.
            error(u'skipping unparsable quote in %s' % page.title())
            continue
        endtext, qlen = parsed
        offset += len(tail) - len(endtext)
        quote = unicode(endtext[:qlen])
        text = text[:start] + endtext
        debug(u'Quote 2: %s' % (quote))
        debug(u'Text: %s' % text)
        # Strip all non-alphanumeric chars.
        squote = alnum.sub('', quote)
        # Key on the stripped quote itself rather than on its hash(): a
        # hash collision can then no longer abort the whole run.
        if squote not in globalvar.quotes:
            globalvar.quotes[squote] = [squote, [[quote, page]]]
        else:
            globalvar.quotes[squote][1].append([quote, page])
    if nquotes == 0:
        outputq(u"no quotes in %s" % page.title())
        globalvar.noquotes.append(page)
def generate_output(quotes):
    """Build the statistics report from the collected quotes.

    ``quotes`` maps a key to ``[stripped_quote, [[quote, page], ...]]``.

    Returns ``[output, tcount, narticles]``: ``tcount`` is the number of
    distinct quotes and ``narticles`` the number of articles holding at
    least one quote.  ``output`` is a %-format template: it contains one
    ``%s`` placeholder (meant for the update time) and every literal ``%``
    is doubled, so it must be %-formatted once before being published.
    """
    # We want to compute :
    # - The total number of (unique) quotes.
    # - The number of (unique) quotes per article.
    # - The number of duplicates.
    tcount = 0
    acount = {}
    dupcount = {}
    # We also want :
    # - to print similar-but-not-equal quotes.
    # - to print when the same quote appears twice in an article.
    outputq(u'quotes : %d' % len(quotes))
    for h in quotes:
        occurrences = quotes[h][1]
        # Unique quotes: exact quote text -> pages it appears on.
        uquotes = {}
        for q in occurrences:
            uquotes.setdefault(q[0], [])
        outputq(u'uquotes : %s' % uquotes)
        # Get associated pages.
        for q in occurrences:
            uquotes[q[0]].append(q[1])
        debug(u'increasing count by one')
        # Only increment by one, because all quotes stored under the same
        # key should be similar enough to be considered identical.
        tcount += 1
        # Are all quotes the same?  If not, list the variants.
        pr = len(uquotes) > 1
        if pr:
            outputq(u'Similar quotes found :')
        articles = 0
        for q in uquotes:
            if pr:
                outputq(u'\t%s (on:' % q, newline = False)
            for a in uquotes[q]:
                if pr:
                    outputq(u' %s' % trim(a.title()), newline = False)
                acount[a] = acount.get(a, 0) + 1
                articles += 1
            if pr:
                outputq(u')')
        dupcount[articles] = dupcount.get(articles, 0) + 1
    outputq(u'TOTAL NUMBER OF QUOTES : %d' % tcount)
    output = msg[globalvar.lang][0] % (tcount, len(acount))
    # Left unformatted on purpose: this is the placeholder for the update
    # time, and '%%' becomes '%' when the caller fills it in.
    output += '<small style="font-size: 70%%">(%s)</small></h1>\n'
    output += msg[globalvar.lang][1]
    output += '\n\n'
    # Sort by number of quotes, then by title (return list of tuples, not
    # dict)
    acount = sorted(acount.items(), lambda x,y: cmp(y[1], x[1]) or locale.strcoll(x[0].title().lower(), y[0].title().lower()))
    output += msg[globalvar.lang][2]
    output += msg[globalvar.lang][3]
    for a in acount:
        outputq(u'adding to table: %s' % a[0].title())
        # Titles are escaped so that a '%' in a title cannot break the
        # final %-formatting of the report.
        output += '|-\n| [[%s]] || %d\n' % (a[0].title().replace('%', '%%'), a[1])
    for p in globalvar.noquotes:
        output += '|-\n| [[%s]] || 0\n' % p.title().replace('%', '%%')
    mean = 0
    if len(acount) != 0:
        mean = tcount / float(len(acount))
    output += msg[globalvar.lang][4] % mean
    output += '|}\n\n'
    output += msg[globalvar.lang][5]
    output += msg[globalvar.lang][6]
    output += msg[globalvar.lang][7]
    output += msg[globalvar.lang][8]
    # Sorted, so that the table rows come out in a stable order.
    for a in sorted(dupcount):
        output += '|-\n|%d || %d\n' % (a, dupcount[a])
    output += '|}\n\n'
    return [output, tcount, len(acount)]
class Main(object):
    """Command-line driver: parse the options, walk the selected pages and
    publish (or print) the resulting statistics."""
    # Options
    __start = None
    __number = None
    ## Which page generator to use
    __workonnew = False
    __catname = None
    __catrecurse = False
    __linkpagetitle = None
    __refpagetitle = None
    __textfile = None
    __pagetitles = []
    __output = None
    __outputprefix = None
    __outputarticles = None
    __outputquotes = None

    def __init__(self):
        # Per-instance list, so that titles never accumulate on the class.
        self.__pagetitles = []

    @staticmethod
    def _split_option(arg):
        """Split ``-name:value`` into ``(name, value)``.

        ``value`` is empty when ``arg`` holds no colon or nothing after
        it.  Only the first colon separates, so values may contain colons.
        """
        name, _, value = arg.partition(':')
        return name, value

    def parse(self):
        """Read the command-line arguments into the option attributes.

        An option given without a value is asked for interactively.
        Arguments that are not recognized are taken as page titles.
        """
        ask = pywikibot.input
        for arg in pywikibot.handleArgs():
            # Splitting on the colon replaces the hand-counted slices,
            # several of which left a leading ':' in the value.
            name, value = self._split_option(arg)
            if name == '-ref':
                self.__refpagetitle = value or ask(u'Links to which page should be processed?')
            elif name == '-cat':
                self.__catname = value or ask(u'Please enter the category name:')
            elif name == '-subcat':
                self.__catrecurse = True
            elif name == '-links':
                self.__linkpagetitle = value or ask(u'Links from which page should be processed?')
            elif name == '-file':
                self.__textfile = value or ask(u'File to read pages from?')
            elif arg == '-new':
                self.__workonnew = True
            elif name == '-start':
                self.__start = value or ask(u'Which page to start from: ')
            elif name == '-number':
                self.__number = int(value or ask(u'Number of pages to parse: '))
            elif name == '-output':
                self.__output = value or ask(u'Which page to save report to: ')
            elif name == '-outputprefix':
                self.__outputprefix = value or ask(u'Which page to save templates to: ')
            elif name == '-outputquotes':
                self.__outputquotes = value or ask(u'Which page to save number of quotes to: ')
            elif name == '-outputarticles':
                self.__outputarticles = value or ask(u'Which page to save number of articles to: ')
            elif name == '-template':
                globalvar.quote_template = value or ask(u'Which template is used for quotes?')
            elif name == '-arg':
                globalvar.quote_arg = value or ask(u'Which template arg is used for quotes?')
            elif arg == '-quiet':
                globalvar.quiet = True
            elif arg == '-debug':
                globalvar.debug = True
            else:
                self.__pagetitles.append(arg)

    def generator(self):
        """Return the page generator selected by the options.

        Precedence: -new, -ref, -links, -cat, -file; otherwise all pages
        are walked alphabetically from -start (default '!').
        """
        # Choose which generator to use according to options.
        site = pywikibot.getSite()
        if self.__workonnew:
            if not self.__number:
                self.__number = config.special_page_limit
            return pagegenerators.NewpagesPageGenerator(number = self.__number)
        if self.__refpagetitle:
            refpage = pywikibot.Page(site, self.__refpagetitle)
            return pagegenerators.ReferringPageGenerator(refpage)
        if self.__linkpagetitle:
            linkpage = pywikibot.Page(site, self.__linkpagetitle)
            return pagegenerators.LinkedPageGenerator(linkpage)
        if self.__catname:
            cat = catlib.Category(site, 'Category:%s' % self.__catname)
            if self.__start:
                return pagegenerators.CategorizedPageGenerator(cat, recurse = self.__catrecurse, start = self.__start)
            return pagegenerators.CategorizedPageGenerator(cat, recurse = self.__catrecurse)
        if self.__textfile:
            return pagegenerators.TextfilePageGenerator(self.__textfile)
        if not self.__start:
            self.__start = '!'
        startpage = pywikibot.Page(site, self.__start)
        namespace = startpage.namespace()
        start = startpage.title(withNamespace=False)
        return pagegenerators.AllpagesPageGenerator(start, namespace, includeredirects=False)

    def main(self):
        """Run the bot: parse options, process pages, publish the results."""
        # Parse command line options
        self.parse()
        site = pywikibot.getSite()
        # ensure that we don't try to change main page
        globalvar.mainpagename = pywikibot.Page(pywikibot.Link("Main Page", site)).title(withNamespace=False)
        if site.language() in msg:
            globalvar.lang = site.language()
        else:
            globalvar.lang = 'en'
        if self.__pagetitles:
            # Explicit page titles take precedence over any generator.
            pages = []
            for p in self.__pagetitles:
                try:
                    pages.append(pywikibot.Page(site, p))
                except pywikibot.NoPage:
                    pass
            source = iter(pages)
        else:
            source = self.generator()
        for page in pagegenerators.PreloadingGenerator(source):
            workon(page)
        [output, nquotes, npages] = generate_output(globalvar.quotes)
        # The generated report is a %-template: insert the update time
        # before printing, comparing or saving it.
        try:
            report = output % time.strftime('%d/%m/%y / %H:00')
        except (TypeError, ValueError):
            # Happens when the template holds a stray '%'.
            error(u'Inserting the update time failed, generated output was:\n%s' % output)
            report = None
        if report is None:
            pass
        elif self.__output:
            try:
                outputpage = pywikibot.Page(site, self.__output)
                if outputpage.exists():
                    oldtext = outputpage.get()
                else:
                    oldtext = u''
                if oldtext != report:
                    outputpage.put(report, comment = msg[globalvar.lang][9])
            except Exception:
                # Best effort: report the failure together with the text
                # that could not be saved, then go on with the counters.
                error(u'Getting/Modifying page %s failed, generated output was:\n%s' % (self.__output, report))
        else:
            pywikibot.output(report)
        nquotespage = None
        narticlespage = None
        if self.__outputprefix:
            # Counters are saved on pages named <prefix><name>.
            pname = self.__outputquotes or "NUMBEROFQUOTES"
            nquotespage = pywikibot.Page(site, u'%s%s' % (self.__outputprefix, pname))
            pname = self.__outputarticles or "NUMBEROFARTICLES"
            narticlespage = pywikibot.Page(site, u'%s%s' % (self.__outputprefix, pname))
        else:
            if self.__outputquotes:
                nquotespage = pywikibot.Page(site, u'%s' % self.__outputquotes)
            if self.__outputarticles:
                narticlespage = pywikibot.Page(site, u'%s' % self.__outputarticles)
        if nquotespage:
            # April fool (replace %d by %s too)
            #nquotes = hex(nquotes)
            nquotespage.put("%d" % nquotes, comment = msg[globalvar.lang][9])
        if narticlespage:
            # Same as above
            #npages = hex(npages)
            narticlespage.put("%d" % npages, comment = msg[globalvar.lang][9])
# The single shared settings object used by every function of this script.
globalvar = Global()

# The bot only runs when the file is executed as a script, but pywikibot is
# told to stop in every case, even after an error.
try:
    if __name__ == "__main__":
        bot = Main()
        bot.main()
finally:
    pywikibot.stopme()