« Utilisateur:RimBot/interproject.py » : différence entre les versions
Contenu supprimé Contenu ajouté
ajout du code du bot interprojet
(Aucune différence)
|
Version du 14 mars 2012 à 13:06
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Script to check interproject links for general pages.

This works by downloading the page, and checking if a page with the same name
exists on other wikis (list specified on command line).

# TODO
# - Check redirects to the original page
# - Handle disambig pages on remote projects

This script understands various command-line arguments:

    -cat        Work on all pages which are in a specific category.
                Argument can also be given as "-cat:categoryname".

    -ref        Work on all pages that link to a certain page.
                Argument can also be given as "-ref:referredpagetitle".

    -links      Work on all pages that are linked from a certain page.
                Argument can also be given as "-links:linkingpagetitle".

    -new        Work on the most recent new pages on the wiki

    -subcat     When the pages to work on have been chosen by -cat, pages in
                subcategories of the selected category are also included.
                When -cat has not been selected, this has no effect.

    -file:      used as -file:filename, read a list of pages to treat from
                the named file

    -start:     used as -start:title, specifies that the robot should go
                alphabetically through all pages on the home wiki, starting
                at the named page.

    -select:    ask for *every* link whether it should be included or not

    -ask:       ask before any change is made to the wiki

    -dry:       do not change wiki, just print what would have been done

    -autonomous run the script in autonomous mode : ask no question, only
                check if the current interproject links are valid (and if
                not, remove them)

    -compare:   used as -compare:project, check that for every interproject
                linking to project, there is a link back and dump a list of
                pages that miss this link

    -output:    used as -output:pagename, output the result of -compare into
                this pagename
"""

import wikipedia, pagegenerators, catlib, config
import sys, re, string, difflib

# Edit summaries: (prefix, adding, removing, modifying)
msg = {
    'en': (u'robot: ',
           u'adding interproject links',
           u'removing interproject links',
           u'modifying interproject links'),
    'fr': (u'robot : ',
           u'ajoute les liens interprojets',
           u'retire les liens interprojets',
           u'modifie les liens interprojets'),
}


class Global(object):
    """Container class for global settings.

    Use of globals outside of this is to be avoided.
    """
    select = False        # -select: ask for every link
    ask = False           # -ask: confirm before saving
    dry = False           # -dry: never write to the wiki
    autonomous = False    # -autonomous: no questions asked
    compare = None        # -compare:project
    outputpage = None     # -output:pagename (used with -compare)
    output = ""           # accumulated wikitable for -compare
    noip = ""             # accumulated rows for pages with no interproject
    sites = {}            # project code -> wikipedia.Site
    siblings = []         # project codes to scan for same-named pages
    mainpagename = None   # main page title, never edited


def check_backlink(page, links):
    """For -compare mode: check that the linked Wikipedia article links back.

    Appends a wikitable row to globalvar.output (or globalvar.noip when the
    page has no Wikipedia interproject at all).
    """
    if not 'w' in links and not 'wikipedia' in links:
        # No Wikipedia link at all: record the page in the "no interproject"
        # table, noting whether it has any other interproject link.
        globalvar.noip += u'|-\n|[[%s]] || {{non}} || ' % page.title()
        if links:
            globalvar.noip += u'{{oui}}\n'
        else:
            globalvar.noip += u'{{non}}\n'
        return
    if 'wikipedia' in links:
        links['w'] = links['wikipedia']
    rpage = links['w'][0]
    # Make sure it's a recent version we're getting.
    rpage.site().forceLogin()
    site = wikipedia.getSite()
    path = rpage.site().get_address(rpage.urlname())
    text = rpage.site().getUrl(path)
    globalvar.output += u'|-\n|[[%s]]||[[:%s:%s]]' % (
        page.title(), rpage.site().sitename(), rpage.title())
    # Look for links from the remote page back to our wiki in the raw HTML.
    interprojectR = re.compile(r'<a href="//%s%s([^"]+)"'
                               % (site.hostname(), site.nice_get_address('')))
    matches = interprojectR.findall(text)
    if not matches:
        globalvar.output += u'|| {{non}} || {{non}}\n'
        return
    else:
        globalvar.output += u'|| {{oui}}'
    # Does any of those links point back to this very page?
    backlink = False
    for m in matches:
        if m == page.urlname():
            backlink = True
    if not backlink:
        globalvar.output += u'|| {{non}}\n'
    else:
        globalvar.output += u'|| {{oui}}\n'


def parse_interproject(title, text):
    """Extract existing {{interprojet}} parameters and strip the template.

    Returns (projects, text) where projects is the merged list of template
    parameters (e.g. ['q', 'b=Foo']) and text no longer contains the
    template(s).
    """
    Rtmpl = re.compile(r'{{interprojet(.*?)}}',
                       re.IGNORECASE | re.MULTILINE | re.DOTALL)
    i = Rtmpl.finditer(text)
    # List of parameter lists, one per template occurrence found.
    lists = []
    # Running offset: how many characters have been removed from text so far,
    # so match positions (computed on the original text) stay valid.
    offset = 0
    for m in i:
        if lists:
            wikipedia.output(
                u'WARNING: %s includes {{interprojet}} more than once!'
                % title)
        s = m.group(1)
        # Cleanup : remove unneeded whitespaces, pipes and newlines
        s = s.replace("\n", "")
        s = s.replace("\r", "")
        nowhite = re.compile(r'[\s|]*([=|])[\s]*')
        s = nowhite.sub('\\1', s)
        projects = s.split('|')
        lists.append(projects)
        text = text[:m.start() - offset] + text[m.end() - offset:]
        offset += (m.end() - m.start())
    # Merge lists (linear time) while removing duplicates.
    d = {}
    for s in lists:
        for x in s:
            d[x] = 1
    projects = d.keys()
    del d
    return (projects, text)


def check_interprojects(title, projects, links):
    """Check that the pages named in the template parameters do exist.

    Existing targets are appended to links (project code -> [Page]);
    nonexistent ones are dropped (after asking, unless in -compare or
    -autonomous mode).  Returns the list of non-link flags to keep.
    """
    flags = []
    site = wikipedia.getSite()
    for project in projects:
        if not project:
            continue
        if (project.startswith('nolink')) or (project.startswith('etiq')):
            # Remove old, deprecated tags.
            # flags.append(project)
            continue
        # get explicit page name if given ("|q=Some page")
        val = None
        l = project.find('=')
        if l != -1:
            val = project[l+1:]
            project = project[:l]
        else:
            val = title
        # "|code=" is ignored by the template, so ignore it (and remove it)
        if not val:
            continue
        # "|q-en" style: project code, dash, language code.
        l = project.find('-')
        if l != -1:
            pproject = project[:l]
            lang = project[l+1:]
        else:
            pproject = project
            lang = site.lang
        if not pproject in site.family.known_families:
            wikipedia.output(
                u'WARNING: %s has interproject link to unknown %s project'
                % (title, project))
            continue
        family = site.family.known_families[pproject]
        if not project in globalvar.sites:
            # meta and commons are monolingual: the "language" is the family.
            if family in ['meta', 'commons']:
                lang = family
            try:
                globalvar.sites[project] = wikipedia.getSite(fam=family,
                                                             code=lang)
            except ValueError:
                wikipedia.output(
                    u'WARNING: %s has interproject link to known, but unimplemented family : %s'
                    % (title, project))
                continue
        rpage = wikipedia.Page(globalvar.sites[project], val)
        if not rpage.exists():
            if globalvar.compare or globalvar.autonomous:
                continue
            c = wikipedia.inputChoice(
                u'WARNING: %s has link to nonexisting page [[%s:%s]]. Remove it?'
                % (title, project, val), ['Yes', 'No'], ['y', 'n'])
            if c == 'y':
                continue
        if rpage.isRedirectPage():
            rpage = rpage.getRedirectTarget()
        if (project in links) and (rpage in links[project]):
            continue
        wikipedia.output(u'Adding new interproject to %s (%s)'
                         % (project, val))
        try:
            links[project].append(rpage)
        except KeyError:
            links[project] = [rpage]
    return flags


def check_projects(title, links):
    """Look on every sibling project for a page with the same name.

    FIXME: (or with names redirecting on this one on local wikis)
    """
    for s in globalvar.siblings:
        rpage = wikipedia.Page(globalvar.sites[s], title)
        if rpage.isRedirectPage():
            rpage = rpage.getRedirectTarget()
        if (s in links) and (rpage in links[s]):
            wikipedia.output(u'%s already in %s links' % (title, s))
            continue
        if rpage.exists():
            wikipedia.output(u'Adding new interproject to %s (%s)'
                             % (s, rpage.title()))
            try:
                links[s].append(rpage)
            except KeyError:
                links[s] = [rpage]


def choose_links(title, links):
    """Choose which link should be added and which should not.

    Narrows each project's candidate list down to a single page, asking the
    user when there is more than one candidate (or when -select is on).
    """
    for proj in links:
        l = links[proj]
        # Exclude non-article pages (no wikilink in their text, or pages that
        # cannot be fetched at all).
        # BUG FIX: iterate over a copy; removing from the list being iterated
        # skips the element following each removal.
        for p in l[:]:
            try:
                if p.get().find('[[') == -1:
                    l.remove(p)
            except Exception:
                l.remove(p)
        if (len(l) <= 1) and not globalvar.select:
            if len(l) == 1:
                links[proj] = [l[0]]
            continue
        wikipedia.output("The following pages have been found for %s" % title)
        i = 0
        for p in l:
            i += 1
            wikipedia.output(u" (%d) %s" % (i, p.aslink(True)))
        if globalvar.autonomous:
            wikipedia.output("More than one page found, not changing articles")
            continue
        while True:
            c = wikipedia.input(
                u"Which page should be used [number, (n)one]? :")
            if c:
                if c == 'n':
                    rpage = None
                    break
                elif c.isdigit():
                    c = int(c)
                    try:
                        rpage = l[c-1]
                    except IndexError:
                        pass
                    else:
                        break
        # Use a list to make it easy to compare with oldlinks.
        links[proj] = [rpage]


def generate_interproject(links, flags):
    """Build the {{interprojet}} template text from links and flags.

    Returns "" when there are no links at all, None when no usable link
    remains, otherwise the full template wikitext.
    """
    if not links:
        return ""
    tmpl = "{{interprojet"
    for s in flags:
        tmpl += "|%s" % s
    i = 0
    for s in links:
        if not links[s] or not links[s][0]:
            continue
        i = i + 1
        tmpl += "|%s=%s" % (s, links[s][0].title())
    if i == 0:
        tmpl = None
    else:
        tmpl += "}}"
    return tmpl


def getDefaultSort(text):
    """Return the {{DEFAULTSORT}}/{{CLEDETRI}} template found in text
    (followed by CRLF), or "" when there is none."""
    Rtmpl = re.compile(r'(\{\{ *?(DEFAULTSORT|CLEDETRI|CLEFDETRI)\s*:.+? *}})',
                       re.DOTALL)
    match = Rtmpl.search(text)
    if match:
        return match.group(1) + '\r\n'
    else:
        return ""


def removeDefaultSort(text):
    """Strip the {{DEFAULTSORT}}/{{CLEDETRI}} template from text, leaving
    nowiki/comment/math/pre sections untouched."""
    Rtmpl = re.compile(r'\{\{ *?(DEFAULTSORT|CLEDETRI|CLEFDETRI)\s*:.+? *}}',
                       re.DOTALL)
    text = wikipedia.replaceExcept(text, Rtmpl, '',
                                   ['nowiki', 'comment', 'math', 'pre'])
    return text.strip()


def commit(page, text, interproject, oldtext):
    """Reassemble the page text with the new interproject template.

    The template is inserted before the default-sort key, categories and
    interwikis, which are removed and re-appended in canonical order.
    Returns [text, diff] where diff is a unified diff against oldtext
    (empty string when nothing changed).
    """
    site = wikipedia.getSite()
    categories = wikipedia.getCategoryLinks(text, site=site)
    defaultsort = getDefaultSort(text)
    interwiki = wikipedia.getLanguageLinks(text)
    text = wikipedia.removeCategoryLinks(text, site)
    text = wikipedia.removeLanguageLinks(text, site)
    text = removeDefaultSort(text)
    if interproject:
        text = text + '\r\n\r\n' + interproject
    text = wikipedia.replaceCategoryLinks(text, categories, site=site,
                                          prepend=defaultsort)
    text = wikipedia.replaceLanguageLinks(text, interwiki, site=site)
    diff = ""
    for l in difflib.unified_diff(oldtext.splitlines(), text.splitlines()):
        diff += l + '\n'
    wikipedia.output(u'%s' % diff)
    return [text, diff]


def workon(page):
    """Process one page: parse, check and update its interproject links."""
    if page.title() == globalvar.mainpagename:
        return
    # Redirects point to pages which would be handled at some point
    # anyway, so ignore them.
    if page.isRedirectPage():
        return
    wikipedia.output(u'handling %s' % page.title())
    try:
        text = page.get()
    except wikipedia.IsRedirectPage:
        pagename = page.getRedirectTarget()
        page = wikipedia.Page(page.site(), pagename)
        try:
            text = page.get()
        except wikipedia.NoPage:
            wikipedia.output(u'Broken redirect to %s' % pagename)
            return
    # Hash, key is project link code ('q', 'n'...)
    links = {}
    (projects, text) = parse_interproject(page.title(), text)
    flags = check_interprojects(page.title(), projects, links)
    if globalvar.compare:
        check_backlink(page, links)
        return
    oldlinks = links.copy()
    if not globalvar.autonomous:
        check_projects(page.title(), links)
    choose_links(page.title(), links)
    interproject = generate_interproject(links, flags)
    [text, diff] = commit(page, text, interproject, page.get())
    # Generate edit summary
    lang = page.site().lang
    if not projects:
        summary = wikipedia.translate(lang, msg)[1]
    else:
        if not links:
            summary = wikipedia.translate(lang, msg)[2]
        else:
            summary = wikipedia.translate(lang, msg)[3]
    wikipedia.output(u'diff : %s' % diff)
    if diff:
        wikipedia.output(u'diff!')
    else:
        wikipedia.output(u'notdiff!')
    # Do not do unneeded changes in autonomous mode.
    if (globalvar.autonomous and (oldlinks != links)) or diff:
        if globalvar.ask:
            answer = wikipedia.inputChoice(u'Submit?', ['Yes', 'No'],
                                           ['y', 'n'])
            if answer == 'n':
                return
        # BUG FIX: honour -dry, which was parsed but never checked.  The
        # diff has already been printed above.
        if globalvar.dry:
            return
        try:
            page.put(text,
                     comment=wikipedia.translate(lang, msg)[0] + summary)
        except wikipedia.LockedPage:
            wikipedia.output(u'Page %s is locked. Skipping.' % page.title())


class Main(object):
    """Command-line driver: option parsing, page generation, main loop."""

    # Options
    __start = None
    __number = None
    ## Which page generator to use
    __workonnew = False
    __catname = None
    __catrecurse = False
    __linkpagetitle = None
    __refpagetitle = None
    __textfile = None
    __pagetitles = []

    def parse(self):
        """Parse command line options, filling self.__* and globalvar."""
        # NOTE: the bare-option forms (e.g. "-file" without ":...") prompt
        # interactively; the original matched "-file:" AND len(arg) == 5,
        # which made the prompt branch unreachable.  Fixed by matching the
        # option name without the colon.
        for arg in wikipedia.handleArgs():
            if arg.startswith('-ref'):
                if len(arg) == 4:
                    self.__refpagetitle = wikipedia.input(
                        u'Links to which page should be processed?')
                else:
                    self.__refpagetitle = arg[5:]
            elif arg.startswith('-subcat'):
                self.__catrecurse = True
            elif arg.startswith('-cat'):
                if len(arg) == 4:
                    self.__catname = wikipedia.input(
                        u'Please enter the category name:')
                else:
                    self.__catname = arg[5:]
            elif arg.startswith('-links'):
                if len(arg) == 6:
                    self.__linkpagetitle = wikipedia.input(
                        u'Links from which page should be processed?')
                else:
                    self.__linkpagetitle = arg[7:]
            elif arg.startswith('-file'):
                if len(arg) == 5:
                    self.__textfile = wikipedia.input(
                        u'File to read pages from?')
                else:
                    self.__textfile = arg[6:]
            elif arg == '-new':
                self.__workonnew = True
            elif arg.startswith('-start'):
                if len(arg) == 6:
                    self.__start = wikipedia.input(
                        u'Which page to start from: ')
                else:
                    self.__start = arg[7:]
            elif arg.startswith('-number'):
                if len(arg) == 7:
                    self.__number = int(wikipedia.input(
                        u'Number of pages to parse: '))
                else:
                    self.__number = int(arg[8:])
            elif arg == '-select':
                globalvar.select = True
            elif arg == '-ask':
                globalvar.ask = True
            elif arg == '-dry':
                globalvar.dry = True
            elif arg == '-autonomous':
                globalvar.autonomous = True
            elif arg.startswith('-compare'):
                if len(arg) == 8:
                    globalvar.compare = wikipedia.input(
                        u'Project to compare to: ')
                else:
                    globalvar.compare = arg[9:]
            elif arg.startswith('-output'):
                if len(arg) == 7:
                    globalvar.outputpage = wikipedia.input(
                        u'Page to print output to: ')
                else:
                    globalvar.outputpage = arg[8:]
            else:
                self.__pagetitles.append(arg)

    def generator(self):
        """Choose which page generator to use according to options."""
        pagegen = None
        if self.__workonnew:
            if not self.__number:
                self.__number = config.special_page_limit
            pagegen = pagegenerators.NewpagesPageGenerator(
                number=self.__number)
        elif self.__refpagetitle:
            refpage = wikipedia.Page(wikipedia.getSite(),
                                     self.__refpagetitle)
            pagegen = pagegenerators.ReferringPageGenerator(refpage)
        elif self.__linkpagetitle:
            linkpage = wikipedia.Page(wikipedia.getSite(),
                                      self.__linkpagetitle)
            pagegen = pagegenerators.LinkedPageGenerator(linkpage)
        elif self.__catname:
            cat = catlib.Category(wikipedia.getSite(),
                                  'Category:%s' % self.__catname)
            if self.__start:
                pagegen = pagegenerators.CategorizedPageGenerator(
                    cat, recurse=self.__catrecurse, start=self.__start)
            else:
                pagegen = pagegenerators.CategorizedPageGenerator(
                    cat, recurse=self.__catrecurse)
        elif self.__textfile:
            pagegen = pagegenerators.TextfilePageGenerator(self.__textfile)
        else:
            # Default: walk all pages alphabetically from -start (or '!').
            if not self.__start:
                self.__start = '!'
            namespace = wikipedia.Page(wikipedia.getSite(),
                                       self.__start).namespace()
            start = wikipedia.Page(wikipedia.getSite(),
                                   self.__start).titleWithoutNamespace()
            pagegen = pagegenerators.AllpagesPageGenerator(start, namespace)
        return pagegen

    def getSites(self, site):
        """Get Site objects for all projects, since we are going to need them
        no matter what.

        NOTE: we are excluding wikinews because it doesn't make sense to look
        for the same names in wikinews AFAICS.
        FIXME: right now, we are excluding wiktionary because it's
        case-sensitive.  Handle that.
        """
        # Restricting siblings to globalvar.compare would be a bad idea if
        # you want it to be able to show the "has interprojects?" column.
        # if globalvar.compare:
        #     globalvar.siblings = [globalvar.compare]
        # else:
        globalvar.siblings = ['b', 'commons', 's', 'w']
        for s in globalvar.siblings:
            if s == 'commons':
                code = 'commons'
            else:
                code = site.lang
            globalvar.sites[s] = wikipedia.getSite(
                code, site.family.known_families[s])

    def main(self):
        """Entry point: set up, iterate over pages, dump -compare output."""
        wikipedia.setLogfileStatus(True, 'interproject.log')
        # ensure that we don't try to change main page
        try:
            site = wikipedia.getSite()
            globalvar.mainpagename = site.family.mainpages[site.language()]
        except Exception:
            wikipedia.output(u'Missing main page name')
        # Parse command line options
        self.parse()
        # Fill globalvar.sites
        self.getSites(site)
        pagegen = self.generator()
        generator = None
        if self.__pagetitles:
            # Explicit page titles on the command line override the generator.
            pages = []
            for p in self.__pagetitles:
                try:
                    pages.append(wikipedia.Page(wikipedia.getSite(), p))
                except wikipedia.NoPage:
                    pass
            generator = pagegenerators.PreloadingGenerator(iter(pages))
        else:
            generator = pagegenerators.PreloadingGenerator(pagegen)
        for page in generator:
            workon(page)
        if globalvar.compare:
            # Add wikipedia comparison
            globalvar.output = ("{|\n|-\n!Article||Article Wikipedia||Lien vers Wikiquote ?||Lien vers l'article correspondant ?\n"
                                + globalvar.output)
            globalvar.output += "|}\n"
            # Add articles without interprojects
            globalvar.output += "\n== Articles sans interprojet(s) ==\n\n"
            globalvar.output += "{|\n|-\n!Article||Lien vers Wikipedia ?||Lien interprojet ?\n"
            globalvar.output += globalvar.noip
            globalvar.output += "|}\n"
            try:
                outputpage = wikipedia.Page(site, globalvar.outputpage)
                outputpage.put(globalvar.output,
                               comment="Mise a jour de la liste")
            except Exception:
                wikipedia.output(
                    u'Getting/Modifying page %s failed, generated output was:\n%s'
                    % (globalvar.outputpage, globalvar.output))


globalvar = Global()

try:
    if __name__ == "__main__":
        Main().main()
finally:
    wikipedia.stopme()