Jump to content

User:CobraBot/Code2

From Wikipedia, the free encyclopedia
#!/usr/bin/env python
# -*- coding: utf-8  -*-

import wikipedia
import pagegenerators
import re
import warnings
from time import sleep
from contextlib import closing
from sys import stdout
from oclc import isbn2oclc, ParsingProblem, NotInOclc
from congress import isbn2classes, NotInLoc
from json import dump, load

# This is required for the text that is shown when you run this script
# with the parameter -help.
docuReplacements = {
    '&params;': pagegenerators.parameterHelp
}

# Namespace prefix for template titles. NOTE(review): nothing in the visible
# source reads this constant; pagesUsingTemplate asks SITE for the name of
# namespace 10 instead.
TEMPLATE_PREFIX = u"Template:"
# Site object fetched once, at import time, via wikipedia.getSite(); used by
# pagesUsingTemplate to build the template page.
SITE = wikipedia.getSite()
def pagesUsingTemplate(templateName):
    """Return a page generator over the pages that transclude a template.

    templateName is the template's title without a namespace prefix; the
    prefix is whatever SITE reports as the name of namespace 10. Only
    template inclusions are requested from the referring-page generator.
    """
    titleParts = (unicode(SITE.namespace(10)), templateName)
    templatePage = wikipedia.Page(SITE, u":".join(titleParts))
    return pagegenerators.ReferringPageGenerator(
        templatePage, onlyTemplateInclusion=True)

class BailOut(StandardError):
    """Immediately stop processing the current page.

    Raised with a human-readable reason as its single argument.
    CobraBot.treat catches it, writes the page title and the reason to the
    skip log, and returns without editing the page.
    """

class AlreadyFilled(BailOut):
    """Field already filled in.

    Raised by CobraBot.checkForField when the requested infobox field has a
    non-empty value. CobraBot.treat catches it per field, so a filled field
    only disables that one addition rather than skipping the whole page.
    """

class CobraBot(object):
    """Bot that adds Dewey Decimal and Library of Congress classifications
    to book infoboxes, looked up from the ISBN already given in the infobox.

    Pages come either from the transclusions of BOOK_INFOBOX (run) or from
    titles typed at a prompt (runManual); treat does the work for one page.
    Pages skipped for a known reason are appended to skipped.log.
    """

    # EDIT_SUMMARY = u'Adding [[OCLC]]# to book infobox based on [[ISBN]] ([[User:CobraBot|CobraBot]]; PLEASE [[User talk:CobraBot|report any problems]])'
    EDIT_SUMMARY = u'Adding [[Dewey Decimal Classification|Dewey Decimal]] and/or [[Library of Congress Classification|LCC]] to book infobox based on [[ISBN]] ([[User:CobraBot|CobraBot]]; PLEASE [[User talk:CobraBot|report any problems]])'
    # Template whose transclusions run() works through.
    BOOK_INFOBOX = u"Infobox Book"
    # Dash-like characters stripped from an ISBN before it is looked up.
    DASHES = [u'-', u'‒', u'–', u'—', u'―']
    # A field value ends at the next pipe or at the next closing braces.
    TERMINATOR = re.compile(u"(}})|\\|")
    # Opening of a book/books/novel infobox, in any letter case.
    INFOBOX_START = re.compile(u"\\{\\{[ \t\n]*infobox[ _]((book(s)?)|(novel))", re.IGNORECASE)
    # Regex template for "| <name> =" including the surrounding whitespace.
    GENERIC_PARAM = u"\\|[ \t\n]*%s[ \t\n]*=[ \t\n]*"
    # OCLC_PARAM = GENERIC_PARAM % u"oclc"
    # Infobox field names. The same names are used to detect an existing
    # field and to insert a new one, so the two can never disagree.
    DEWEY_FIELD = u"dewey"
    LOC_FIELD = u"congress"
    DEWEY_PARAM = GENERIC_PARAM % DEWEY_FIELD
    LOC_PARAM = GENERIC_PARAM % LOC_FIELD
    # ISBNs shorter than this (after dash removal) are rejected as malformed.
    ISBN_MIN_LEN = 10
    # JSON file holding the number of generator pages to skip on start-up.
    OFFSET_FILE = 'N.json'
    # How far behind the last treated page the saved offset is placed.
    # NOTE(review): presumably so the last few pages are revisited on the
    # next run -- confirm.
    OFFSET_BACKOFF = 5
    # Dewey value that is not written to the infobox.
    JUST_FIC = "[Fic]"

    def __init__(self, automatic, debug):
        """
        Constructor. Parameters:
            * automatic - If True, pages are saved without showing a diff or
                          asking for confirmation.
            * debug     - If True, doesn't do any real changes, but only shows
                          what would have been changed.

        Opens skipped.log for appending and sets the edit summary.
        """
        self.generator = pagesUsingTemplate(self.BOOK_INFOBOX)
        self.debug = debug
        self.editCount = 0
        self.log = open("skipped.log", 'a')
        self.log.write("BEGIN NEW SESSION\n")
        self.automatic = automatic
        wikipedia.setAction(self.EDIT_SUMMARY)

    def run(self):
        """
        Treat every page from the generator, resuming at the saved offset.

        The offset N is read from OFFSET_FILE and that many pages are skipped
        before work starts. When the run ends, normally or through an
        exception, the skip log is closed and, if at least one page was
        handed to treat, a new offset (N plus the index of the last page,
        minus OFFSET_BACKOFF, never below zero) is written back. If no page
        was treated the offset file is left untouched.
        """
        pageIndex = None  # stays None until a page is handed to treat()
        N = None
        try:
            with closing(open(self.OFFSET_FILE, 'r')) as f:
                N = load(f)
            print("Advancing by %s..." % N)
            stdout.flush()
            try:
                for _ in range(N):
                    next(self.generator)
            except StopIteration:
                print("Ran out of pages while advancing; nothing left to do.")
                stdout.flush()
                return
            print("Done advancing!")
            stdout.flush()
            for pageIndex, page in enumerate(self.generator):
                self.treat(page, pageIndex)
        finally:
            self.log.close()
            # Only rewrite the offset once there is something to record;
            # opening the file for writing truncates it.
            if pageIndex is not None:
                newOffset = max(0, N + pageIndex - self.OFFSET_BACKOFF)
                with closing(open(self.OFFSET_FILE, 'w')) as f:
                    dump(newOffset, f)

    def runManual(self):
        """
        Prompt for page titles and treat each one until input ends.

        Titles are read from standard input and decoded as UTF-8. The loop
        stops at end-of-file; the skip log is closed on the way out.
        """
        index = 0
        try:
            while True:
                try:
                    title = raw_input("Page: ").decode('utf8')
                except EOFError:
                    break
                self.treat(wikipedia.Page(None, title), index)
                index += 1
        finally:
            self.log.close()

    #########
    def _bail(self, message):
        """Show message on the console and raise BailOut carrying it."""
        wikipedia.output(u"%s" % message)
        raise BailOut(message)

    def partition(self, text):
        """
        Split page text into (text before infobox, infobox, text after).

        The infobox runs from the first INFOBOX_START match up to and
        including the first closing braces after it, so a template nested
        inside the infobox ends the returned box early.

        Raises BailOut if no infobox start is found or it is never closed.
        """
        boxmatch = self.INFOBOX_START.search(text)
        if not boxmatch:
            self._bail(u"SKIPPING: Page either uses 'Book infobox' alias or is false positive")

        boxStart = boxmatch.start()
        endMatch = re.search(u"\\}\\}", text[boxStart:])
        if not endMatch:
            self._bail(u"SKIPPING: Infobox has no closing braces")
        boxEnd = boxStart + endMatch.end()
        return text[:boxStart], text[boxStart:boxEnd], text[boxEnd:]

    def checkForField(self, box, field_regex):
        """
        Make sure the field matched by field_regex is absent from box.

        Returns box unchanged when the field is not present, and box with
        the "| name =" part removed when the field is present but empty.
        Raises AlreadyFilled when the field has a non-blank value.
        """
        paramMatch = re.search(field_regex, box)
        if not paramMatch:
            return box
        fieldValAndRest = box[paramMatch.end():]
        fieldTermMatch = self.TERMINATOR.search(fieldValAndRest)
        # Without a terminator the value is everything up to the end of box.
        valueEnd = fieldTermMatch.start() if fieldTermMatch else len(fieldValAndRest)
        if fieldValAndRest[:valueEnd].strip():
            wikipedia.output(u"SKIPPING: param already filled")
            raise AlreadyFilled(u"SKIPPING: param already filled")
        return box[:paramMatch.start()] + fieldValAndRest

    def _claimField(self, box, field_regex):
        """Return (box without an empty field, True), or (box, False) if the field is filled."""
        try:
            return self.checkForField(box, field_regex), True
        except AlreadyFilled:
            return box, False

    def findIsbnVal(self, box):
        """
        Locate the |isbn= field in box.

        Returns (raw value, index in box at which the value ends). The value
        ends at the next pipe or closing braces, or at the end of box when
        neither follows.

        Raises BailOut if there is no isbn field, or if the value opens a
        wikilink that is not closed before the terminator (a piped link).
        """
        paramMatch = re.search(u"\\|([ \t\n])*isbn([ \t\n])*=([ \t\n])*", box)
        if not paramMatch: #no ISBN present
            self._bail(u"SKIPPING: No isbn param present")
        isbnValAndRest = box[paramMatch.end():]
        termMatch = self.TERMINATOR.search(isbnValAndRest)
        relIsbnTerm = termMatch.start() if termMatch else len(isbnValAndRest)
        isbnVal = isbnValAndRest[:relIsbnTerm]
        if '[[' in isbnVal and ']]' not in isbnVal:
            self._bail(u"SKIPPING: Piped wikilink in |isbn= field; bot too stupid to handle")
        return isbnVal, paramMatch.end() + relIsbnTerm

    def removeDashes(self, isbn):
        """Return isbn with every character listed in DASHES removed."""
        for dash in self.DASHES:
            isbn = isbn.replace(dash, '')
        return isbn

    def checkForNA(self, isbn):
        """Raise BailOut if isbn starts with "NA" or "N/A" (any letter case)."""
        if re.match(u"N/?A", isbn, re.IGNORECASE):
            self._bail(u"SKIPPING: ISBN Not/Applicable")

    def removeExtraISBN(self, isbnVal):
        """Return isbnVal without a leading "ISBN" label and the whitespace around it."""
        match = re.match(u"([ \t\n])*ISBN([ \t\n])*", isbnVal)
        if match:
            return isbnVal[match.end():]
        return isbnVal

    def firstWord(self, isbnVal):
        """
        Return the first run of characters in isbnVal that contains no
        whitespace, '<', ',', ';' or square brackets.

        Returns an empty string when there is no such run.
        """
        wordMatch = re.search("[^ \t\n<,;\\[\\]]+", isbnVal)
        if not wordMatch:
            return u""
        return wordMatch.group()

    def normalize(self, string):
        """
        Reduce a title to a canonical form for loose comparison.

        Spaces and hyphens are dropped, "and" becomes "&", common punctuation
        and quote marks are dropped, the result is lower-cased and every
        "the" is removed. All replacements are plain substring replacements.
        """
        squeezed = string.replace(u' ', u'').replace(u"-", u'')
        squeezed = squeezed.replace(u"and", u"&")
        for mark in (u',', u'.', u"'", u'"', u"’"):
            squeezed = squeezed.replace(mark, u'')
        return squeezed.lower().replace(u"the", u'')

    def _cleanIsbn(self, box):
        """
        Extract a lookup-ready ISBN from box.

        Returns (isbn, index in box where the isbn field value ends).
        Raises BailOut for a missing, empty, not-applicable or malformed ISBN.
        """
        isbnVal, isbnTerm = self.findIsbnVal(box)
        isbnVal = self.removeDashes(isbnVal).strip()
        isbnVal = self.removeExtraISBN(isbnVal)
        self.checkForNA(isbnVal)
        if not isbnVal: #empty |isbn=
            self._bail(u"SKIPPING: Empty isbn param")
        isbn = self.firstWord(isbnVal)
        if not self.automatic:
            print("ISBN#: %s" % isbn)
        if len(isbn) < self.ISBN_MIN_LEN:
            self._bail(u"SKIPPING: Malformed ISBN, too short (%s)" % isbn)
        if not re.search("[0-9]", isbn):
            self._bail(u"SKIPPING: Malformed ISBN, no numbers (%s)" % isbn)
        return isbn, isbnTerm

    def _lookupClasses(self, isbn):
        """
        Return (loc, dewey) for isbn as reported by isbn2classes.

        Raises BailOut when the ISBN is not found or the lookup fails; a
        failed lookup also waits for the operator to press Enter.
        """
        try:
            return isbn2classes(isbn)
        except NotInLoc:
            self._bail(u"SKIPPED: Given ISBN not in LOC database")
        except RuntimeError as e:
            wikipedia.output(u"ABORTED: Problem looking up data (%s)" % e.message)
            raw_input("Enter to continue")
            raise BailOut(e.message)

        # try:
        #     oclc, oclcTitle = isbn2oclc(isbn)
        # except ParsingProblem:
        #     wikipedia.output(u"SKIPPED: Problem parsing OCLC response")
        #     raw_input("Enter to continue")
        #     raise BailOut, "SKIPPED: Problem parsing OCLC response"

    def _logSkip(self, page, message):
        """Append "<title>; <message>" to the skip log. Best effort: a failure is reported, not raised."""
        try:
            # Build the line as unicode and encode once, so a non-ASCII title
            # combined with a unicode message cannot break the write.
            line = u"%s; %s\n" % (page.title(), message)
            self.log.write(line.encode('utf8'))
        except (IOError, ValueError, UnicodeError):
            wikipedia.output(u"WARNING: Could not record skipped page in skipped.log")

    def treat(self, page, pageIndex):
        """
        Loads the given page, adds the missing classification fields to its
        infobox, and saves it.

        Parameters:
            * page      - The page to work on. Pages outside namespace 0,
                          missing pages and redirects are skipped.
            * pageIndex - Zero-based position of the page in this run; only
                          used for the progress output.

        A field is added only if the infobox does not already have a value
        for it and the lookup returned one; a Dewey value equal to JUST_FIC
        is not added. New fields are inserted right after the isbn value.
        In debug mode nothing is saved. Unless running automatically, the
        diff is shown and, when not in debug mode, confirmation is requested
        before saving. Skips caused by BailOut are written to the skip log.
        """
        print("==================================================================")
        print("PAGE TITLE: %s" % page.title())
        print("PAGE#: %s" % (pageIndex + 1))
        print("EDIT COUNT: %s" % self.editCount)
        if page.namespace() != 0:
            wikipedia.output(u"SKIPPING: Non-article namespace!")
            return

        try:
            # Load the page
            originalText = page.get()
        except wikipedia.NoPage:
            wikipedia.output(u"Page %s does not exist; skipping." % page.aslink())
            return
        except wikipedia.IsRedirectPage:
            wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink())
            return

        try:
            prebox, box, postbox = self.partition(originalText)
            box, doLOC = self._claimField(box, self.LOC_PARAM)
            box, doDewey = self._claimField(box, self.DEWEY_PARAM)
            if not doDewey and not doLOC:
                return #skip since both filled in
            isbn, isbnTerm = self._cleanIsbn(box)
            loc, dewey = self._lookupClasses(isbn)
        except BailOut as e:
            self._logSkip(page, e.message)
            return

        doDewey = doDewey and dewey is not None and dewey != self.JUST_FIC
        # Same guard as for dewey: a missing value cannot be written.
        doLOC = doLOC and loc is not None

        print("LOC Class: %s" % loc)
        print("Dewey Class: %s" % dewey)
        # print "OCLC#:", oclc

        # if not self.automatic:
        #     wikiCanon = self.normalize(page.title().split(u"(")[0])
        #     oclcCanon = self.normalize(oclcTitle.split(u":")[0])
        #     titlesMatch = oclcCanon.startswith(wikiCanon)
        #     if titlesMatch:
        #         print "--Canonical titles DO MATCH--"
        #     else:
        #         print "!!Canonical titles DON'T MATCH!!"
        #         print "PAGE TITLE:", page.title()
        #         print "OCLC TITLE:", oclcTitle

        # In debug mode the new fields are separated by spaces, not newlines.
        separator = " " if self.debug else "\n"
        addition = ""
        if doDewey:
            addition += (u"| %s= " % self.DEWEY_FIELD) + dewey + separator
        if doLOC:
            addition += (u"| %s= " % self.LOC_FIELD) + loc + separator
        box = box[:isbnTerm] + addition + box[isbnTerm:]
        text = prebox + box + postbox

        # only save if something was changed
        if text == originalText:
            return

        if not self.automatic:
            # Highlight the title in purple.
            wikipedia.output(u"\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
            # show what was changed
            wikipedia.showDiff(originalText, text)
        if self.debug:
            return
        if not self.automatic:
            choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No'], ['y', 'N'], 'N')
            if choice == 'n':
                return
        try:
            # Save the page
            page.put(text)
        except wikipedia.LockedPage:
            wikipedia.output(u"Page %s is locked; skipping." % page.aslink())
        except wikipedia.EditConflict:
            wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))
        except wikipedia.SpamfilterError as error:
            wikipedia.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title(), error.url))
        else:
            self.editCount += 1


def main():
    """Build a CobraBot and run it over the generated pages.

    The bot is created in debug, non-automatic mode, so diffs are shown and
    nothing is saved. All warnings are silenced while it runs.
    """
    debugMode = True # False
    automaticMode = False
    cobra = CobraBot(automaticMode, debugMode)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        cobra.run()
        #cobra.runManual()

# Script entry point: run the bot only when executed directly, not on import.
if __name__ == "__main__":
    try:
        main()
    finally:
        # Always call wikipedia.stopme(), even when main() raises.
        wikipedia.stopme()