User:PotatoBot/Code/3
Appearance
< User:PotatoBot | Code
#!/usr/bin/python
# -*- coding: utf-8 -*-
import codecs, wikipedia, catlib, pagegenerators, time, mysave
from datetime import date
# PotatoBot Task 3: Creates redirects from ATC codes to drug articles and ATC lists;
# adds {{anchor}}s to ATC list sections;
# checks ATC codes in {{drugbox}}es and {{chembox}}es
# Infobox templates whose ATC parameters are checked; fromTemplate(idx) indexes into this list
templatenames = ['Drugbox', 'Chembox Identifiers']
# Maps ATC code (ATCvet codes include the leading 'Q') -> title of the target article.
# Filled by fromATClist(); fromTemplate() removes the entries it could match to an infobox.
ATClist = {}
def findLink(line, ltype):
    """Check whether line contains an ATC code and a simple drug name (as opposed to combinations etc.).

    line  -- stripped line from an ATC list page; a code line starts with ':',
             followed by the code, one separator character and the drug name
             (either plain text or a '[[wikilink]]')
    ltype -- 1 to look for a human ATC code (letter at line[1]),
             2 to look for an ATCvet code ('Q' at line[1], letter at line[2])

    Returns (ltype, redirect page, ATCvet redirect page, target page) if a code
    with a usable target name was found, else (0, None, None, None). The third
    element is only meaningful for ltype == 1.
    """
    # words in a trailing parenthesis that mark the entry as not being a simple substance
    paren_excludes = ('<sup>', 'human', 'bovine', 'beef', 'porcine', 'pork')
    # words in an unlinked name that mark groups or combinations
    plain_excludes = ('various', 'other', 'combination', 'compound', ' and ', ' with ', 'including', 'producing')

    # Slices instead of single indexes: lines that are too short (e.g. a bare ':')
    # are rejected instead of raising IndexError.
    has_code = line[:1] == ':' and (ltype == 1 or line[1:2] == 'Q') and line[ltype:ltype+1].isupper()\
        and line[1+ltype:3+ltype].isdigit() and line[3+ltype:5+ltype].isupper() and line[5+ltype:7+ltype].isdigit()\
        and line[2:3] != 'I' and line[1:4] != 'V01'
    # Todo: what about ATCvet code QI, ATC code V01? (excluded by last condition) #
    # Todo: include combinations #

    link = ''
    if has_code:
        start = line.find('[[')
        if start == 8 + ltype:
            # a wikilink directly follows the code; split off a trailing ' (...)' remark
            bracket = line.partition(' (')
            pipe = bracket[0].find('|')
            end = bracket[0].find(']]')
            if pipe == -1:
                pipe = end
            # pipe == -1 here means the ' (' was inside an unpiped link title, so the
            # title got cut in two; skip the line rather than use a truncated title
            if pipe != -1 and not any(word in bracket[2].lower() for word in paren_excludes)\
                    and ' ' not in bracket[0][end:]:
                link = bracket[0][10+ltype:pipe]
        elif start == -1 and not any(word in line.lower() for word in plain_excludes):
            # no link at all: take the rest of the line as the article name
            link = line[8+ltype:]

    if link == '':
        return (0, None, None, None)
    site = wikipedia.getSite()
    return (ltype, wikipedia.Page(site, 'ATC' + (ltype == 2)*'vet' + ' code ' + line[1:7+ltype]),
            wikipedia.Page(site, 'ATCvet code Q' + line[1:8]),
            mysave.resolveredir(wikipedia.Page(site, link)))
def fromATClist(line, vetalso):
    """Treat a line from an ATC codes list.

    line    -- stripped line of wikitext from an ATC list page
    vetalso -- if true, a human ATC code additionally gets an
               'ATCvet code Q...' redirect

    If the line holds an ATC or ATCvet code whose target article exists, the
    code is recorded in the global ATClist and the redirect(s) are requested
    from mysave.makeredir. Returns the log text makeredir produced, or '' if
    the line was not usable.
    """
    # human ATC code
    ltype, redirpage, redirpageVet, page = findLink(line, 1)
    if ltype == 0:
        # ATCvet code
        ltype, redirpage, redirpageWaste, page = findLink(line, 2)
    # nothing to do unless this line contains an ATC code with an identifiable target
    if ltype == 0 or not page.exists():
        return ''

    wikipedia.output('> ' + redirpage.title() + ' ' + page.title())
    ATClist[line[1:7+ltype]] = page.title()
    code = line[ltype:7+ltype]  # code without the ATCvet 'Q'
    # makeredir(redirpage, page, BRFANo, templates): '3' is this task's approval number;
    # the redirect category template has to go into the fourth argument
    result = mysave.makeredir(redirpage, page, '3',
                              '{{R from ATC' + (ltype == 2)*'vet' + ' code|' + code + '}}')
    if ltype == 1 and vetalso:
        result += mysave.makeredir(redirpageVet, page, '3', '{{R from ATCvet code|' + code + '}}')
    wikipedia.output('')
    return result
def fromTemplate(idx):
    """Check code in {{drugbox}} or {{chembox}}. In later versions, this should also add codes to these boxes.

    idx -- index into templatenames (0: Drugbox, 1: Chembox Identifiers); it
           also selects the matching parameter names below

    For every page transcluding the template, the ATC codes given in the
    infobox are compared with the global ATClist. Entries of ATClist that
    belong to the page are removed from it. Returns log lines (wikitext '#'
    items) for infobox codes that were not found in the ATC lists.
    """
    global ATClist, templatenames
    ATC_prefix = ('ATC_prefix', 'ATCCode_prefix')
    ATC_suffix = ('ATC_suffix', 'ATCCode_suffix')
    ATC_supplemental = ('ATC_supplemental', 'ATC_Supplemental')
    wikipedia.output('\n>> Template:' + templatenames[idx])
    result = ''
    for page in pagegenerators.ReferringPageGenerator(wikipedia.Page(wikipedia.getSite(),
            'Template:' + templatenames[idx]), onlyTemplateInclusion = True):
        title = page.title()
        # look for ATC codes in infoboxes
        ATCvet, prefix, suffix, supp = False, '', '', ''
        for template in page.templatesWithParams():
            if template[0] == templatenames[idx]:
                for param in template[1]:
                    value = param.partition('=')
                    name = value[0].strip()
                    if name == 'ATCvet':
                        ATCvet = value[2].strip() == 'yes'
                    elif name == ATC_prefix[idx]:
                        prefix = value[2].strip()
                    elif name == ATC_suffix[idx]:
                        suffix = value[2].strip()
                    elif name == ATC_supplemental[idx]:
                        supp = value[2].strip()
                # only the first matching infobox on the page is evaluated
                break
        codes = (prefix != '') * [(ATCvet*'Q' + prefix + suffix)]
        for tupleSupp in page.templatesWithParams(supp):
            # {{ATC|prefix|suffix}} / {{ATCvet|prefix|suffix}}; ignore malformed ones
            if tupleSupp[0] in ['ATC', 'ATCvet'] and len(tupleSupp[1]) >= 2:
                codes.append((tupleSupp[0] == 'ATCvet')*'Q' + tupleSupp[1][0] + tupleSupp[1][1])
        # compare with ATClist
        notfound = []
        for code in codes:
            if ATClist.get(code) == title:
                del ATClist[code]
            else:
                notfound.append(code)
        # iterate over a copy of the keys because entries are deleted on the way
        for code in list(ATClist.keys()):
            if ATClist[code] == title:
                codes.append(code)
                del ATClist[code]
        # human codes first, ATCvet ('Q...') codes last
        codes.sort(key = lambda s: (s[:1] == 'Q') * 'Z' + s)
        if len(codes) > 0:
            ATCvet = codes[0][:1] == 'Q'
            prefix = codes[0][ATCvet:ATCvet+3]
            suffix = codes[0][ATCvet+3:]
            supplist = []
            for code in codes[1:]:
                vet = code[:1] == 'Q'
                supplist.append('{{ATC' + vet*'vet' + '|' + code[vet:vet+3] + '|' + code[vet+3:] + '}}')
            supp = ((idx == 1) * ',' + ' ').join(supplist)
            wikipedia.output(' \03{green} -> ATCvet = %s, prefix = %s, suffix = %s,\nsupplemental = %s\03{default}'
                             % (ATCvet, prefix, suffix, supp))
            # Todo: write ATC codes to infobox # needs BRFA
        if len(notfound) > 0:
            wikipedia.output(' \03{yellow}ATC code(s) %s in %s not found in ATC lists\03{default}' % (notfound, title))
            result += '# %s: ATC code%s %s not found in ATC lists\n' % (page.aslink(), (len(notfound) > 1)*'s', ', '.join(notfound))
    return result
def main():
    """Run Task 3: create ATC code redirects, add section anchors, check infoboxes, save the log.

    Walks the pages in Category:ATC codes, feeds every line to fromATClist(),
    creates redirects and {{anchor}}s for code section headings, then compares
    the collected codes with the infoboxes via fromTemplate() and saves the
    combined log to User:PotatoBot/Lists/ATC codes log.
    """
    global ATClist
    BRFANo = '3'  # approval number expected by mysave.makeredir / mysave.savepage
    excludeATCvet = ['A07CA', 'J01EA', 'J01EB', 'J01EC', 'J01ED', 'J01EE'] # only fourth-level codes supported
    site = wikipedia.getSite()
    # Prepare log
    listout = 'Log for the creation of [[ATC code]] redirects<!--, {{tl|drugbox}} and {{tl|chembox}} updates--> ([[Wikipedia:Bots/Requests for approval/PotatoBot 3|Task 3]]). Date: %s.\n'\
        % mysave.fmtdate(date.today())
    # Treat links from ATC code pages
    for page in pagegenerators.CategorizedPageGenerator(catlib.Category(site, 'Category:ATC codes'), False):
        title = page.title()
        if (title[0:8] == 'ATC code' or title[0:11] == 'ATCvet code') and not title[-1:].isalpha():
            wikipedia.output('\n>> ' + title)
            text = page.get()
            editTime = page.editTime()
            lines = text.splitlines(True)
            vetalso = 'vet=no' not in text.replace(' ', '')
            for n, line in enumerate(lines):
                # use the same stripped line for parsing and for the exclusion check
                stripped = line.strip()
                listout += fromATClist(stripped, vetalso and stripped[1:6] not in excludeATCvet)
                # Anchors in ATC lists, including redirects
                if line[0:2] == '==':
                    level4 = line[0:3] == '==='
                    heading = line[2+level4:].strip()
                    vet = heading[0:1] == 'Q'
                    code = heading[:4+level4+vet]
                    # NOTE(review): a heading that already starts with {{anchor}} fails this
                    # test, so its redirects are not re-checked on later runs - confirm intended
                    if code[1+vet:3+vet].isdigit():
                        wikipedia.output('> ATC' + vet*'vet' + ' code ' + code)
                        section = wikipedia.Page(site, title + '#' + code)
                        listout += mysave.makeredir(wikipedia.Page(site, 'ATC' + vet*'vet' + ' code ' + code),
                            section, BRFANo,
                            '{{R from ATC' + vet*'vet' + ' code|' + code[vet:] + '}}{{R to section|Atc' + vet*'vet'
                            + ' code ' + code + '}}')
                        if not vet and vetalso and code[:5] not in excludeATCvet:
                            listout += mysave.makeredir(wikipedia.Page(site, 'ATCvet code Q' + code),
                                section, BRFANo,
                                '{{R from ATCvet code|' + code + '}}{{R to section|Atcvet code Q' + code + '}}')
                        if '{{anchor' not in line:
                            lines[n] = line[:2+level4] + '{{anchor|' + code + '}}' + line[2+level4:]
                        wikipedia.output('')
            newtext = ''.join(lines)
            if newtext != text:
                if editTime == page.editTime():
                    listout += mysave.savepage(page, newtext, BRFANo, 'ATC code anchors for sections', minor = True)
                else:
                    listout += '# %s: edit conflict occurred\n' % page.aslink()
    # Check ATC codes in {{drugbox}} and {{chembox}} transclusions;
    # whatever is left in ATClist afterwards was not found in any infobox
    listout += fromTemplate(0) + fromTemplate(1) +\
        ''.join(['# [[%s]]: ATC code %s not found in article\n' % (p, a) for (a, p) in ATClist.items()])
    # Todo: direct obsolete redirects at ATC list subsection # needs BRFA
    # Output log
    wikipedia.output('')
    mysave.savepage(wikipedia.Page(site, 'User:PotatoBot/Lists/ATC codes log'), listout, BRFANo,
                    'Creating [[ATC code]]s log')
# Script entry point
if __name__ == "__main__":
    try:
        main()
    finally:
        # runs whether main() finished normally or raised
        wikipedia.stopme()
# mysave.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import pywikibot as w
import re
# Code for saving redirects and other pages
def savepage(page, text, BRFANo, summary = '', minor = False):
    """Save text to a page and log exceptions.

    page    -- page object to write to
    text    -- new wikitext
    BRFANo  -- string appended to 'Wikipedia:Bots/Requests for approval/PotatoBot '
               to build the approval link in the edit summary
    summary -- edit summary; if empty, the current edit summary is left unchanged
    minor   -- passed on to page.put() as minorEdit

    Returns '' on success, otherwise one wikitext log line ('# ...\n')
    describing why the page was not saved.
    """
    if summary != '':
        w.setAction(summary + '. See [[Wikipedia:Bots/Requests for approval/PotatoBot ' + BRFANo + '|approval]]. Report errors and suggestions at [[User talk:PotatoBot]].')
    try:
        if '#' in page.title():
            # titles with a section part cannot be written
            w.output(' \03{red}cannot save %s because it is a section\03{default}' % page.title())
            return '# %s: this is a section title\n' % page.title(aslink=True)
        page.put(text, minorEdit = minor)
        w.output(' \03{green}saving %s -> \03{gray}%s\03{default}' % (page.title(), text))
        return ''
    except w.LockedPage:
        w.output(' \03{red}cannot save %s because it is locked\03{default}' % page.title())
        return '# %s: page was locked\n' % page.title(aslink=True)
    except w.EditConflict:
        w.output(' \03{red}cannot save %s because of edit conflict\03{default}' % page.title())
        return '# %s: edit conflict occurred\n' % page.title(aslink=True)
    except w.SpamfilterError as error:
        w.output(' \03{red}cannot save %s because of spam blacklist entry %s\03{default}' % (page.title(), error.url))
        return '# %s: spam blacklist entry\n' % page.title(aslink=True)
    except Exception:
        # best effort: log anything else, but let KeyboardInterrupt/SystemExit through
        w.output(' \03{red}unknown error on saving %s\03{default}' % page.title())
        return '# %s: unknown error occurred\n' % page.title(aslink=True)
def resolveredir(page):
    """Return target if input is a redirect, else return input.

    If the redirect target cannot be determined, a page titled
    '<title> (broken redirect)' is returned; if even the redirect check
    fails, a page titled '<title> (bad link)' is returned.
    """
    try:
        if not page.isRedirectPage():
            return page
        try:
            target = page.getRedirectTarget()
            w.output(' \03{gray}resolving redir %s to %s\03{default}'
                     % (page.title(), target.title()))
            return target
        except Exception:
            w.output(' \03{yellow}target %s is a broken redir\03{default}' % page.title())
            return w.Page(w.getSite(), page.title() + ' (broken redirect)')
    except Exception:
        w.output(' \03{yellow}target %s is a bad link\03{default}' % page.title())
        return w.Page(w.getSite(), page.title() + ' (bad link)') # workaround for wikipedia.py breaking wikiasite: links
def makeredir(redirpage, page, BRFANo, templates = ''):
    """Create a redirect and log existing page that isn't a redirect to the desired article.

    redirpage -- page that should become the redirect
    page      -- redirect target (resolved first if it is a redirect itself)
    BRFANo    -- approval number, passed on to savepage()
    templates -- wikitext appended after the '#REDIRECT [[...]]' link

    Returns '' if nothing needs reporting, otherwise a wikitext log line.
    """
    page = resolveredir(page)
    if not redirpage.exists():
        # Create redirect, or write page name to list if an error occurs
        return savepage(redirpage, '#REDIRECT %s %s' % (page.title(aslink=True), templates), BRFANo,
                        'Redirect to ' + page.title(aslink=True))

    comment = ''
    dab = None
    if redirpage.isDisambig():
        comment = ' (disambiguation)'
        dab = redirpage
    if redirpage.isRedirectPage():
        try:
            target = redirpage.getRedirectTarget()
            if target.title() == page.title() or target.sectionFreeTitle() == page.title():
                # Already a redir to the desired article
                return ''
            elif target.isDisambig():
                comment = ' (redirect to disambiguation)'
                dab = target
            else:
                comment = ' (redirect)'
        except Exception:
            comment = ' (broken redir)'
    # a disambiguation page that already lists the article is fine as it is
    if 'disambiguation' in comment and any(resolveredir(p) == page for p in dab.linkedPages()):
        w.output(' link to %s already on dab page %s' % (page.title(), redirpage.title()))
        return ''
    elif redirpage.title() != page.title():
        w.output(' \03{yellow}redir to %s failed, page %s already exists\03{default}' % (page.title(), redirpage.title()))
        return '# %s: redirecting to %s failed, page already exists%s\n' % (redirpage.title(aslink=True), page.title(aslink=True), comment)
    else:
        return ''
def findATCs(page, includeVet = True):
    """Look for ATC codes in infoboxes.

    page       -- page object providing templatesWithParams()
    includeVet -- if false, an 'ATCvet = yes' parameter is ignored and the
                  main code is returned without the leading 'Q'

    Returns (codes, ATCvetpos, prefixpos, suffixpos, supppos): the list of
    codes found (main code first, then those from {{ATC}}/{{ATCvet}} templates
    in the supplemental parameter) and the positions of the four parameters in
    the infobox's parameter list, -1 for a parameter that was not found.
    A prefix of 'none' (any case) counts as not given.
    """
    ATCvet, prefix, suffix, supp = False, '', '', ''
    ATCvetpos, prefixpos, suffixpos, supppos = -1, -1, -1, -1
    templatenames = ('Drugbox', 'Chembox Identifiers')
    # parameter names differ between the two infoboxes; indexed like templatenames
    prefixnames = ('ATC_prefix', 'ATCCode_prefix')
    suffixnames = ('ATC_suffix', 'ATCCode_suffix')
    suppnames = ('ATC_supplemental', 'ATC_Supplemental')
    for template in page.templatesWithParams():
        if template[0] not in templatenames:
            continue
        idx = templatenames.index(template[0])
        for pos, param in enumerate(template[1]):
            value = param.partition('=')
            name = value[0].strip()
            content = value[2].strip()
            if name == 'ATCvet':
                ATCvet = content == 'yes' and includeVet
                ATCvetpos = pos
            elif name == prefixnames[idx] and content.lower() != 'none':
                prefix = content
                prefixpos = pos
            elif name == suffixnames[idx]:
                suffix = content
                suffixpos = pos
            elif name == suppnames[idx]:
                supp = content
                supppos = pos
    codes = (prefix != '') * [(ATCvet*'Q' + prefix + suffix)]
    for tupleSupp in page.templatesWithParams(supp):
        # {{ATC|prefix|suffix}} / {{ATCvet|prefix|suffix}}; ignore malformed ones
        if tupleSupp[0] in ['ATC', 'ATCvet'] and len(tupleSupp[1]) >= 2:
            codes.append((tupleSupp[0] == 'ATCvet')*'Q' + tupleSupp[1][0] + tupleSupp[1][1])
    return (codes, ATCvetpos, prefixpos, suffixpos, supppos)
def addTemplateParam(page, newtemplates, BRFANo, summary = 'Updating template', minor = False):
    """Write changed or added named template parameters back to a page.

    page         -- page object providing get(), templatesWithParams() and title()
    newtemplates -- list shaped like page.templatesWithParams() (tuples of
                    template name and list of 'name=value' strings), holding
                    the desired state; templates must be in the same order
    BRFANo, summary, minor -- passed on to savepage()

    For each template that differs from the page, values of existing named
    parameters are replaced and new named parameters are inserted after the
    last parameter that could be located. Parameters missing from
    newtemplates are left on the page. Unnamed parameters can neither be
    changed nor added (Todo).

    Returns '' if nothing had to be changed, the result of savepage() if the
    page was written, or a wikitext log line if the update was abandoned. The
    page is never saved after a failure.
    """
    def giveup(console, reason):
        # report on the console and hand back a log line; nothing is saved
        w.output('\03{yellow}%s\03{default}' % console)
        return '# %s: %s\n' % (page.title(aslink=True), reason)

    text = page.get()
    oldtemplates = page.templatesWithParams()
    if len(newtemplates) != len(oldtemplates):
        return giveup('template list does not match page %s: %d vs. %d templates'
                      % (page.title(), len(newtemplates), len(oldtemplates)),
                      'template list did not match templates on page')
    pointer = 0  # position in text up to which templates/parameters have been processed
    for old, new in zip(oldtemplates, newtemplates):
        name = old[0]
        # '{{Name|': first letter in either case, space and underscore interchangeable
        opening = re.compile(r'\{\{\s*(?:%s|%s)%s\s*\|' % (
            re.escape(name[:1].upper()), re.escape(name[:1].lower()),
            '[ _]'.join([re.escape(part) for part in name[1:].split(' ')]))).search(text, pointer)
        if opening:
            # continue at the pipe that follows the template name
            pointer = opening.end() - 1
        if new == old:
            continue
        if new[0].strip() != name.strip():
            return giveup('template list does not match page %s: %s vs. %s'
                          % (page.title(), new[0].strip(), name.strip()),
                          'template list did not match templates on page')
        if opening is None:
            return giveup('template %s not found in text of %s' % (name, page.title()),
                          'template %s could not be located in the page text' % name)
        oldparams = old[1]
        k = 0  # index of the next old parameter that has not been paired with a new one
        for newraw in new[1]:
            newname, newsep, newvalue = newraw.partition('=')
            oldname, oldsep, oldvalue = '', '', ''
            paired = False
            if k < len(oldparams):
                oldname, oldsep, oldvalue = oldparams[k].partition('=')
                paired = newsep == oldsep and newname.strip() == oldname.strip()
            if paired:
                k += 1
                if not newsep:
                    # identical unnamed parameter; its position is not tracked
                    continue
                changed = newvalue.strip() != oldvalue.strip()
                located = re.compile(r'\|\s*%s\s*=\s*([^|}]*?)\s*(\}|\|)'
                                     % re.escape(oldname.strip())).search(text, pointer)
                # Only trust the match if it shows the value the parser reported; values
                # containing '|' or '}' (links, nested templates) cannot be delimited here.
                if located is None or located.group(1) != oldvalue.strip():
                    if changed:
                        return giveup('parameter %s not found in text of %s' % (oldname.strip(), page.title()),
                                      'parameter %s could not be located in the page text' % oldname.strip())
                    continue
                start, stop = located.span(1)
                if changed:
                    replacement = newvalue.strip()
                    text = text[:start] + replacement + text[stop:]
                    stop = start + len(replacement)
                pointer = stop
            elif newsep:
                # new named parameter: insert it at the current position inside the template
                insertion = '|' + newraw
                text = text[:pointer] + insertion + text[pointer:]
                pointer += len(insertion)
            else:
                # Todo: unnamed params #
                return giveup('unnamed parameter %s of template %s on %s cannot be written'
                              % (newraw, name, page.title()),
                              'unnamed template parameters are not supported')
    if text != page.get():
        return savepage(page, text, BRFANo, summary, minor)
    else:
        return ''
def fmtdate(date):
    """Format date in English Wikipedia style (day, month name, year), e.g. '5 March 2010'.

    date -- object with integer day, month (1-12) and year attributes
    """
    months = ('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August',
              'September', 'October', 'November', 'December')
    return '%d %s %d' % (date.day, months[date.month - 1], date.year)