User:BogBot/Source code/Task 01
Appearance
#!/usr/bin/python
# Bot Script to replace the partially overlapping
# {{tl|enzyme links}}, {{tl|enzyme references}}, {{tl|GO code links}}, and {{tl|CAS registry}} templates
# with the single {{tl|enzyme}} template
# in Wikipedia enzyme articles.
import re
import wikipedia
# Included for bot exclusion compliance (see http://en.wikipedia.org/wiki/Template:Bots)
user = "BogBot"

# Matches {{nobots}}, or a {{bots|...}} template whose single parameter is
# allow=none, optout=all, deny=all, or a deny= list that mentions this bot.
# re.escape() keeps the account name literal, so a name containing regex
# metacharacters (".", "(", "+", ...) cannot alter the pattern.
regexp_ab = re.compile(r'\{\{(nobots|bots\|(allow=none|deny=.*?' + re.escape(user) + r'.*?|optout=all|deny=all))\}\}')


def Allowbots(text):
    """Tell whether this bot may edit a page with the given wikitext.

    text -- the page's wikitext.

    Returns False when regexp_ab finds a bot-exclusion template in the text,
    True otherwise.
    """
    return not regexp_ab.search(text)
# compiled regular expressions

# {{Enzyme references|EC_number=1.1.1.1|IUBMB_EC_number=1/1/1/1}}
regexp_er = re.compile(
    r"\{\{\s*Enzyme references\s*\|\s*EC_number\s*=\s*(?P<ECN>\d+\.\d+\.\d+\.\d+)\s*\|"  # EC_number
    r"\s*IUBMB_EC_number\s*=\s*(?P<IECN>\d+\/\d+\/\d+\/\d+)\s*\}\}")                     # IUBMB_EC_number

# {{CAS registry|9046-27-9}}
regexp_cas = re.compile(r"\{\{\s*CAS registry\s*\|\s*(?P<CAS>\d+-\d+-\d+)\s*\}\}")

# {{GO code links | GO_code=0003840 | name=gamma-glutamyltransferase}}
# The name is matched non-greedily so that it stops at the first "}}" and does
# not absorb trailing whitespace or a second template on the same line.
regexp_go = re.compile(
    r"\{\{\s*GO code links\s*\|\s*GO_code\s*=\s*(?P<GOC>\d+)\s*\|"  # GO_code
    r"\s*name\s*=\s*(?P<GON>.+?)\s*\}\}")                           # GO_name

# {{Enzyme links|EC_number=1.1.1.1|IUBMB_EC_number=1/1/1/1}}
regexp_el = re.compile(
    r"\{\{\s*Enzyme links\s*\|\s*EC_number\s*=\s*(?P<ECN>\d+\.\d+\.\d+\.\d+)\s*\|"  # EC_number
    r"\s*IUBMB_EC_number\s*=\s*(?P<IECN>\d+\/\d+\/\d+\/\d+)\s*\}\}")                # IUBMB_EC_number

# An "External links" heading followed, anywhere later in the text, by a
# bracketed http(s) link.  The gap is ".*?" under re.DOTALL so that it can span
# line breaks; the previous "\.*" only matched literal dots and therefore never
# saw a link placed on the line after the heading.
regexp_external_links = re.compile(r"==\s*External links\s*==.*?\[https?:\/\/", re.DOTALL)

# The section headings themselves, together with the newlines around them.
regexp_elh = re.compile(r"\n*==\s*External links\s*==\n*")
regexp_goh = re.compile(r"\n*===\s*Gene Ontology \(GO\) codes\s*===\n*")

# ">Trypsin</a> (transclusion)
regexp_article = re.compile(r"\">(?P<title>.*)<\/a> \(transclusion\)")
# main loop
articles = []
f = open('.../wikipedia_enzyme_links.html', 'r')
for line in f:
result_title = regexp_article.search(line)
title = result_title.group('title')
articles.append(title)
# articles = ['User:BogBot/Test']
for article in articles:
log_string = "* [[" + article + "]]"
print log_string,
site = wikipedia.getSite()
page = wikipedia.Page(site, article)
text = page.get(get_redirect = True)
if not Allowbots(text):
break
# print "Allowbots: " + article
EC_number = ""
GO_name = ""
GO_code = ""
EC_number = ""
CAS_number = ""
IUBMB_EC_number = ""
result_el = regexp_er.search(text)
if result_el:
EC_number = result_el.group('ECN')
IUBMB_EC_number = result_el.group('IECN')
# print ' EC_number: ' + EC_number
# print 'IUBMB_EC_number: ' + IUBMB_EC_number
result_cas = regexp_cas.search(text)
if result_cas:
CAS_number = result_cas.group('CAS')
# print ' CAS_number: ' + CAS_number
result_go = regexp_go.search(text)
if result_go:
GO_code = result_go.group('GOC')
GO_name = result_go.group('GON')
# print ' GO_code: ' + GO_code
# print ' GO_name: ' + GO_name
if not GO_name:
GO_name = article
log_string = ", GO_name: " + GO_name + ", EC_number: " + EC_number + ", CAS_number: " + CAS_number + ", GO_code: " + GO_code
print log_string,
enzyme_template = "\n".join ( [ "{{enzyme",
"| Name = " + GO_name,
"| EC_number = " + EC_number,
"| CAS_number = " + CAS_number,
"| IUBMB_EC_number = " + IUBMB_EC_number,
"| GO_code = " + GO_code,
"| image = ",
"| width = ",
"| caption = ",
"}}" ] )
# remove external link headers
text = re.sub(regexp_goh, "", text)
if not regexp_external_links.search(text):
text = re.sub(regexp_elh, "\n", text)
lines = text.splitlines()
new_text = enzyme_template + "\n"
if EC_number:
for line in lines:
if not ( regexp_er.search(line) or regexp_cas.search(line) or regexp_go.search(line) or regexp_el.search(line) ):
new_text += line + "\n"
# print new_text
page.put(new_text, comment='merging {{enzyme links}}, {{enzyme references}}, {{GO code links}}, and {{CAS registry}} templates into {{enyzme}} template', watchArticle = None, minorEdit = True)
print ", page updated"
wikipedia.stopme()