# Source: Wikipedia user page "User:Gdr/authority.py"
# (wiki page navigation chrome removed from the top of this file).
#!/usr/bin/python
#
#
# AUTHORITY.PY -- ADD AUTHORITY TO TAXOBOX
# Gdr, 2005-07-05
#
#
# 1. INTRODUCTION
#
# This Python script add an authority to a selected taxobox on the
# English wikipedia.
#
#
# 1.1 USAGE
#
# ./authority.py --rebuild Rebuild abbreviation table
# ./authority.py --query=ABBREV Query abbreviation
# ./authority.py TAXON Find authority and add it to taxon
# ./authority.py TAXON AUTHORITY Add authority to taxon
#
#
# 1.2 OPTIONS
#
# -r --rebuild Rebuild abbreviation table
# -q X --query=X Query abbreviation
# -a A --article=A Start at article A instead of TAXON
# -n --noexpand Don't expand abbreviations
# -d --disambig Solve disambiguations for abbrevs
#
#
# 1.2 EXAMPLES
#
# ./authority.py Magnolia
# ./authority.py 'Boa constrictor'
# ./authority.py Quercus L.
# ./authority.py 'Passer domesticus' '(Linnaeus, 1758)'
# ./authority.py 'Plasmodium vivax' 'Grassi & Feletti 1890'
# ./authority.py -a 'Homo (genus)' Homo
#
#
# 1.3 LICENCE
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.
import getopt
import htmlentitydefs
import os
import pickle
import re
import sys
import time
import unicodedata
import urllib
import wikipedia
class Error(Exception):
    """Fatal script error.

    The message is echoed through the wiki console as soon as the
    exception is constructed; `repr()` returns the raw message.
    """

    def __init__(self, s):
        # Print immediately so the user sees the reason even when the
        # exception is swallowed higher up (main() catches Error).
        wikipedia.output(unicode(s))
        self.s = s

    def __repr__(self):
        return self.s
class Authority:
# 2. CONFIGURATION
# 2.1 USER CONFIGURATION
# Which Wikipedia we are editing.
site = wikipedia.Site('en')
# 'authfile' is the filename in which the tables of author names and
# abbreviations will be saved.
authfile = 'authority.dat'
# A regular expression that matches an authority and abbreviation in
# a Wikipedia article. (This is the default; you can override it for
# particular sources; see below.)
auth_re = re.compile(ur'^\*[ \']*([\w\'., -]+[\w.])[ \']*'
ur' +(?:[-\u2013]|&[nm]dash;) +'
ur'\[\[([^\]|]+).*\r?\n', re.M|re.U)
# 'wiki_abbrev_sources' is a dictionary mapping a code letter to a
# Wikipedia sources for authority abbreviations. Each source is a
# dictionary with these keys:
#
# name ---- name of the Wikipedia article containing authorities and
# their abbreviations
# re ------ a regular expression matching an authority and its
# abbreviation(s). There must be two groups, one for the
# abbreviation(s) for that authority and one for the name
# of the article about that authority. If omitted, auth_re
# is used as the default. Abbreviations are presumed to be
# separated by commas.
# groups -- a tuple giving the group for the abbreviation(s) and the
# article; if omitted, (1,2) is the default.
# fmt ----- format string for a new authority. Use %A for the
# abbreviation and %B for the authority.
# sort ---- How to sort (by 'surname' or by 'abbrev').
wiki_abbrev_sources = {
'b': {'name': 'List of botanists by author abbreviation',
'fmt': "* '''%A''' - [[%B]]\n",
'sort': 'abbrev'},
'z': {'name': 'List of zoologists by author abbreviation',
'fmt': "* %A - [[%B]]\n",
'sort': 'surname'},
}
# 'other_abbbrev_sources' is a list of other (non-Wikipedia) sources
# for abbreviations. Each entry is a dictionary with keys:
#
# taxon --- a regular expression matching a taxon; means that this
# entry is only appropriate for articles contained in taxa
# matching this regexp. For example 'Plant' for a source
# listing only botanists, or 'Arthropod' for a source
# listsing only entomologists.
# re ------ a regular expression matching the abbreviation and its
# expansion. %A will be replaced by the regexp-escaped
# form of the abbreviation we are looking for. It should
# contain one group, matching the expansion.
# url ----- the URL to visit to find the abbreviation. %A will be
# replaced by the URL-encoded form of the abbreviation we
# are looking for.
other_abbrev_sources = [
{'taxon': 'Plant',
'url': 'http://www.ipni.org/ipni/authorsearch?find_abbreviation=%A&query_type=by_query',
're': r'(?u)>%A</a> - (\w(?:&[a-z]+;|[\w.\' -]+)*(?!\d)\w) *[0-9\n]'},
{'url': 'http://www.ipni.org/ipni/authorsearch?find_surname=%A&query_type=by_query',
're': r'(?u)>%A</a> - (\w(?:&[a-z]+;|[\w.\' -]+)*(?!\d)\w) *[0-9\n]'},
]
# 'auth_sources' is a list of sources to consult to find the
# authority for a taxon. Each entry is a dictionary with these keys:
#
# taxon --- a regular expression matching a taxon; means that this
# entry is only appropriate for articles contained in taxa
# matching this regexp. For example 'Plant' for a source
# listing only plant names, or 'Coleoptera' for a source
# listsing only beetles.
# url ----- the URL to visit to find the taxon. %T will be replaced
# by the URL-encoded form of the taxon we are looking
# for, and %S by the SN2000 "subject" area.
# re ------ a regexp for getting the authority. %A will be replaced
# by the regexp-escaped form of the abbreviation we are
# looking for. It should contain one group, matching the
# expansion.
auth_sources = [
{'taxon': 'Plant',
'url': ('http://www.ipni.org/ipni/plantsearch?'
'find_wholeName=%T&query_type=by_query'),
're': r'<i>%T</i> (.*)</a>'},
{'url': ('http://sn2000.taxonomy.nl/Taxonomicon/TaxonList.aspx?'
'searchBy=ScientificName&subject=%S&search=%T'),
're': r'<i>%T</i>[^<]*<font size="-1"> *(\(?[^<,]+,? +[0-9]+\)?)'},
# {'url': ('http://www.itis.usda.gov/servlet/SingleRpt/SingleRpt?' 'search_topic=Scientific_Name&search_value=%T'), 're': (r'(?i)<SPAN CLASS="taxon_head"><I>%T</I></SPAN>' r'[ \r\n]*<A.*[ \r\n]*<SPAN CLASS="taxon_head">[ \r\n]*' r' <B>([^<]+)</B></A>'),}
]
# 2.2 OTHER CONFIGURATION
# 'rank_to_subject' is a dictionary mapping Linnaean rank in Latin
# (as used in Wikipedia taxobox template names) to the SN2000
# "Subject area" in which a taxon can be looked up. Ranks not listed
# here are looked up in the subject area "High".
rank_to_subject = {
'subspecies': 'Species',
'species': 'Species',
'subgenus': 'Genus',
'genus': 'Genus',
'tribus': 'Family',
'subfamilia': 'Family',
'familia': 'Family',
'superfamilia': 'Family',
}
# Don't ask easy questions of the user?
noquery = False
def __init__(self):
for s in self.wiki_abbrev_sources.values():
s['page'] = wikipedia.Page(self.site, s['name'])
self.restore_abbreviations()
# 3. ABBREVIATIONS
#
# We want to be able to find abbreviations and turn them into links
# to the appropriate article. For example, given the abbreviation
# 'L.' we need to generate the wikitext '[[Carolus Linnaeus|L.]]'.
# This section includes the code for finding, storing, and updating
# these abbreviations.
# 3.1 LOADING AND SAVING ABBREVIATIONS
# Load abbreviations from disk.
def restore_abbreviations(self):
self.abbrev = {}
if os.path.isfile(self.authfile):
f = open(self.authfile, 'r')
if f:
self.abbrev = pickle.load(f)
f.close()
# Save authorities to disk.
def save_abbreviations(self):
f = file('authority.dat', 'w')
pickle.dump(self.abbrev, f)
f.close()
def unhtmlify(self, s):
s = s.decode('iso-8859-1')
while True:
m = re.search(r'&([a-z]+);', s)
if not m:
break
s = s[:m.start(0)] \
+ unichr(htmlentitydefs.name2codepoint[m.group(1)]) \
+ s[m.end(0):]
return s
# Normalize the unicode string 's' into ASCII. The idea is to store
# the authority Lac'ep`ede under the key 'Lacepede' so that
# inconsistent accentuation doesn't cause us to miss an
# abbreviation. We decompose all composed characters and then ignore
# everything non-ASCII. (This converts eacute->e etc.)
def normalize(self, s):
return unicodedata.normalize('NFD', unicode(s)).encode('ascii', 'ignore')
# Add an abbreviation to the table. 'abbrev' is the abbreviation;
# 'article' is the title of the Wikipedia article on that authority;
# 'code' is the code for the list from which it came, if any.
def add_abbreviation(self, abbrev, article, code = None):
key = self.normalize(abbrev)
if key not in self.abbrev:
self.abbrev[key] = []
for a in self.abbrev[key]:
# Do we already have this authority under this abbreviation?
if abbrev == a[0] and article == a[1]:
return
self.abbrev[key].append((abbrev, article, code))
# 3.2 USER INTERFACE FOR ADDING A NEW ABBREVIATION
# If we don't find an abbreviation in any of wiki_abbrev_sources, we can
# prompt the user to tell us the article title corresponding to the
# abbreviation; then we can add it to the relevant source.
# Return the normalized surname of the abbreviation.
def surname(self, abbrev):
m = re.search(r'(?ui)(?:de |von |d\')?[\w-]+\.?$',
self.normalize(abbrev))
if m:
return m.group(0)
else:
wikipedia.output(u"No surname for %s" % abbrev)
return 'a'
# 'abbrev' is the abbreviation for the authority described at
# 'article'. Add this to the source given by 'code'.
def add_abbreviation_to_source(self, abbrev, article, code):
source = self.wiki_abbrev_sources[code]
text = source['page'].get()
if source['sort'] == 'surname':
sortkey = self.surname(abbrev)
else:
sortkey = abbrev
groups = source.get('groups', (1,2))
# Format authority for insertion into the source.
fmt = source['fmt']
fmt = re.sub('%A', abbrev, fmt)
if article[-1] == '(':
fmt = re.sub('%B', article + '|', fmt)
else:
fmt = re.sub('%B', article, fmt)
# Go through abbreviations in the source until we get to the
# appropriate point in alphabetical order by surname.
for m in re.finditer(source.get('re', self.auth_re), text):
newtext = None
if source['sort'] == 'surname':
s2 = self.surname(m.group(groups[0]))
else:
s2 = m.group(groups[0])
if sortkey[0] != s2[0]:
# Sort keys not in the same letter of the alphabet.
continue
elif sortkey < s2:
# New abbrev goes before this one.
newtext = text[:m.start(0)] + fmt + text[m.start(0):]
elif re.match(r'(?: *\r?\n)*==', text[m.end(0):]):
# We've reached the end of the section for the right
# letter, but not found anywhere to put the new
# abbrev. So it goes at the end.
newtext = text[:m.end(0)] + fmt + text[m.end(0):]
else:
continue
# Found a place for it.
wikipedia.showDiff(source['page'].get(), newtext)
if wikipedia.input(u'OK? [yN]') == 'y':
source['page'].put(newtext, 'nomialbot - adding %s = %s'
% (abbrev, article))
return
wikipedia.output(u'Sorry, nowhere to put authority %s' % fmt)
# 'abbrev' is the abbreviation for the authority described at
# 'article'. Ask the user which source to add it to.
def user_add_abbreviation(self, abbrev, article):
for code, source in self.wiki_abbrev_sources.items():
wikipedia.output(u'(%s) %s' % (code, source['name']))
if self.noquery:
inp = None
else:
inp = wikipedia.input(u"Add abbreviation %s = %s to which source? [%s]"
% (abbrev, article,
''.join(self.wiki_abbrev_sources.keys())))
if inp in self.wiki_abbrev_sources:
self.add_abbreviation(abbrev, article, inp)
self.save_abbreviations()
self.add_abbreviation_to_source(abbrev, article, inp)
else:
self.add_abbreviation(abbrev, article)
self.save_abbreviations()
# 3.3 FINDING EXPANSIONS FOR ABBREVIATIONS
# Rebuild table of authorities from the Wikipedia articles listed in
# 'wiki_abbrev_sources'.
def rebuild_abbreviations(self):
wikipedia.getall(self.site,
map(lambda l: l['page'], self.wiki_abbrev_sources.values()))
for code, s in self.wiki_abbrev_sources.items():
for m in re.finditer(s.get('re', self.auth_re), s['page'].get()):
groups = s.get('groups', (1,2))
abbrevs = m.group(groups[0])
pagename = m.group(groups[1])
for a in re.split(r', +', abbrevs):
self.add_abbreviation(a, pagename, code)
self.save_abbreviations()
# User interface for finding an abbreviation using the stored
# abbrevs, returning the pair (abbrev, expansion) or None.
def find_abbreviation_in_store(self, abbrev):
key = self.normalize(abbrev)
if key in self.abbrev:
if len(self.abbrev[key]) == 1:
return self.abbrev[key][0]
for i in range(len(self.abbrev[key])):
wikipedia.output(u'(%d) %s' % (i + 1, self.abbrev[key][i][1]))
while True:
i = wikipedia.input(u"Which authority? [1-%d]"
% len(self.abbrev[key]))
if re.match(r'[0-9]+$', i) \
and int(i) - 1 in range(len(self.abbrev[key])):
break
return (abbrev, self.abbrev[key][int(i) - 1][1])
return None
# Find abbreviation using 'other_abbrev_sources', returning the pair
# (abbrev, expansion) or None.
def find_abbreviation_other(self, abbrev):
# TODO: check source[taxon]
for source in self.other_abbrev_sources:
url = re.sub('%A', urllib.quote(abbrev), source['url'])
wikipedia.output(u'Trying %s' % url)
f = urllib.urlopen(url)
r = re.sub('%A', re.escape(abbrev), source['re'])
m = re.search(r, f.read())
f.close()
if m:
e = self.unhtmlify(m.group(1))
self.user_add_abbreviation(abbrev, e)
return (abbrev, e)
return None
# User interface for finding abbreviation using Wikipedia, returning
# its expansion, or None.
def find_abbreviation_wiki(self, abbrev):
# See if there's a Wikipedia page for the abbrev.
pl = wikipedia.Page(self.site, abbrev)
if not pl.exists():
expansions = []
elif pl.isRedirectPage():
expansions = [wikipedia.Page(self.site, pl.getRedirectTarget())]
elif pl.isDisambig():
expansions = pl.linkedPages()
else:
expansions = []
for i in range(len(expansions)):
wikipedia.output(u'(%d) %s' % (i + 1, expansions[i].title()))
while True:
if expansions:
inp = wikipedia.input(u'Expansion for %s? [1-%d;aecq]'
% (abbrev, len(expansions)))
else:
inp = wikipedia.input(u'Expansion for %s? [aecq]'
% abbrev)
if inp == 'a':
abbrev = wikipedia.input(u'Enter new abbrev:')
return self.find_abbreviation(abbrev)
elif inp == 'e':
expansion = wikipedia.input(u'Enter expansion for %s:'
% abbrev)
self.user_add_abbreviation(abbrev, expansion)
return (abbrev, expansion)
elif re.match(r'[0-9]+$', inp) \
and int(inp) - 1 in range(len(expansions)):
expansion = expansions[int(inp) - 1].title()
self.user_add_abbreviation(abbrev, expansion)
return (abbrev, expansion)
elif inp == 'c':
return None
elif inp == 'q':
raise Error, "Quit requested"
elif inp == 'l':
for i in range(len(expansions)):
wikipedia.output(u'(%d) %s' % (i + 1, expansions[i]))
else:
wikipedia.output(
u'<number> = choose expansion;\n'
u'a = enter new abbreviation\n'
u'e = enter expansion\n'
u'c = continue (with no expansion for abbreviation)\n'
u'l = list expansions\n'
u'q = quit\n')
# Find expansion for abbreviation using all available methods,
# returning the pair (abbrev, expansion) or just abbrev if nothing
# found.
def find_abbreviation(self, abbrev):
if abbrev:
return self.find_abbreviation_in_store(abbrev) \
or self.find_abbreviation_other(abbrev) \
or self.find_abbreviation_wiki(abbrev) \
or (abbrev,)
else:
return ('',)
def wikify_abbreviation(self, expansion):
if 2 <= len(expansion):
return u'[[%s|%s]]' % (expansion[1], expansion[0])
else:
return expansion[0]
# 4. FINDING THE AUTHORITY FOR A TAXON
# 'format_authority' takes an 'authority', splits it into its
# component authorities, makes wikilinks for those components, and
# returns a wikitext string.
def format_authority(self, authority):
r = re.compile(r'^\(|, +[0-9]*| +[0-9]+| +in +| +and +|'
r' *\bex\.? +| +& +| +& +|\) *|'
r' +et al\.?')
abbrevs = r.split(authority)
joins = r.findall(authority)
expansions = map(self.wikify_abbreviation,
map(self.find_abbreviation, abbrevs))
return sum(x+y for x,y in zip(expansions, joins + ['']))
# 'find_authority' returns the authority for the given taxon. 'text'
# is the text of the Wikipedia article about that taxon.
def find_authority(self, taxon, text):
rank = self.rank_of_taxon(taxon, text)
subject = self.rank_to_subject.get(rank, 'High')
for source in self.auth_sources:
if 'taxon' in source and not \
re.search(r'(?m)^\| [a-z_]+ *= *\[\[%s' % source['taxon'], text):
continue
url = re.sub('%T', urllib.quote(taxon), source['url'])
url = re.sub('%S', subject, url)
url = re.sub('%R', rank, url)
wikipedia.output(u'Trying %s' % url)
f = urllib.urlopen(url)
r = re.sub('%T',
re.sub(r'\\? +', r'(?: +|</i> +<i>)', re.escape(taxon)),
source['re'])
m = re.search(r, f.read())
f.close()
if m:
return self.unhtmlify(m.group(1))
wikipedia.output(u'No authority found for %s' % taxon)
return None
# 5. UPDATING THE AUTHORITY FOR AN ARTICLE
kingdom_map = {
'Plant': 'Plantae',
'Animal': 'Animalia',
'Bacterium': 'Bacteria',
'Fungus': 'Fungi',
'Protist': 'Protista',
}
def kingdom(self, text):
m = re.search(r'(?m)^\| *regnum *= *\[\[([^\|\]]+)', text)
if m:
return self.kingdom_map.get(m.group(1), m.group(1))
else:
raise Error, "No kingdom found."
def rank_of_taxon(self, taxon, text):
if re.match(r'^[\w-]+ [\w-]+ [\w-]+$', taxon):
return 'subspecies'
elif re.match(r'^[\w-]+ [\w-]+$', taxon):
return 'species'
m = re.search(r'(?m)^\| *((?!name)[a-z_]+) *= *'
r'[ \']*\[*%s[^\w]\]*[ \']*$' % re.escape(taxon), text)
if not m:
raise Error, "Can't find taxon %s in taxobox" % taxon
return m.group(1)
kingdom_to_color = {
'Animalia': 'pink',
'Plantae': 'lightgreen',
'Fungi': 'lightblue',
'Archaea': 'darkgray',
'Protista': 'khaki',
'Bacteria': 'lightgrey',
}
# 'find_article' takes the name of an article to start looking at,
# and returns a Page object.
def find_article(self, article):
while True:
pl = wikipedia.Page(self.site, article)
if not pl.exists():
wikipedia.output(u"No page %s" % pl.title())
i = wikipedia.input(u"Redirect to:")
if not i:
raise Error, "Quit requested"
pl.put(u"#REDIRECT [[%s]]" % i,
u"nomialbot - redirecting scientific name %s to %s"
% (article, i))
article = i
elif pl.isRedirectPage():
article = pl.getRedirectTarget()
elif pl.isDisambig():
links = pl.linkedPages()
for i in range(len(links)):
wikipedia.output(u'(%d) %s' % (i + 1, links[i]))
inp = wikipedia.input(u'Choose which article? [1-%d]'
% len(links))
if re.match(r'[0-9]+$', inp) \
and int(inp) - 1 in range(len(links)):
article = links[int(inp) - 1].title()
else:
raise Error, "Quit requested"
else:
return pl
# 'add_authority_to_article' takes a Page object, a taxon and an
# authority. It adds the authority to that page.
def add_authority_to_article(self, pl, taxon, authority, expand = True):
text = pl.get()
text = self.tidy_taxobox(text)
if expand:
authority = self.format_authority(authority)
rank = self.rank_of_taxon(taxon, text)
kingdom = self.kingdom(text)
if rank == 'species':
test_param = 'binomial'
auth_param = 'binomial_authority'
elif rank == 'subspecies':
test_param = 'trinomial'
auth_param = 'trinomial_authority'
else:
test_param = rank
auth_param = rank + '_authority'
m = re.search('(?m)^\| *%s *=.*$' % re.escape(test_param), text)
if not m:
raise Error, "Can't find rank %s in %s" % (test_param, pl.title())
m1 = re.search(r'(?m)^\| *%s *= *(.*)' % re.escape(auth_param), text)
if not m1:
text = (text[:m.end(0)]
+ u'\n| %s = %s' % (auth_param, authority)
+ text[m.end(0):])
elif wikipedia.input(u'%s already has authority "%s". '
u'Replace? [yN]' % (taxon, m1.group(1))) == 'y':
text = (text[:m1.start(0)]
+ u'\n| %s = %s' % (auth_param, authority)
+ text[m1.end(0):])
wikipedia.showDiff(pl.get(), text)
if pl.get() != text and (self.noquery or (wikipedia.input(u"OK? [yN]") == 'y')):
pl.put(text, u'nomialbot - adding authority for %s %s'
% (taxon, authority))
def add_authority(self, article, taxon, authority, expand = True):
pl = self.find_article(article)
if pl:
self.add_authority_to_article(pl, taxon, authority, expand)
def find_and_add_authority(self, article, taxon, expand = True):
pl = self.find_article(article)
if not pl:
return
authority = self.find_authority(taxon, pl.get())
if authority:
self.add_authority_to_article(pl, taxon, authority, expand)
# 7. GENERAL TIDYING
subs = [
# Capitalize "Taxobox"
(r'{{taxobox', '{{Taxobox'),
# Italicise genus entry.
(r'(?m)^\| * genus *=[ \']*\[\[([^\]]+)\]\][ \']*$',
'| genus = \'\'[[\\1]]\'\''),
# Abbreviate genus in species entry.
(r'(?m)^\| *species *= *([\']*)([A-Z])[a-z]+ ([a-z]+)',
r'| species = \1\2. \3'),
# Supply missing genus abbrev in species entry.
(r'(?m)^(\| *genus *=[ \'\[]*([A-Z])[a-z]+[\] \']* *\n'
r'\| *species *=[ \']*)([a-z-]+[ \']*$)',
r'\1\2. \3'),
# Supply missing species entry.
(r'(?m)(^\| *genus *=.*\n)'
r'(\| * binomial *= *'
r'([A-Z])[a-z]+ ([a-z-]+))',
r"\1| species = '''''\3. \4'''''\n\2"),
# Italicise genus or species if it appears as the title.
(r'(?ms)^\| *name *= *([a-z -]+[a-z]) *(\n.*'
r'^\| *(?:genus|species) *=[ \'\[]*\1[ \'\]]*$)',
'| name = \'\'\\1\'\'\\2'),
# Bold genus if unlinked.
(r'(?m)^\| *genus *= *\'*(\w+)\'* *$',
"| genus = '''''\\1'''''"),
# Cut superfluous taxa.
(r'(?m)(?:^\| *(?!(?:regnum|phylum|divisio|classis|ordo|familia|genus|species))'
r'(?:super|sub|infra|nano)(?:regnum|phylum|divisio|classis|ordo|familia|genus|species) *=.*\n)+'
r'(^\| *(?:regnum|phylum|divisio|classis|ordo|familia|genus|species)'
r' *=.*\n)'
r'(?=^\| *[a-z]+ *=.*$)',
r'\1'),
]
conditional_subs = [
# Bold species entry if subject of article.
([r'(?m)^\| *binomial *='],
r'(?m)^\| *species *=[ \']*([^\]\'\}]+)[ \']*$',
'| species = \'\'\'\'\'\\1\'\'\'\'\''),
# Bold subspecies entry if subject of article.
([r'(?m)^\| *trinomial *='],
r'(?m)^\| *subspecies *=[ \']*([^\]\'\}]+)[ \']*$',
'| subspecies = \'\'\'\'\'\\1\'\'\'\'\''),
]
anticonditional_subs = [
# Supply missing binomial entry.
([r'(?m)^\| *binomial *=',
r'(?m)^\| *subspecies *='],
r'(?m)(^\| *genus *=[ \'\[]*([A-Z])([a-z]+)[ \'\]]*\n(?:.*\n)*'
r'(?m)^\| *species *=[ \']*\2. ([a-z-]+)[ \']*\n)',
r"\1| binomial = ''\2\3 \4''\n"),
([r'(?m)^\| *binomial *=',
r'(?m)^\| *subspecies *='],
r'(?m)(^\| *species *=[ \']*([A-Z][a-z]+ [a-z-]+)[ \']*\n)',
r"\1| binomial = ''\2''\n"),
]
def tidy_taxobox(self, text):
for s in self.subs:
text = re.sub(s[0], s[1], text)
for s in self.conditional_subs:
if all(re.search(c, text) for c in s[0]):
text = re.sub(s[1], s[2], text)
for s in self.anticonditional_subs:
if not any(re.search(c, text) for c in s[0]):
text = re.sub(s[1], s[2], text)
# Add FishBase reference.
if re.search(r'(?m)^\| *[a-z_]+ *= *'
r'\[\[(?:Actinopterygii|Chondrichthyes)\]\]$', text) \
and not re.search(r'{{FishBase', text):
m1 = re.search(r'(?m)^\| * genus *=[ \'\[]*'
r'([A-Z][a-z]+)[ \'\]]*$', text)
m2 = re.search(r'(?m)^\| species *=[ \']*'
r'(?:[A-Z]\. )?([a-z-]+)[ \']*$', text)
if m1 and m2:
ref = time.strftime('{{FishBase species | genus = %s | '
'species = %s | month = %%B | year = %%Y}}'
% (m1.group(1), m2.group(1)))
elif m1:
ref = time.strftime('{{FishBase genus | genus = %s | '
'month = %%B | year = %%Y}}'
% m1.group(1))
else:
ref = None
if ref:
m1 = re.search(r'==+ *References? *==+ *\n+', text)
m2 = re.search(r'(?:(?:{{.*-stub}}|\[\[[a-z][a-z]:.*\]\]'
r'|\[\[Category:.*\]\])[ \n]*)*$',
text)
if m1:
text = text[:m1.end(0)] \
+ '* ' + ref + '\n' \
+ text[m1.end(0):]
elif m2:
text = text[:m2.start(0)] \
+ '\n==References==\n* ' + ref + '\n' \
+ text[m2.start(0):]
else:
raise Error, "Nowhere to put FishBase reference"
return text
# 6. DISAMBIGUATION
# Run solve_disambiguation on all botanical abbreviations.
def disambiguate(self):
import solve_disambiguation
for a in self.abbrev.values():
for aa in a:
if aa[2] == 'b' and aa[0][-1] == '.':
bot = solve_disambiguation.DisambiguationRobot(
'0', [aa[1]], False, False, [aa[0]], False, True)
bot.run()
def badusage():
    """Raise an Error whose message is the command-line usage summary."""
    prog = sys.argv[0]
    raise Error('Usage:\n'
                '%s --rebuild Rebuild abbreviation table\n'
                '%s --query=abbrev Query abbreviation\n'
                '%s taxon Find authority and add it to taxon\n'
                '%s taxon authority Add authority to taxon\n'
                % (prog, prog, prog, prog))
def main():
    """Parse the command line and dispatch (see usage at top of file).

    Error is caught silently here: its constructor has already printed
    the message.
    """
    wikipedia.username = 'nomialbot'
    try:
        auth = Authority()
        article = None
        expand = True
        try:
            opts, args = getopt.getopt(sys.argv[1:], 'zdnra:q:',
                                       ['noexpand', 'rebuild', 'article=',
                                        'query=', 'disambig', 'noquery'])
            for opt, arg in opts:
                if opt in ('-q', '--query'):
                    print(auth.find_abbreviation(arg.decode()))
                elif opt in ('-r', '--rebuild'):
                    auth.rebuild_abbreviations()
                elif opt in ('-d', '--disambig'):
                    auth.disambiguate()
                elif opt in ('-a', '--article'):
                    article = arg
                elif opt in ('-n', '--noexpand'):
                    expand = False
                elif opt in ('-z', '--noquery'):
                    auth.noquery = True
                else:
                    badusage()
                    return
        except getopt.GetoptError:
            badusage()
            return
        # Positional arguments: TAXON [AUTHORITY].
        if len(args) == 1:
            auth.find_and_add_authority(article or args[0], args[0], expand)
        elif len(args) == 2:
            auth.add_authority(article or args[0], args[0], args[1], expand)
        else:
            badusage()
            return
    except Error:
        return
if __name__ == '__main__':
    try:
        main()
    finally:
        # Always shut down the pywikipedia framework, even on error.
        wikipedia.stopme()