User:PresN/taxolistgenerator
WPTaxoListGenerator Python script, as of 2020-02-28.
Given a family name, it attempts to build and print a JSON structure of data from MSW3, Wikidata, the IUCN Red List, ADW, and Wikipedia, for use in a second script. It also prints the IUCN and ADW citation templates it collects along the way.
Only tested with a few animal families in Carnivora, so bugs are to be expected and bug reports are welcome. Bad Wikidata data will wreck it, and MSW3 is not always up to date.
Not packaged for easy reuse; it needs Python 3 to run and expects your system to have the Selenium, pydash, and BeautifulSoup modules installed.
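The dependencies can usually be installed with pip (assuming the standard PyPI package names; lxml is also needed for the IUCN parsing, and Selenium needs Firefox plus geckodriver available):
py -m pip install selenium pydash beautifulsoup4 lxml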
An alternate version that takes in a species name instead also exists.
The final JSON structure looks like the following (the top level is a list of subfamilies):
[
{
"Name": "Ailuropodinae",
"Lineages": [
{
"Genuses": [
{
"Name": "Ailuropoda",
"Namer": "Henri Milne-Edwards|H. Milne-Edwards",
"Founded": "1870",
"Species": [
{
"Name": "Giant panda",
"Latin": "A. melanoleuca",
"Namer": "Armand David|David",
"Founded": "1869",
"IUCNNumber": "712",
"IUCNCat": "VU",
"Population": "500-1,000",
"PopDirection": "up",
"Size": "",
"Habitat": "Forest",
"Hunting": "",
"Range": "File:Mapa distribuicao Ailuropoda melanoleuca.png",
"RangeAlt": "",
"Image": "File:Grosser Panda.JPG",
"ImageAlt": "",
"Subspecies": [
]
}
]
}
]
}
]
}
]
USAGE: py WPTaxoListGenerator.py <Family>
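For example, to build the data for the bear family (which would include the Ailuropodinae structure shown above):
py WPTaxoListGenerator.py Ursidae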
import re
import copy
import datetime
import time
import json
import sys
from datetime import timedelta
from datetime import date
import urllib.request
from urllib.parse import quote
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from pydash import _
from bs4 import BeautifulSoup
WD_SEARCH_URL = 'https://www.wikidata.org/w/api.php?action=wbsearchentities&language=en&format=json&limit=50&search='
WD_GET_URL = 'https://www.wikidata.org/wiki/Special:EntityData/'
WD_TAXON_CLAIM_ID = 'P225'
WD_TAXON_AUTHOR_CLAIM_ID = 'P405'
WD_MSW3_CLAIM_ID = 'P959'
WD_TAXO_AUTHOR_CITATION = 'P835'
WD_TAXO_DATE = 'P574'
WD_IUCN_ID = 'P627'
MSW3_URL = 'https://www.departments.bucknell.edu/biology/resources/msw3/browse.asp?id='
IUCN_URL = 'http://www.iucnredlist.org/details/'
WP_URL = 'https://en.wikipedia.org/wiki/'
ADW_URL = 'https://animaldiversity.org/accounts/'
CITES = []
###############
#
# WP_TAXO_LIST_GENERATOR - builds a json source file for an animal family list page
#
# USAGE: py WPTaxoListGenerator.py <Family>
#
###############
def main():
stop_count = 999 # lower this to force an early stop for testing purposes
input_family_name = sys.argv[1]
# Get wikidata family
family_data = search_for_taxo_data(input_family_name, 'family')
family_msw3_id = regular_claim(family_data, WD_MSW3_CLAIM_ID)
# Get MSW3 taxo subtree for this family
msw3_items = get_msw3_items(family_msw3_id)
taxo_state = "Family"
last_seen_genus = ''
last_seen_species = ''
print_family_start()
# For each MSW3 taxo item, find in wikidata and build json
# TODO: subspecies are not alphabetical in MSW3; the type subspecies comes first.
for item in msw3_items:
if stop_count > 0:
msw3_type = get_type_of_msw3_node(item)
msw3_latin = _.title_case(item.findChildren()[1].contents[0])
close_old_states(msw3_type, taxo_state)
taxo_state = msw3_type
if msw3_type == 'Subfamily':
print_subfamily_start()
print_subfamily(msw3_latin)
if msw3_type == 'Genus':
last_seen_genus = msw3_latin
taxo_claims = get_wd_taxo_claims(msw3_latin, msw3_type)
print_genus_start()
print_genus({
"name": _.get(taxo_claims, 'name'),
"latin": _.get(taxo_claims, 'latin'),
"authority": _.get(taxo_claims, 'authority')
})
if msw3_type == 'Species':
last_seen_species = msw3_latin
# Grab Wikidata data
taxo_claims = get_wd_taxo_claims(f'{last_seen_genus} {msw3_latin}', '')
# Grab IUCN data
iucn_data = get_iucn_data(_.get(taxo_claims, 'iucn_id'))
CITES.append(f'<ref name="{ref_name(_.get(taxo_claims, "name"))}IUCN">{_.get(iucn_data, "cite")}</ref>') # assumption: ref-name format; collects the IUCN citation for printing at the end
# Grab Wikipedia data
wp_data = get_wp_data(_.get(taxo_claims, 'name'))
# Grab ADW data
adw_data = get_adw_data(_.get(taxo_claims, 'full_latin'), _.get(taxo_claims, 'name'))
if _.get(adw_data, "cite"):
CITES.append(f'<ref name="{ref_name(_.get(taxo_claims, "name"))}ADW">{_.get(adw_data, "cite")}</ref>') # assumption: ref-name format; collects the ADW citation for printing at the end
print_species_start()
print_species({
"name": _.get(taxo_claims, 'name'),
"latin": _.get(taxo_claims, 'latin'),
"authority": _.get(taxo_claims, 'authority'),
"iucn_id": _.get(taxo_claims, 'iucn_id'),
"assessment": _.get(iucn_data, 'assessment'),
"pop_direction": _.get(iucn_data, 'pop_direction'),
"size": _.get(adw_data, 'size'),
"population": _.get(iucn_data, 'population'),
"habitat": _.get(iucn_data, 'habitat'),
"images": _.get(wp_data, 'images')
})
if msw3_type == 'Subspecies':
taxo_claims = get_wd_taxo_claims(f'{last_seen_genus} {last_seen_species} {msw3_latin}', '')
print_subspecies_start()
print_subspecies({
"name": _.get(taxo_claims, 'name'),
"latin": _.get(taxo_claims, 'latin'),
"authority": _.get(taxo_claims, 'authority')
})
stop_count = stop_count - 1
print_species_end()
print_genus_end()
print_subfamily_end()
print_family_end()
for cite in CITES:
print(cite)
####################################
##### Wikidata access functions ####
####################################
def get_wd_taxo_claims(msw3_full_latin, taxo_type_filter):
data = search_for_taxo_data(msw3_full_latin, taxo_type_filter)
name = get_wikidata_enwiki_name(data)
full_latin = msw3_full_latin
latin = get_abbr_latin(msw3_full_latin)
authority = get_authority(data)
iucn_id = regular_claim(data, WD_IUCN_ID)
return {"name": name, "latin": latin, "full_latin": full_latin, "authority": authority, "iucn_id": iucn_id}
def search_for_taxo_data(name, taxo_type_filter):
claim_id = search_for_name(name, taxo_type_filter)
return get_wikidata_json(claim_id)
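# Searches wbsearchentities for the name and returns the Q-id of the first result whose description starts with the
# requested rank; e.g. search_for_name('Ailuropoda melanoleuca', 'species') should return the giant panda's item id.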
def search_for_name(name, taxo_type_filter):
url = f"{WD_SEARCH_URL}{name.replace(' ', '%20')}"
page = urllib.request.urlopen(url).read()
results = json.loads(page)
if taxo_type_filter != '':
for result in results['search']:
if _.head(result['description'].split(' ')) == _.to_lower(taxo_type_filter):
return result['id']
else:
return _.head(results['search'])['id'] # Optimistically assume first result is right, since description isn't a sure thing
print(f"Unable to find {taxo_type_filter} named {name} in Wikidata!")
exit()
def get_wikidata_json(claim_id):
if claim_id:
url = f'{WD_GET_URL}{claim_id}.json'
page = urllib.request.urlopen(url).read()
results = json.loads(page)
return results['entities'][claim_id]
def get_wikidata_item_name(data):
return data['labels']['en']['value']
def get_wikidata_enwiki_name(data):
if 'enwiki' in data['sitelinks']:
return data['sitelinks']['enwiki']['title']
def get_claim(data, claim_id):
if claim_id in data['claims']:
return _.head(data['claims'][claim_id])
def regular_claim(data, claim_id):
claim = get_claim(data, claim_id)
if claim:
return claim['mainsnak']['datavalue']['value']
def embedded_claim(data, claim_id, subclaim_id):
claim = get_claim(data, claim_id)
if claim and 'qualifiers' in claim and subclaim_id in claim['qualifiers']:
return _.head(claim['qualifiers'][subclaim_id])['datavalue']['value']
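# Resolves the taxon author: the P405 qualifier on the P225 claim points at the author item, whose P835 value gives a
# short citation form, and the P574 qualifier gives the naming year. For the giant panda this yields 'Armand David',
# 'David', and '1869', matching the example output above.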
def get_authority(taxo_data):
authority_claim = embedded_claim(taxo_data, WD_TAXON_CLAIM_ID, WD_TAXON_AUTHOR_CLAIM_ID)
if authority_claim:
namer_data = get_wikidata_json(authority_claim['id'])
if namer_data:
namer = get_wikidata_enwiki_name(namer_data)
namer_short = clean_initials(regular_claim(namer_data, WD_TAXO_AUTHOR_CITATION))
if namer and not namer_short:
namer_short = _.last(namer.split(' '))
namer_date = embedded_claim(taxo_data, WD_TAXON_CLAIM_ID, WD_TAXO_DATE)['time'][1:5]
return {"namer": namer, "namer_short": namer_short, "namer_date": namer_date}
return {}
################################
##### MSW3 access functions ####
################################
def get_msw3_items(msw3_id):
msw3_url = f'{MSW3_URL}{msw3_id}'
msw3_page = urllib.request.urlopen(msw3_url).read()
soup = BeautifulSoup(msw3_page, "html.parser")
return soup.find('td', attrs={"width": "50%"}).find_all('p')[1:]
def get_type_of_msw3_node(element):
return _.title_case(str(element).split('>', 1)[1].split(' ', 1)[0])
################################
##### IUCN access functions ####
################################
def get_iucn_data(iucn_id):
iucn_soup = get_iucn_soup(iucn_id)
if iucn_soup:
assessment = get_iucn_assess(iucn_soup)
pop_direction = get_iucn_pop_direction(iucn_soup)
population = get_iucn_pop(iucn_soup)
habitat = get_iucn_habitat(iucn_soup)
cite = get_iucn_cite(iucn_soup)
return {"assessment": assessment, "pop_direction": pop_direction, "population": population, "habitat": habitat, "cite": cite}
return {}
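# The Red List species pages build their content with JavaScript, so a headless Firefox session (geckodriver required)
# is used instead of a plain urllib request, with a short sleep to let the page render.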
def get_iucn_soup(iucn_id):
iucn_url = f'{IUCN_URL}{iucn_id}/0'
options = Options()
options.headless = True
driver = webdriver.Firefox(options=options)
driver.get(iucn_url)
time.sleep(2)
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.close()
return soup
def get_iucn_assess(soup):
assessment_div = soup.find('div', 'species-category-scale')
if assessment_div:
return _.to_upper(_.find(assessment_div['class'], lambda x: _.starts_with(x, 'species-category-scale--')).split('--')[1])
def get_iucn_pop_direction(soup):
translator = {
'Decreasing': 'down',
'Increasing': 'up',
'Stable': 'stable',
'Unknown': 'unknown',
'Unspecified': 'unknown'
}
iucn_direction_block = soup.find('p', 'species-population-panel')
if iucn_direction_block:
iucn_direction = _.head(iucn_direction_block.contents)
return translator[iucn_direction]
return translator['Unspecified']
def get_iucn_pop(soup):
return soup.find_all('p', 'panel__data--key')[0].text
# TODO: rename "wetlands (inland)" -> "inland wetlands", swap "marine *", etc.
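# Joins the non-artificial habitats into prose, e.g. "Forest, Savanna, Grassland, Artificial/Terrestrial" becomes
# "Forest, savanna, and grassland".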
def get_iucn_habitat(soup):
text = soup.find_all('p', 'panel__data--key')[1].text
habitats = _.filter(_.to_lower(text).split(', '), lambda h: not _.starts_with(h, 'artificial'))
if len(habitats) == 1:
return _.capitalize(habitats[0])
if len(habitats) == 2:
return _.capitalize(habitats[0]) + ' and ' + habitats[1]
if len(habitats) > 2:
lower_bits = _.concat([_.capitalize(habitats[0])], habitats[1:])
return _.join(lower_bits[:-1], ', ') + ', and ' + habitats[-1]
def get_iucn_cite(soup):
cite_div = soup.find('div', 'layout-assessment__major')
if cite_div:
cite = cite_div.find('div', 'text-body').p.text
wp_cite = "{{cite iucn"
names = cite.split(' 20', 1)[0]
namebits = _.join(names.split(' & '), ', ')
nameparts = namebits.split(', ')
parts = {str(i): f' |{"last" if i % 2 == 0 else "first"}{i // 2 + 1}=' for i in range(30)} # i=0 -> ' |last1=', i=1 -> ' |first1=', ...
for i, part in enumerate(nameparts):
wp_cite += parts[str(i)] + clean_initials(part)
year = '20' + cite.split(' 20', 1)[1].split('. ', 1)[0]
title = cite.split(' 20', 1)[1].split('. ', 1)[1].split('.', 1)[0]
volume = cite.split('Threatened Species ')[1][0:4]
page = cite.split(': ', 1)[1].split('. http')[0]
try:
doi = cite.split('dx.doi.org/')[1].split('. Downloaded')[0].split('.  Downloaded')[0] # weird spacing issue: "Downloaded" is sometimes preceded by two spaces
except:
doi = ""
wp_cite += " |title=" + title.strip() + " |volume=" + volume + " |page=" + page + " |date=" + year + " |doi=" + doi + "}}"
return wp_cite
##############################
##### WP access functions ####
##############################
def get_wp_data(name):
soup = get_wp_soup(name)
if soup:
images = get_wp_images(soup)
return {"images": images}
return {}
def get_wp_soup(name):
if name:
url = f'{WP_URL}{urllib.parse.quote(name)}'
page = urllib.request.urlopen(url).read()
return BeautifulSoup(page, "html.parser")
def get_wp_images(soup):
infobox = soup.find('table', 'infobox biota')
if infobox:
rows = infobox.tbody.find_all('tr')
if rows[1].td and rows[1].td.find('a', 'image'):
image = rows[1].td.a.img
image_name = 'File:' + image['alt']
adjusted_width = int((180/int(image['height']))*int(image['width']))
else:
image_name = ''
adjusted_width = 999
for row in rows[2:]:
if row.td and row.td.find('a', 'image'):
range_map = row.td.a.img
map_name = 'File:' + range_map['alt']
adjusted_map_width = int((180/int(range_map['height']))*int(range_map['width']))
break
else:
map_name = ''
adjusted_map_width = 999
return {"image_name": image_name, "width_override": adjusted_width, "map_name": map_name, "map_width_override": adjusted_map_width}
return {}
###############################
##### ADW access functions ####
###############################
def get_adw_data(full_latin, name):
soup = get_adw_soup(full_latin)
if soup:
size = get_adw_size(soup, name)
if size:
cite = get_adw_cite(soup)
return {"size": size, "cite": cite}
return {}
def get_adw_soup(full_latin):
url = f'{ADW_URL}{urllib.parse.quote(full_latin)}'
try:
page = urllib.request.urlopen(url).read()
return BeautifulSoup(page, "html.parser")
except:
pass
def get_adw_size(soup, name):
texts = soup.find_all('p')
for raw_text in texts:
text = raw_text.getText()
if has_body_text(text):
body = get_body_size(text)
tail = get_tail_size(text)
if body and tail:
return f'{body[0]} to {body[1]} cm long, plus {tail[0]} to {tail[1]} cm tail<ref name="{ref_name(name)}ADW"/>' # assumption: size ranges in cm plus a reuse of the ADW ref
def has_body_text(text):
return 'head and body range from' in text or 'head and body length ranges from' in text or 'head and body length is' in text or 'head and body length ranging from' in text or 'body length ranges from' in text
def get_body_size(text):
if 'head and body range from' in text:
body = find_lengths('head and body range from', text)
elif 'head and body length ranges from' in text:
body = find_lengths('head and body length ranges from', text)
elif 'head and body length is' in text:
body = find_lengths('head and body length is', text)
elif 'head and body length ranging from' in text:
body = find_lengths('head and body length ranging from', text)
elif 'body length ranges from' in text:
body = find_lengths('body length ranges from', text)
if body:
return [mm_to_cm(body.group('len1'), 'down'), mm_to_cm(body.group('len2'), 'up')]
return ['', '']
def get_tail_size(text):
tail = None # guard: some ADW texts describe body length but have no recognisable tail phrase
if 'tail measures' in text:
tail = find_lengths('tail measures', text)
elif 'tail length is' in text:
tail = find_lengths('tail length is', text)
elif 'tail length of' in text:
tail = find_lengths('tail length of', text)
if tail:
return [mm_to_cm(tail.group('len1'), 'down'), mm_to_cm(tail.group('len2'), 'up')]
return ['', '']
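# Regex over the ADW prose, e.g. find_lengths('tail measures', 'The tail measures 100 to 200 mm.') captures
# len1='100' and len2='200'.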
def find_lengths(prefix_text, text):
return re.search(prefix_text + r' (?P<len1>[0-9.]+?) (mm |cm )?to (?P<len2>[0-9.]+?) (mm|cm)?\.', text)
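# Rounds a millimetre figure to whole centimetres, down for range minimums and up for maximums; values that already
# contain a decimal point are assumed to be cm, e.g. mm_to_cm('128', 'down') -> '12' and mm_to_cm('128', 'up') -> '13'.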
def mm_to_cm(mm, direction):
if '.' in mm: # definitely cm already
return mm
else:
if direction == 'down':
return str(int(mm) // 10)
else:
return str((int(mm) + 9) // 10)
def get_adw_cite(soup):
wp_cite = "{{cite web"
name_div = soup.find('div', 'byline').text
names = name_div.split('By ')[1]
namebits = _.join(_.join(names.split(' & '), ' ').split(', '), ' ')
nameparts = namebits.split(' ')
parts = {str(i): f' |{"first" if i % 2 == 0 else "last"}{i // 2 + 1}=' for i in range(30)} # i=0 -> ' |first1=', i=1 -> ' |last1=', ...
for i, part in enumerate(nameparts):
wp_cite += parts[str(i)] + clean_initials(part)
adw_citation = soup.find('div', 'page-citation').p.text
year = '20' + adw_citation.split(' 20', 1)[1].split('. ', 1)[0]
title = adw_citation.split(' "', 1)[1].split('" ', 1)[0]
url = 'https://animaldiversity.org/accounts/' + _.join(title.split(' '), '_') + '/'
today = date.today()
accessdate = f'{today:%B} {today.day}, {today.year}'
wp_cite += " |title=" + title + " |url=" + url + " |date=" + year + " |website=Animal Diversity Web |publisher=University of Michigan" + " |accessdate=" + accessdate + "}}"
return wp_cite
#################################
#### Util/Printing functions ####
#################################
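# e.g. get_abbr_latin('Ailuropoda melanoleuca') -> 'A. melanoleuca', get_abbr_latin('Ailuropoda melanoleuca melanoleuca')
# -> 'A. m. melanoleuca', and a bare genus name is returned unchanged.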
def get_abbr_latin(msw3_full_latin):
parts = msw3_full_latin.split(' ')
if len(parts) == 3: # Subspecies
return f'{parts[0][0]}. {_.to_lower(parts[1][0])}. {_.to_lower(parts[2])}'
if len(parts) == 2: # Species
return f'{parts[0][0]}. {_.to_lower(parts[1])}'
return parts[0] # Genus
def ref_name(name):
if name:
return name.replace(" ", "")
return name
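# Spaces out run-together author initials, e.g. clean_initials('J.A.Allen') -> 'J. A. Allen'; names without
# adjacent initials pass through unchanged.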
def clean_initials(name):
if name and re.search(r'[A-Z]\.[A-Z]', name):
return _.join(name.split('.'), '. ').strip()
return name
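# Emits the closing brackets needed when the MSW3 walk drops from one rank back to a higher (or sibling) one,
# adding a trailing comma when another entry at that level follows.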
def close_old_states(new_state, old_state): # TODO numbered level hierarchy could clean this up
if old_state == 'Subspecies' and new_state == 'Subspecies':
print_subspecies_end(',')
if old_state == 'Subspecies' and new_state == 'Species':
print_subspecies_end()
print_species_end(',')
if old_state == 'Species' and new_state == 'Species':
print_species_end(',')
if old_state == 'Species' and new_state == 'Genus':
print_species_end()
print_genus_end(',')
if old_state == 'Subspecies' and new_state == 'Genus':
print_subspecies_end()
print_species_end()
print_genus_end(',')
if old_state == 'Species' and new_state == 'Subfamily':
print_species_end()
print_genus_end()
print_subfamily_end(',')
if old_state == 'Subspecies' and new_state == 'Subfamily':
print_subspecies_end()
print_species_end()
print_genus_end()
print_subfamily_end(',')
def print_family_start():
print('[')
def print_subfamily_start():
print(' {')
def print_subfamily(name):
print(f' "Name": "{name}",')
print(' "Lineages": [') # TODO handle tribes
print(' {')
print(' "Genuses": [')
def print_genus_start():
print(' {')
def print_genus(details):
print(f' "Name": "{_.get(details, "latin")}",') # TODO: deal with cases like Eira (genus)|Eira
if _.get(details, "authority"):
if _.get(details, "authority.namer_short"):
print(f' "Namer": "{_.get(details, "authority.namer")}|{_.get(details, "authority.namer_short")}",')
else:
print(f' "Namer": "{_.get(details, "authority.namer")}",')
print(f' "Founded": "{_.get(details, "authority.namer_date")}",')
print(' "Species": [')
def print_species_start():
print(' {')
def print_species(details):
print(f' "Name": "{_.get(details, "name")}",')
print(f' "Latin": "{_.get(details, "latin")}",')
if _.get(details, "authority"):
if _.get(details, "authority.namer_short"):
print(f' "Namer": "{_.get(details, "authority.namer")}|{_.get(details, "authority.namer_short")}",')
else:
print(f' "Namer": "{_.get(details, "authority.namer")}",')
print(f' "Founded": "{_.get(details, "authority.namer_date")}",')
print(f' "IUCNNumber": "{_.get(details, "iucn_id")}",')
print(f' "IUCNCat": "{_.get(details, "assessment")}",')
print(f' "Population": "{_.get(details, "population")}",')
print(f' "PopDirection": "{_.get(details, "pop_direction")}",')
if _.get(details, "size"):
print(f' "Size": "{_.get(details, "size")}",')
else:
print(' "Size": "",')
print(f' "Habitat": "{_.get(details, "habitat")}",')
print(' "Hunting": "",')
print(f' "Range": "{_.get(details, "images.map_name")}",')
print(' "RangeAlt": "",')
if _.get(details, "images.map_width_override") and _.get(details, "images.map_width_override") < 180:
print(f' "RangeSizeOverride": "{_.get(details, "images.map_width_override")}",')
print(f' "Image": "{_.get(details, "images.image_name")}",')
print(' "ImageAlt": "",')
if _.get(details, "images.width_override") and _.get(details, "images.width_override") < 180:
print(f' "ImageSizeOverride": "{_.get(details, "images.width_override")}",')
print(' "Subspecies": [')
def print_subspecies_start():
print(' {')
def print_subspecies(details): # TODO: Handle finding "name" if no link?
if _.get(details, "name"):
if _.last(_.get(details, "name").split(' ')) == _.last(_.get(details, "latin").split(' ')): # wiki article at latin name
print(' "Name": "",')
print(f' "Latin": "[[{_.get(details, "name")}|{_.get(details, "latin")}]]"')
else:
print(f' "Name": "[[{_.get(details, "name")}]]",')
print(f' "Latin": "{_.get(details, "latin")}"')
else:
print(' "Name": "",')
print(f' "Latin": "{_.get(details, "latin")}"')
def print_subspecies_end(term=''):
print(' }' + term)
def print_species_end(term=''):
print(' ]')
print(' }' + term)
def print_genus_end(term=''):
print(' ]')
print(' }' + term)
def print_subfamily_end(term=''): # TODO assumes 1 subfamily per lineage
print(' ]')
print(' }') # subfamily end
print(' ]')
print(' }' + term) #lineage end
def print_family_end():
print(']')
if __name__ == '__main__':
main()