User:Dispenser/Link scraper
Appearance
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This is a concept for a user-editable system for retrieving metadata from newspaper sites.
The following functions are available:
hre() - HTML Regular Expression match, case insensitive
fmttime() - Same as strftime(), but uses gmtime() instead of localtime()
pageurl() - returns the URL of the current link
pagetitle() - returns the title of the page as given by beautifulsoup
"""
sites = (
nytime-abstract = {
# New York Times Abstracts (pre-1987)
urlmatch: r'http://.*nytimes\.com/.*abstract.*',
template: "cite web",
parameters: {
url: pageurl(),
title: hre(r'<h1>(.*?)</h1>', '\1')
publisher: "New York Times",
date: hre(r'<meta name="WT.z_pud" content="\d{4}\d{2}\d{2}"[^<>]>', r'\1-\2-\3'),
page: hre(r'<\w+[^<>]*note[^<>]*>Page (\d+).*?</p>', r'\1'),
author: hre(r'<\w+[^<>]*note[^<>]*>By .*?</p>', r'\1'),
accessdate: fmttime("%Y-%m-%d"),
}
},
nytime-metadata = {
# New York Times with metadata
# http://open.blogs.nytimes.com/2007/10/23/messing-around-with-metadata/
urlmatch: r'http://(www\.)?nytimes\.com/.*\.html',
template: "cite web",
parameters: {
url: pageurl(),
title: hre(r'<meta name="hdl" content="([^"]*)">', r'\1'),
publisher: hre(r'<meta name="cre" content="([^"]*)">', r'\1'),
#location: hre(r'<meta name="geo" content="([^"]*)"[^<>]>', r'\1'),
# Place of the news source... of the building or where it happend?
date: hre(r'<meta name="pdate" content="(\d{4})(\d{2})(\d{2})">', r'\1-\2-\3'),
author: hre(r'<meta name="byl" content="by *([^"]*)">', r'\1'),
accessdate: fmttime("%Y-%m-%d"),
}
},
)
#end