Jump to content

User:Dispenser/Link scraper

From Wikipedia, the free encyclopedia
#!/usr/bin/python
# -*- coding: utf-8  -*-
"""
This is a concept for a user-editable system for retrieving metadata from newspaper sites.

The following functions are available:
	hre()		- HTML regular expression match, case-insensitive
	fmttime()	- Same as strftime(), but uses gmtime() instead of localtime()
	pageurl()	- returns the URL of the current link
	pagetitle()	- returns the title of the page as given by beautifulsoup
"""
sites = (
	nytime-abstract = {
		# New York Times Abstracts (pre-1987)
		urlmatch: r'http://.*nytimes\.com/.*abstract.*',
		template: "cite web", 
		parameters: {
			url:        pageurl(),
			title:      hre(r'<h1>(.*?)</h1>', '\1')
			publisher:	"New York Times",
			date:       hre(r'<meta name="WT.z_pud" content="\d{4}\d{2}\d{2}"[^<>]>', r'\1-\2-\3'),
			page:       hre(r'<\w+[^<>]*note[^<>]*>Page (\d+).*?</p>', r'\1'),
			author:     hre(r'<\w+[^<>]*note[^<>]*>By .*?</p>', r'\1'),
			accessdate: fmttime("%Y-%m-%d"),
		}
	},
	nytime-metadata = {
		# New York Times with metadata
		# http://open.blogs.nytimes.com/2007/10/23/messing-around-with-metadata/
		urlmatch: r'http://(www\.)?nytimes\.com/.*\.html',
		template: "cite web", 
		parameters: {
			url:        pageurl(),
			title:      hre(r'<meta name="hdl" content="([^"]*)">', r'\1'),
			publisher:	hre(r'<meta name="cre" content="([^"]*)">', r'\1'),
			#location:	hre(r'<meta name="geo" content="([^"]*)"[^<>]>', r'\1'),
			# Place of the news source... of the building or where it happend?
			date:       hre(r'<meta name="pdate" content="(\d{4})(\d{2})(\d{2})">', r'\1-\2-\3'),
			author:     hre(r'<meta name="byl" content="by *([^"]*)">', r'\1'),
			accessdate: fmttime("%Y-%m-%d"),
		}
	},
)
#end