File talk:NZ opinion polls 2009-2011 -smallparties.png

New code (from Trevva) - small modification for 1 decimal place labels in the small party graph.
rm(list=ls())

#==========================================
#Parameters
major.parties <- FALSE
if(major.parties) {
  selected.parties <- c("Green","Labour","National")   #use precise names from Table headers
  ylims <- c(0,65)   #Vertical range
  output.fname <- "NZ_opinion_polls_2009-2011 -parties.png"
} else {  #Small parties - please use "Maori" for the Maori party
  selected.parties <- c("ACT","Maori","NZ First","United Future","Mana")   #use precise names from Table headers
  ylims <- c(0,6)   #Vertical range
  output.fname <- "NZ opinion polls 2009-2011 -smallparties.png"

}

#==========================================
#Shouldn't need to edit anything below here

#Misc preparation
selected.parties <- gsub(" ","_",selected.parties)  #Handle the space in some names

#Load the complete HTML file into memory
html <- readLines(url("http://en.wikipedia.org/wiki/Opinion_polling_for_the_New_Zealand_general_election,_2011",encoding="UTF-8"))
closeAllConnections()

#Extract the opinion poll data table
tbl.no <- 2
tbl <- html[(grep("<table.*",html)[tbl.no]):(grep("</table.*",html)[tbl.no])]

#Now split it into the rows, based on the <tr> tag
tbl.rows <- list()
open.tr <- grep("<tr",tbl)
close.tr <- grep("</tr",tbl)
for(i in 1:length(open.tr)) tbl.rows[[i]] <- tbl[open.tr[i]:close.tr[i]]

#Extract table headers
hdrs <- grep("<th",tbl,value=TRUE)
hdrs <- hdrs[1:(length(hdrs)/2)]
party.names <- gsub("<.*?>","",hdrs)[-c(1:2)]
party.names <- gsub(" ","_",party.names)  #Replace space with a _
party.names <- gsub("M.{1}ori","Maori",party.names)  #Apologies, but the hard "a" is too hard to handle otherwise
party.cols   <- gsub("^.*bgcolor=\"(.*?)\".*$","\\1",hdrs)[-c(1:2)]
names(party.cols) <- party.names

#Extract data rows
tbl.rows <- tbl.rows[sapply(tbl.rows,function(x) length(grep("<td",x)))>1]

#Now extract the data
survey.dat <- lapply(tbl.rows,function(x) {
  #Start by only considering where we have <td> tags
  td.tags <- x[grep("<td",x)]
  #Polling data appears in columns 3-10
  dat     <- td.tags[3:11]
  #Now strip the data and covert to numeric format
  dat     <- gsub("<td>|</td>","",dat)
  dat     <- gsub("%","",dat)
  dat     <- gsub("-","0",dat)
  dat     <- gsub("<","",dat)
  dat     <- as.numeric(dat)
  names(dat) <- party.names
  #Getting the date strings is a little harder. Start by tidying up the dates
  date.str <- td.tags[2]                        #Dates are in the second column
  date.str <- gsub("<sup.*</sup>","",date.str)   #Throw out anything between superscript tags, as its an reference to the source
  date.str <- gsub("<td>|</td>","",date.str)  #Throw out any tags
  #Get numeric parts of string
  digits.str <- gsub("[^0123456789]"," ",date.str)
  digits.str <- gsub("^ +","",digits.str)    #Drop leading whitespace
  digits     <- strsplit(digits.str," +")[[1]]
  yrs        <- grep("[0-9]{4}",digits,value=TRUE)
  days       <- digits[!digits%in%yrs]
  #Get months
  month.str <- gsub("[^A-Z,a-z]"," ",date.str)
  month.str <- gsub("^ +","",month.str)        #Drop leading whitespace
  mnths     <- strsplit(month.str," +",month.str)[[1]]
  #Now paste together to make standardised date strings
  days  <- rep(days,length.out=2)
  mnths <- rep(mnths,length.out=2)
  yrs <- rep(yrs,length.out=2)
  dates.std <- paste(days,mnths,yrs)
#  cat(sprintf("%s\t -> \t %s, %s\n",date.str,dates.std[1],dates.std[2]))
  #And finally the survey time
  survey.time <- mean(as.POSIXct(strptime(dates.std,format="%d %B %Y")))
  #Get the name of the survey company too
  survey.comp <- td.tags[1]
  survey.comp <- gsub("<sup.*</sup>","",survey.comp)
  survey.comp <- gsub("<td>|</td>","",survey.comp)
  survey.comp <- gsub("<U+2013>","-",survey.comp,fixed=TRUE)
  survey.comp <- gsub("(?U)<.*>","",survey.comp,perl=TRUE)

  #And now return results
  return(data.frame(Company=survey.comp,Date=survey.time,date.str,t(dat)))
})

#Combine results
surveys <- do.call(rbind,survey.dat)

#Restrict plot(manually) to selected parties
selected.parties <- sort(selected.parties)
selected.cols <- party.cols[selected.parties]
polls   <- surveys[,c("Company","Date",selected.parties)]
polls <- subset(polls,!is.na(surveys$Date))
polls <- polls[order(polls$Date),]
polls$date.num  <- as.double(polls$Date)

#Setup plot
ticks <- ISOdate(c(rep(2009,2),rep(2010,2),rep(2011,2),2012),c(rep(c(1,7),3),1),1)
xlims <- range(c(ISOdate(2008,11,1),ticks))
png(output.fname,width=778,height=487,pointsize=16)
par(mar=c(5,4,1,1))
matplot(polls$date.num,polls[,selected.parties],pch=NA,xlim=xlims,ylab="Party support (%)",
    xlab="",col=selected.cols,xaxt="n",ylim=ylims,yaxs="i")
abline(h=seq(0,95,by=5),col="lightgrey",lty=3)
abline(v=as.double(ticks),col="lightgrey",lty=3)
box()
axis(1,at=as.double(ticks),labels=format(ticks,format="1 %b\n%Y"),cex.axis=0.8)
axis(4,at=axTicks(4),labels=rep("",length(axTicks(4))))

#Now calculate the loess smoothers and add the confidence interval
smoothed <- list()
predict.x <- seq(min(polls$date.num),max(polls$date.num),length.out=100)
for(i in 1:length(selected.parties)) {
  smoother <- loess(polls[,selected.parties[i]] ~ polls[,"date.num"],span=0.5)
  smoothed[[i]] <- predict(smoother,newdata=predict.x,se=TRUE)
  polygon(c(predict.x,rev(predict.x)),
    c(smoothed[[i]]$fit+smoothed[[i]]$se.fit*1.96,rev(smoothed[[i]]$fit-smoothed[[i]]$se.fit*1.96)),
    col=rgb(0.5,0.5,0.5,0.5),border=NA)
}
names(smoothed) <- selected.parties
#Then add the data points
matpoints(polls$date.num,polls[,selected.parties],pch=20,col=selected.cols)
#And finally the smoothers themselves
for(i in 1:length(selected.parties)) {
  lines(predict.x,smoothed[[i]]$fit,col=selected.cols[i],lwd=2)
}

#Add election date too
#abline(v=election.date,lwd=4)
#text(election.date,0,format(election.date,"%d %b %Y"),srt=90,pos=4)

legend("bottom",legend=gsub("_"," ",selected.parties),col=selected.cols,pch=20,bg="white",lwd=2,horiz=TRUE,inset=-0.225,xpd=NA)
#Add best estimates
for(i in 1:length(smoothed)) {
if(major.parties) {
  lbl <- sprintf("%2.0f±%1.0f %%",round(rev(smoothed[[i]]$fit)[1],0),round(1.96*rev(smoothed[[i]]$se.fit)[1],0))
} else{
  lbl <- sprintf("%2.1f±%1.1f %%",round(rev(smoothed[[i]]$fit)[1],1),round(1.96*rev(smoothed[[i]]$se.fit)[1],1))
}
  text(rev(polls$date.num)[1],rev(smoothed[[i]]$fit)[1],labels=lbl,pos=4,col=selected.cols[i])
}
dev.off()

cat("Complete.\n")
Ridcully Jack (talk) 02:51, 14 August 2011 (UTC)[reply]