library(RCurl)
library(reshape)
library(htmltab)
library(ggplot2)
library(stringr)
library(scales)
# Download the Wikipedia page and read the poll tables from it.
# NOTE(review): ssl.verifyPeer=FALSE presumably disables TLS certificate
# verification for this request -- confirm that this is intended.
theurl <- getURL("https://en.wikipedia.org/wiki/Nationwide_opinion_polling_for_the_Democratic_Party_2016_presidential_primaries", ssl.verifyPeer=FALSE)

# The 3rd and 4th HTML tables are read. The first six columns of the former
# and columns 1-5 and 7 of the latter are kept and given identical names so
# that the two tables can be stacked.
table <- htmltab(theurl, which = 3)
table2 <- htmltab(theurl, which = 4)
table <- table[, 1:6]
table2 <- table2[, c(1:5, 7)]
names(table) <- c("Source", "Size", "Error", "Date", "Clinton", "Sanders")
names(table2) <- names(table)
df <- rbind(table, table2)

# Turn the candidates' percentage strings ("45%") into proportions (0.45).
# Columns are addressed by name: the data frame has only six columns, so a
# positional index past 6 fails with "subscript out of bounds".
# The scraped Error column is not converted because it is replaced below.
for (candidate in c("Clinton", "Sanders")) {
  df[[candidate]] <- as.numeric(sub("%", "", df[[candidate]])) / 100
}

# Reduce date ranges to a single date, once for the en dash and once for the
# ASCII hyphen:
#   1. "January 1–3, 2016" becomes "January 3, 2016" (day range in one month);
#   2. whatever still precedes a dash (e.g. a range spanning two months) is
#      dropped.
df$Date <- sub("[0-9]+\\s*–\\s*([0-9]+)", "\\1", df$Date)
df$Date <- sub(".*–", "", df$Date)
df$Date <- sub("[0-9]+\\s*-\\s*([0-9]+)", "\\1", df$Date)
df$Date <- sub(".*-", "", df$Date)
df$Date <- trimws(df$Date)
# NOTE: %B matches full month names in the current locale.
df$Date <- as.Date(df$Date, format = "%B %d, %Y")

# Sample sizes may contain thousands separators; gsub() removes all of them.
df$Size <- as.numeric(gsub(",", "", df$Size))
# Error is derived from the sample size only: 1 / sqrt(n).
df$Error <- 1 / sqrt(df$Size)

# Long format: one row per poll and candidate.
mdata <- melt(df, id.vars = c("Date", "Source", "Error", "Size"))
names(mdata)[5:6] <- c("Candidate", "Support")

colors <- c("#D4AA00", "#228b22")
labels <- c("Clinton", "Sanders")
results <- mdata
# breaks() returns up to n roughly evenly spaced numbers between x and y
# whose squares are multiples of p.
# The function is used to compute the breaks of the size legend.
#
# Arguments:
#   x, y  lower and upper limit of the range
#   n     number of values requested
#   p     the squared values are multiples of p
# Returns a numeric vector. Duplicates are removed, so it can be shorter
# than n.
breaks <- function(x, y, n, p) {
  # Move the end points to values whose squares are multiples of p: the lower
  # one is rounded up, the upper one is rounded down.
  x <- sqrt(ceiling(as.integer(x^2) / p) * p)
  y <- sqrt(floor(as.integer(y^2) / p) * p)
  s <- seq(x, y, length.out = n)
  # Only interior points need rounding. For n < 3 there are none; 2:(n - 1)
  # would then count downwards and write past the end of s, producing NA.
  if (n > 2) {
    inner <- 2:(n - 1)
    s[inner] <- sqrt(round(s[inner]^2 / p) * p)
  }
  unique(s)
}
# Build the plot: one point per poll and candidate plus a smoothed trend line
# per candidate. Point size and smoothing weight are both 1/Error; the size
# legend squares the break values before formatting them as labels.
d <- ggplot(
  data = results,
  mapping = aes(
    x = Date,
    y = Support,
    colour = Candidate,
    size = 1 / Error,
    weight = 1 / Error
  )
) +
  geom_point(alpha = 0.7) +
  geom_smooth(span = 0.8, show.legend = FALSE) +
  scale_colour_manual(values = colors) +
  labs(title = "Nationwide opinion polling for the 2016 Democratic Party primaries") +
  scale_size_area(
    name = "Sample Size",
    max_size = 3,
    # 5 break values whose squares are divisible by 100
    breaks = function(lims) breaks(lims[1], lims[2], 5, 100),
    labels = function(v) comma_format()(v^2)
  ) +
  # y axis: major grid every 10 points, minor grid every 2, shown from 20%
  # to 70%
  scale_y_continuous(
    limits = c(0.2, 0.7),
    breaks = seq(0, 1, 0.1),
    minor_breaks = seq(0, 1, 0.02),
    labels = percent
  ) +
  # x axis: ticks on the 1st and the 15th of each month and on the last day
  scale_x_date(
    limits = c(as.Date("2016/1/1"), as.Date("2016/7/28")),
    breaks = sort(c(
      seq(as.Date("2016/1/1"), as.Date("2016/7/28"), by = "month"),
      seq(as.Date("2016/1/15"), as.Date("2016/7/28"), by = "month"),
      as.Date("2016/7/28")
    )),
    labels = date_format("%b %d")
  ) +
  theme(
    panel.grid.minor = element_line(size = 0.2),
    panel.grid.major = element_line(size = 0.6)
  )
# Save the plot as "dp.svg" (9 x 5 inches, transparent background).
svg(
  filename = "dp.svg",
  width = 9,
  height = 5,
  pointsize = 12,
  bg = "transparent"
)
# Print explicitly: a bare `d` is only drawn through top-level auto-printing,
# which does not happen when the script is run with source(); the SVG would
# then be written without the plot.
print(d)
dev.off()