# Script to process warmest100 data I collected myself # setwd("/home/daedalus/src/hottest100") load_data = function() { votes = read.csv("votedata.csv") songvotes = as.data.frame(table(song=c(paste(votes$artist, votes$track, sep=" - "))), responseName="votes") # Sort by votes, in descending order sorted = songvotes[with(songvotes, order(-votes, song)),] revsorted = songvotes[with(songvotes, order(votes, song)),] sorted$fract = sorted$votes/sum(sorted$votes) sorted$percent = sorted$votes/sum(sorted$votes) * 100 sorted$sd = sqrt(sorted$votes * (1-sorted$fract)) artists = as.data.frame(table(artist=votes$artist), responseName="votes") rankartists = artists[with(artists, order(-votes, artist)),] # The Warmest100 dataset fulldata = read.csv("Warmest100_FullList.csv") fullvotes = as.data.frame(table(song=fulldata$Song), responseName="votes") # Sort by votes, in descending order fullsorted = fullvotes[with(fullvotes, order(-votes, song)),] revfullsorted = fullvotes[with(fullvotes, order(votes, song)),] fullsorted$fract = fullsorted$votes/sum(fullsorted$votes) fullsorted$percent = fullsorted$votes/sum(fullsorted$votes) * 100 fullsorted$sd = sqrt(fullsorted$votes * (1-fullsorted$fract)) } plot_votes = function(sorteddata) { plot(1:length(sorteddata$votes[1:100]), sorteddata$votes[1:100], type="s", xlab="Song Rank", ylab="Votes", main="The Warmest 100, 2012", col="blue") } check_dist = function(x.norm) { # Check the distribution matches something h = hist(x.norm,breaks=15) xhist = c(min(h$breaks),h$breaks) yhist = c(0,h$density,0) xfit = seq(min(x.norm),max(x.norm),length=40) yfit = dnorm(xfit, mean=mean(x.norm), sd=sd(x.norm)) plot(xhist,yhist,type="s",ylim=c(0,max(yhist,yfit)), main="Histogram and pdf fit") lines(xfit,yfit, col="red") } check_qq = function(x.norm) { z.norm = (x.norm - mean(x.norm))/sd(x.norm) qqnorm(z.norm) abline(0,1) } fit_expo = function(data) { # Fit an exponential function to the data, and see how close it matches. h = hist(data,breaks=25) xhist = c(min(h$breaks),h$breaks) yhist = c(0,h$density,0) xfit = seq(min(data),max(data),length=length(data)) rate = fitdistr(data, "exponential") #rate$estimate = 0.35 yfit = dexp(xfit, rate=rate$estimate) plot(xhist,yhist,type="s",ylim=c(0,max(yhist,yfit)), main="Exponential Distribution Fit", xlab="Song Index", ylab="Votes") lines(xfit,yfit, col="red") } # Plot the first 2 overlaps plot_overlaps = function(data, rank1=1, compare_rank=2) { png( paste("Warmest100_place",rank1,"_place",compare_rank,".png",sep="") ) mu1 = data[rank1,]$votes sigma1 = data[rank1,]$sd startx1 = mu1 - 3 * sigma1 endx1 = mu1 + 3 * sigma1 f = function(x) { return(dnorm(x, mu1, sigma1)) } title = paste("Warmest 100 Probabilities,", "Rank", rank1, "vs. Rank", compare_rank) plot( startx1:endx1, dnorm(startx1:endx1, mu1, sigma1), type="l", col="blue", xlab="Votes", ylab="Normal Density Plots", main=title) for ( index in c(compare_rank) ) { mu2 = data[index,]$votes sigma2 = data[index,]$sd startx2 = mu2 - 4 * sigma2 endx2 = mu2 + 4 * sigma2 g = function(x) { return(dnorm(x, mu2, sigma2)) } lines( startx2:endx2, dnorm(startx2:endx2, mu2, sigma2), type="l", col="red") # Find the intersection of f and g fg = function(x) { return( f(x) - g(x) ) } intersect = uniroot( fg, c(mu2, mu1) )$root #print(intersect) # Calculate the probability that the votes swap places, i.e. vote1 is less than # the intersect point, and vote2 is more than that. prob1 = pnorm(intersect, mu1, sigma1) prob2 = 1 - pnorm(intersect, mu2, sigma2) print( paste(intersect, prob1, prob2, prob1 * prob2, prob1 + prob2) ) # Shade the area of overlap x.coords = c( seq(startx2, intersect, 0.1), seq(intersect, endx1, 0.1), endx1) y.coords = c( dnorm(seq(startx2, intersect, 0.1), mu1, sigma1), dnorm(seq(intersect, endx1, 0.1), mu2, sigma2), 0) polygon(x.coords, y.coords, col="skyblue", border=NA) # plot both means lines( c(mu1, mu1), c(0, dnorm(mu1, mu1, sigma1)), col="gray") lines( c(mu2, mu2), c(0, dnorm(mu2, mu2, sigma2)), col="gray") # intersection point lines( c(intersect, intersect), c(0, dnorm(intersect, mu1, sigma1)), col="black") #abline(v=intersect, col="lightgrey") text(intersect, 0.0001, as.integer(intersect), pos=4) #text(mu1 + 2 * sigma1, 0.015, paste( format( prob1 * prob2 * 100, nsmall=2, digits=2), "%", sep="")) } dev.off() } hilo = function(data) { for ( index in c(1:10) ) { mu = data[index,]$votes sigma = data[index,]$sd startx = mu - 2 * sigma endx = mu + 2 * sigma prob = pnorm(data[index+1,]$votes, mu, sigma) print( paste(index, mu, startx, endx, prob) ) } }