Machine Learning/brief statistics r
Jump to navigation
Jump to search
R code from A Brief Tour of Statistics[edit]
b_prob = c(0.05,0.5,0.35,0.1) b_names = c("swish","basket","brick","air ball") b_colors = c(2,2,1,1) b_in_prob = c(0.55,0.45) b_in_names = c("in","out") b_in_colors = c(2,1) png("basketball_shots.png") barplot(b_prob,names.arg=b_names,legend.text=TRUE,ylim=c(0,1),col=b_colors,ylab="Probability") dev.off() png("basketball_in.png") barplot(b_in_prob,names.arg=b_in_names,legend.text=TRUE,ylim=c(0,1),col=b_in_colors,ylab="Probability") dev.off() apple_lambda = 1181.25/(8*30) # based on http://answers.yahoo.com/question/index?qid=20090205070515AAWn8PL 7.5 bushels * 45 lbs./bushel * 3.5 apples/lb. # growing season around 8 months apple_range = (qpois(0.001,lambda=apple_lambda):qpois(0.999,lambda=apple_lambda)) apple_prob = dpois(apple_range,lambda=apple_lambda) png("apple_distribution.png") barplot(apple_prob,names.arg=apple_range,legend.text=TRUE,xlab="Apples per tree per day",ylab="Probability") dev.off() # Bus waiting times are Poisson bus_lambda = 10 bus_range = (0:qpois(0.999,lambda=bus_lambda)) bus_prob = dpois(bus_range,lambda=bus_lambda) png("bus_distribution.png",width=10*300,height=2*300) barplot(bus_prob,names.arg=bus_range,legend.text=TRUE,xlab="Wait time",ylab="Probability") dev.off() # Rainfall is lognormal avg_rainfall = 4.4 rain_amt = seq(0,50,by=0.01) rain_prob = dlnorm(rain_amt,meanlog=log(avg_rainfall)) png("rain_distribution.png") plot(rain_amt,rain_prob,type="l",xlab="Rain (inches)",ylab="Probability density") dev.off() png("rain_distribution_1020.png") plot(rain_amt,rain_prob,type="l",xlab="Rain (inches)",ylab="Probability density") for(rain in (seq(10,20,by=0.01))) { points(rep(rain,2),c(0,dlnorm(rain,meanlog=log(avg_rainfall))),type="l",col="grey") } points(rain_amt,rain_prob,type="l") dev.off() ## Balls in bins ball_color=c("yellow","green","blue","red","white") ball_label=c("yellow","green","blue","red","child") ball_probability=c(0.32,0.24,0.18,0.16,0.1) png("ball_distribution.png") barplot(ball_probability,names.arg=ball_label,legend.text=TRUE,xlab="Ball color",ylab="Probability",col=ball_color) dev.off() png("ball_distribution2.png") barplot(c(0.32,0.68),names.arg=c("yellow","non-yellow"),legend.text=TRUE,xlab="Ball color",ylab="Probability",col=c("yellow","white")) dev.off() ## Distribution of number of baskets n = 15 basket_range = (qbinom(0.001,size=n,prob=b_in_prob[1]):qbinom(0.999,size=n,prob=b_in_prob[1])) basket_prob = dbinom(basket_range,size=n,prob=b_in_prob[1]) png(sprintf("basket_distribution_%d.png",n)) barplot(basket_prob,names.arg=basket_range,legend.text=TRUE,xlab="Number of baskets",ylab="Probability",main=sprintf("%d shots",n)) dev.off() n = 30 basket_range = (qbinom(0.001,size=n,prob=b_in_prob[1]):qbinom(0.999,size=n,prob=b_in_prob[1])) basket_prob = dbinom(basket_range,size=n,prob=b_in_prob[1]) png(sprintf("basket_distribution_%d.png",n)) barplot(basket_prob,names.arg=basket_range,legend.text=TRUE,xlab="Number of baskets",ylab="Probability",main=sprintf("%d shots",n)) dev.off() horse_data=c(109,65,22,3,1) png("horse_kick.png") b=barplot(horse_data,names.arg=(0:4),legend.text=TRUE,xlab="Number of Deaths from Horse Kick",ylab="Occurrences") mean_death = sum(horse_data*(0:4))/sum(horse_data) death_points = (0:4) #points(b,sum(horse_data)*dpois(death_points,lambda=mean_death),col=2,pch=19) #points(b,sum(horse_data)*dpois(death_points,lambda=mean_death),type="l",col=2) dev.off() # png("uniform.png") barplot(rep(0.2,5),names.arg=(0:4),legend.text=TRUE,ylab="Probability",ylim=c(0,1)) dev.off() ## Ways of estimating # distribution of the mean n = 38 p = 21/38 basket_range = (qbinom(0.001,size=n,prob=p):qbinom(0.999,size=n,prob=p)) basket_prob = dbinom(basket_range,size=n,prob=p) png(sprintf("estimate_distribution_%0.2f_%d.png",p,n)) barplot(basket_prob,names.arg=sprintf("%0.2f",basket_range/n),legend.text=TRUE,xlab="Estimated Field Goal Rate",ylab="Probability",main=sprintf("%0.2f Field Goal rate",p)) dev.off() ## Determining basket shooting percentage n = 38 p = 0.5 for (p in c(0.25,0.5,0.75)) { basket_range = (qbinom(0.001,size=n,prob=p):qbinom(0.999,size=n,prob=p)) basket_prob = dbinom(basket_range,size=n,prob=p) png(sprintf("field_goal_distribution_%0.2f_%d.png",p,n)) barplot(basket_prob,names.arg=basket_range,legend.text=TRUE,xlab="Number of baskets",ylab="Probability",main=sprintf("%0.2f field goal rate",p)) dev.off() print(sprintf("%0.2f: %s of %d",p,paste(rbinom(5,size=n,prob=p),sep=", "),n)) } # 38 shots, make 21 basket_range = (qbinom(0.001,size=n,prob=p):qbinom(0.999,size=n,prob=p)) basket_prob = dbinom(basket_range,size=n,prob=p) png(sprintf("field_goal_distribution_%0.2f_%d.png",p,n)) barplot(basket_prob,names.arg=basket_range,legend.text=TRUE,xlab="Number of baskets",ylab="Probability",main=sprintf("%0.2f field goal rate",p)) dev.off() # confidence interval and p-value for (p in seq(0,1,by=0.01)) { print(sprintf("%0.2f: P(>=21 baskets) = %0.4f",p,1-pbinom(20,size=n,prob=p))) } for (p in c(0.30,0.34,0.41,0.44,0.47)) { basket_range = (0:38) basket_prob = dbinom(basket_range,size=n,prob=p) basket_color = rep(1,39) basket_color[basket_range >= 21] = 2 png(sprintf("field_goal_pval_%0.2f_%d.png",p,n)) barplot(basket_prob, names.arg=basket_range, legend.text = TRUE, xlab="Number of baskets",ylab="Probability",main=sprintf("%0.2f field goal rate",p),col=basket_color) dev.off() } # CLT p = 21/38 for (n in c(10,100,1000)) { basket_range = (qbinom(0.001,size=n,prob=b_in_prob[1]):qbinom(0.999,size=n,prob=b_in_prob[1])) basket_prob = dbinom(basket_range,size=n,prob=b_in_prob[1]) png(sprintf("basket_distribution_clt_%d.png",n)) barplot(basket_prob,names.arg=basket_range,legend.text=TRUE,xlab="Number of baskets",ylab="Probability",main=sprintf("%d shots",n)) dev.off() } # Regression n = 100000 x = rpois(n,lambda=17) y = 32.7 + x*12.3 + rnorm(n, mean=0, sd=50) my_lm = lm(y~x) sorted_x = unique(x[order(x)]) predicted = predict(my_lm,newdata=data.frame(x=sorted_x)) png("regression.png") plot(x,y) points(sorted_x,predicted,type="l",col=2,lwd=3) dev.off() png("residuals.png") residuals = y - predict(my_lm,newdata=data.frame(x=x)) plot(x,residuals) abline(a=0,b=0,col=2,lwd=3) dev.off() png("residuals2.png") plot(residuals,x) lines(x=c(0,0),y=range(x),col=2,lwd=3) dev.off() png("residuals_hist.png") hist(residuals) dev.off() # logit library("boot") x = (-10:10) png("inv_logit.png") plot(x,inv.logit(x),type="l") dev.off() # non-normal # mean 500, variance 30 mean = 500 variance = 500 k = 100000 png("normal.png") hist(rnorm(k,mean=mean,sd=sqrt(variance)),xlab="Value",ylab="Frequency",main="Normal") dev.off() # chi-squared #png("chisquared.png") #hist(rchisq(k,df=mean),xlab="Value",ylab="Frequency",main="Chi-Squared") #dev.off() # bimodal png("bimodal.png") diff = sqrt(variance)-2 bimodal = c(rnorm(k/2,mean=mean-diff,sd=sqrt(variance-diff^2)),rnorm(k/2,mean=mean+diff,sd=sqrt(variance-diff^2))) hist(bimodal, xlab="Value",ylab="Frequency",main="Bimodal") dev.off() png("uniform.png") range=sqrt(12*variance) hist(runif(k,mean-range/2,mean+range/2),xlab="Value",ylab="Frequency",main="Uniform") dev.off() # not enough black swans for (n in c(100,1000,10000)) { png(sprintf("hurricane_%d.png",n)) barplot(c(0,n),names.arg=c("Hurricane","No Hurricane"),legend.text=TRUE,ylab="Days") dev.off() } ####################################################### sf_monthly_rainfall_mean = c(4.4,3.3,3.1,1.4,0.3,0.1,0.0,0.1,0.3,1.3,2.9,3.1) this_month = as.numeric(format(Sys.time(),format="%m")) cur_mean = sf_monthly_rainfall_mean[this_month] png("heads_tails_fair.png") barplot(c(0.5-1/12000,0.5-1/12000,1/6000),names.arg=c("heads","tails","side"),legend.text=TRUE,ylim=c(0,1),ylab="Probability") dev.off() png("heads_tails_biased.png") barplot(c(0.8-1/12000,0.2-1/12000,1/6000),names.arg=c("heads","tails","side"),legend.text=TRUE,ylim=c(0,1),ylab="Probability") dev.off()