# Machine Learning/brief statistics r

(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)

## R code from A Brief Tour of Statistics

```b_prob = c(0.05,0.5,0.35,0.1)
b_colors = c(2,2,1,1)
b_in_prob = c(0.55,0.45)
b_in_names = c("in","out")
b_in_colors = c(2,1)

barplot(b_prob,names.arg=b_names,legend.text=TRUE,ylim=c(0,1),col=b_colors,ylab="Probability")
dev.off()

barplot(b_in_prob,names.arg=b_in_names,legend.text=TRUE,ylim=c(0,1),col=b_in_colors,ylab="Probability")
dev.off()

apple_lambda = 1181.25/(8*30)  # based on http://answers.yahoo.com/question/index?qid=20090205070515AAWn8PL 7.5 bushels * 45 lbs./bushel * 3.5 apples/lb.
# growing season around 8 months
apple_range = (qpois(0.001,lambda=apple_lambda):qpois(0.999,lambda=apple_lambda))
apple_prob = dpois(apple_range,lambda=apple_lambda)

png("apple_distribution.png")
barplot(apple_prob,names.arg=apple_range,legend.text=TRUE,xlab="Apples per tree per day",ylab="Probability")
dev.off()

# Bus waiting times are Poisson
bus_lambda = 10
bus_range = (0:qpois(0.999,lambda=bus_lambda))
bus_prob = dpois(bus_range,lambda=bus_lambda)
png("bus_distribution.png",width=10*300,height=2*300)
barplot(bus_prob,names.arg=bus_range,legend.text=TRUE,xlab="Wait time",ylab="Probability")
dev.off()

# Rainfall is lognormal
avg_rainfall = 4.4
rain_amt = seq(0,50,by=0.01)
rain_prob = dlnorm(rain_amt,meanlog=log(avg_rainfall))

png("rain_distribution.png")
plot(rain_amt,rain_prob,type="l",xlab="Rain (inches)",ylab="Probability density")
dev.off()

png("rain_distribution_1020.png")
plot(rain_amt,rain_prob,type="l",xlab="Rain (inches)",ylab="Probability density")
for(rain in (seq(10,20,by=0.01))) {
points(rep(rain,2),c(0,dlnorm(rain,meanlog=log(avg_rainfall))),type="l",col="grey")
}
points(rain_amt,rain_prob,type="l")
dev.off()

##  Balls in bins

ball_color=c("yellow","green","blue","red","white")
ball_label=c("yellow","green","blue","red","child")
ball_probability=c(0.32,0.24,0.18,0.16,0.1)
png("ball_distribution.png")
barplot(ball_probability,names.arg=ball_label,legend.text=TRUE,xlab="Ball color",ylab="Probability",col=ball_color)
dev.off()

png("ball_distribution2.png")
barplot(c(0.32,0.68),names.arg=c("yellow","non-yellow"),legend.text=TRUE,xlab="Ball color",ylab="Probability",col=c("yellow","white"))
dev.off()

## Distribution of number of baskets
n = 15
dev.off()

n = 30
dev.off()

horse_data=c(109,65,22,3,1)
png("horse_kick.png")
b=barplot(horse_data,names.arg=(0:4),legend.text=TRUE,xlab="Number of Deaths from Horse Kick",ylab="Occurrences")
mean_death = sum(horse_data*(0:4))/sum(horse_data)
death_points = (0:4)
#points(b,sum(horse_data)*dpois(death_points,lambda=mean_death),col=2,pch=19)
#points(b,sum(horse_data)*dpois(death_points,lambda=mean_death),type="l",col=2)
dev.off()

#
png("uniform.png")
barplot(rep(0.2,5),names.arg=(0:4),legend.text=TRUE,ylab="Probability",ylim=c(0,1))
dev.off()

## Ways of estimating

# distribution of the mean

n = 38
p = 21/38

png(sprintf("estimate_distribution_%0.2f_%d.png",p,n))
dev.off()

n = 38
p = 0.5

for (p in c(0.25,0.5,0.75)) {

png(sprintf("field_goal_distribution_%0.2f_%d.png",p,n))
dev.off()

print(sprintf("%0.2f: %s of %d",p,paste(rbinom(5,size=n,prob=p),sep=", "),n))
}

# 38 shots, make 21
png(sprintf("field_goal_distribution_%0.2f_%d.png",p,n))
dev.off()

# confidence interval and p-value

for (p in seq(0,1,by=0.01)) {
}

for (p in c(0.30,0.34,0.41,0.44,0.47)) {
png(sprintf("field_goal_pval_%0.2f_%d.png",p,n))
dev.off()
}

# CLT

p = 21/38
for (n in c(10,100,1000)) {
dev.off()
}

# Regression

n = 100000

x = rpois(n,lambda=17)
y = 32.7 + x*12.3 + rnorm(n, mean=0, sd=50)

my_lm = lm(y~x)
sorted_x = unique(x[order(x)])
predicted = predict(my_lm,newdata=data.frame(x=sorted_x))

png("regression.png")
plot(x,y)
points(sorted_x,predicted,type="l",col=2,lwd=3)
dev.off()

png("residuals.png")
residuals = y - predict(my_lm,newdata=data.frame(x=x))
plot(x,residuals)
abline(a=0,b=0,col=2,lwd=3)
dev.off()

png("residuals2.png")
plot(residuals,x)
lines(x=c(0,0),y=range(x),col=2,lwd=3)
dev.off()

png("residuals_hist.png")
hist(residuals)
dev.off()

# logit

library("boot")
x = (-10:10)
png("inv_logit.png")
plot(x,inv.logit(x),type="l")
dev.off()

# non-normal

# mean 500, variance 30
mean = 500
variance = 500
k = 100000

png("normal.png")
hist(rnorm(k,mean=mean,sd=sqrt(variance)),xlab="Value",ylab="Frequency",main="Normal")
dev.off()

# chi-squared
#png("chisquared.png")
#hist(rchisq(k,df=mean),xlab="Value",ylab="Frequency",main="Chi-Squared")
#dev.off()

# bimodal
png("bimodal.png")
diff = sqrt(variance)-2
bimodal = c(rnorm(k/2,mean=mean-diff,sd=sqrt(variance-diff^2)),rnorm(k/2,mean=mean+diff,sd=sqrt(variance-diff^2)))
hist(bimodal, xlab="Value",ylab="Frequency",main="Bimodal")
dev.off()

png("uniform.png")
range=sqrt(12*variance)
hist(runif(k,mean-range/2,mean+range/2),xlab="Value",ylab="Frequency",main="Uniform")
dev.off()

# not enough black swans

for (n in c(100,1000,10000)) {
png(sprintf("hurricane_%d.png",n))
barplot(c(0,n),names.arg=c("Hurricane","No Hurricane"),legend.text=TRUE,ylab="Days")
dev.off()
}

#######################################################

sf_monthly_rainfall_mean = c(4.4,3.3,3.1,1.4,0.3,0.1,0.0,0.1,0.3,1.3,2.9,3.1)
this_month = as.numeric(format(Sys.time(),format="%m"))
cur_mean = sf_monthly_rainfall_mean[this_month]