- Machine Learning Overview
- Exploring Data
- Nearest Neighbors
- Naive Bayes
- Measuring Performance
- Linear Regression
Ilan Man
Strategy Operations @ Squarespace
data(iris)
# inspect the structure of the dataset
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# summarize the data - five number summary
summary(iris[,1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.30 Min. :2.00 Min. :1.00 Min. :0.1
## 1st Qu.:5.10 1st Qu.:2.80 1st Qu.:1.60 1st Qu.:0.3
## Median :5.80 Median :3.00 Median :4.35 Median :1.3
## Mean :5.84 Mean :3.06 Mean :3.76 Mean :1.2
## 3rd Qu.:6.40 3rd Qu.:3.30 3rd Qu.:5.10 3rd Qu.:1.8
## Max. :7.90 Max. :4.40 Max. :6.90 Max. :2.5
max()
- min()
Q3
- Q1
quantile(iris$Sepal.Length, probs = c(0.10,0.50,0.99))
10% 50% 99%
4.8 5.8 7.7
library(scales) # format ggplot() axis
price <- seq(300000,600000,by=10000)
size <- price/1000 + rnorm(length(price),10,50)
houses <- data.frame(price,size)
ex <- ggplot(houses,aes(price,size))+geom_point()+scale_x_continuous(labels = comma)+
xlab("Price")+ylab("Size")+ggtitle("Square footage vs Price")
# 1) using loops
loop_dist <- 0
for(i in 1:nrow(houses)){
loop_dist[i] <- sqrt(sum((new_p-houses[i,])^2))
}
# 2) vectorized
vec_dist <- sqrt(rowSums(t(new_p-t(houses))^2))
closest <- data.frame(houses[which.min(vec_dist),])
print(closest)
price size
11 4e+05 468.6
new_house <- scale(houses)
new_new <- c((new[1]-mean(houses[,1]))/sd(houses[,1]),(new[2]-mean(houses[,2]))/sd(houses[,2]))
vec_dist <- sqrt(rowSums(t(new_new-t(new_house))^2))
which.min(vec_dist)
## [1] 8
data <- read.table('http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', sep=',', stringsAsFactors=FALSE, header=FALSE)
# first column has the ID which is not useful
data <- data[,-1]
# names taken from the .names file online
n<-c("radius","texture","perimeter","area","smoothness","compactness",
"concavity","concave_points","symmetry","fractal")
ind<-c("mean","std","worst")
headers<-as.character()
for(i in ind){
headers<-c(headers,paste(n,i))
}
names(data)<-c("diagnosis",headers)
str(data[,1:10])
'data.frame': 569 obs. of 10 variables:
$ diagnosis : chr "M" "M" "M" "M" ...
$ radius mean : num 18 20.6 19.7 11.4 20.3 ...
$ texture mean : num 10.4 17.8 21.2 20.4 14.3 ...
$ perimeter mean : num 122.8 132.9 130 77.6 135.1 ...
$ area mean : num 1001 1326 1203 386 1297 ...
$ smoothness mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
$ compactness mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
$ concavity mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
$ concave_points mean: num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
$ symmetry mean : num 0.242 0.181 0.207 0.26 0.181 ...
# inspect remaining data more closely
prop.table(table(data$diagnosis)); head(data)[2:6]
B M
0.6274 0.3726
radius mean texture mean perimeter mean area mean smoothness mean
1 17.99 10.38 122.80 1001.0 0.11840
2 20.57 17.77 132.90 1326.0 0.08474
3 19.69 21.25 130.00 1203.0 0.10960
4 11.42 20.38 77.58 386.1 0.14250
5 20.29 14.34 135.10 1297.0 0.10030
6 12.45 15.70 82.57 477.1 0.12780
# scale each numeric value
scaled_data<-as.data.frame(lapply(data[,-1], scale))
scaled_data<-cbind(diagnosis=data$diagnosis, scaled_data)
head(scaled_data[2:6])
radius.mean texture.mean perimeter.mean area.mean smoothness.mean
1 1.0961 -2.0715 1.2688 0.9835 1.5671
2 1.8282 -0.3533 1.6845 1.9070 -0.8262
3 1.5785 0.4558 1.5651 1.5575 0.9414
4 -0.7682 0.2535 -0.5922 -0.7638 3.2807
5 1.7488 -1.1508 1.7750 1.8246 0.2801
6 -0.4760 -0.8346 -0.3868 -0.5052 2.2355
library(class) # get k-NN classifier
predict_1 <- knn(train = scaled_data[,2:31], test = scaled_data[,2:31],
cl = scaled_data[,1],
k = floor(sqrt(nrow(scaled_data))))
table(predict_1)
predict_1
B M
378 191
pred_B <- which(predict_1=="B")
actual_B <- which(scaled_data[,1]=="B")
pred_M <- which(predict_1=="M")
actual_M <- which(scaled_data[,1]=="M")
true_positive <- sum(pred_B %in% actual_B)
true_negative <- sum(pred_M %in% actual_M)
false_positive <- sum(pred_B %in% actual_M)
false_negative <- sum(pred_M %in% actual_B)
conf_mat <- matrix(c(true_positive,false_positive,false_negative,true_negative),nrow=2,ncol=2)
acc <- sum(diag(conf_mat))/sum(conf_mat)
tpr <- conf_mat[1,1]/sum(conf_mat[1,])
tn <- conf_mat[2,2]/sum(conf_mat[2,])
acc tpr tn
0.9596 0.9972 0.8962
tp fp
fn 356 1
tn 22 190
# create randomized training and testing sets
total_n <- nrow(scaled_data)
# train on 2/3 of the data
train_ind <- sample(total_n,total_n*2/3)
train_labels <- scaled_data[train_ind,1]
test_labels <- scaled_data[-train_ind,1]
train_set <- scaled_data[train_ind,2:31]
test_set <- scaled_data[-train_ind,2:31]
library(class)
predict_1 <- knn(train = train_set, test = test_set, cl = train_labels,
k = floor(sqrt(nrow(train_set))))
table(predict_1)
predict_1
B M
137 53
pred_B <- which(predict_1=="B")
test_B <- which(test_labels=="B")
pred_M <- which(predict_1=="M")
test_M <- which(test_labels=="M")
true_positive <- sum(pred_B %in% test_B)
true_negative <- sum(pred_M %in% test_M)
false_positive <- sum(pred_B %in% test_M)
false_negative <- sum(pred_M %in% test_B)
conf_mat<-matrix(c(true_positive,false_positive,false_negative,true_negative),nrow=2,ncol=2)
acc <- sum(diag(conf_mat))/sum(conf_mat)
tpr <- conf_mat[1,1]/sum(conf_mat[1,])
tn <- conf_mat[2,2]/sum(conf_mat[2,])
acc tpr tn
0.9474 0.9922 0.8525
tp fp
fn 128 1
tn 9 52
library(caret)
confusionMatrix(predict_1,test_labels)$table
Reference
Prediction B M
B 128 9
M 1 52
# let's try different values for k
k_params <- c(1,3,5,10,15,20,25,30,40)
perf_acc <- NULL
per<-NULL
for(i in k_params){
predictions <- knn(train = train_set,
test = test_set,
cl = train_labels,
k = i)
conf <- confusionMatrix(predictions,test_labels)$table
perf_acc <- sum(diag(conf))/sum(conf)
per <- rbind(per,c(i, perf_acc,conf[[1]],conf[[3]],conf[[2]],conf[[4]]))
}
K | Acc | TP | FP | FN | TN | |
---|---|---|---|---|---|---|
1 | 1.00 | 0.95 | 126.00 | 6.00 | 3.00 | 55.00 |
2 | 3.00 | 0.96 | 127.00 | 6.00 | 2.00 | 55.00 |
3 | 5.00 | 0.95 | 126.00 | 6.00 | 3.00 | 55.00 |
4 | 10.00 | 0.96 | 127.00 | 6.00 | 2.00 | 55.00 |
5 | 15.00 | 0.94 | 127.00 | 9.00 | 2.00 | 52.00 |
6 | 20.00 | 0.95 | 128.00 | 9.00 | 1.00 | 52.00 |
7 | 25.00 | 0.94 | 128.00 | 10.00 | 1.00 | 51.00 |
8 | 30.00 | 0.94 | 128.00 | 10.00 | 1.00 | 51.00 |
9 | 40.00 | 0.95 | 129.00 | 10.00 | 0.00 | 51.00 |
probability
event
trial
- e.g. 1 flip of a coin, 1 toss of a die# data frame with frequency of emails with the word "cash"
bayes_ex <- data.frame(cash_yes=c(10,3,13),
cash_no=c(20,67,87),
total=c(30,70,100),
row.names=c('spam','ham','total'))
bayes_ex
cash_yes cash_no total
spam 10 20 30
ham 3 67 70
total 13 87 100
Exercise:
\(P(ham \mid cash = no)\) = ?
cash_yes cash_no furniture_yes furniture_no total
spam 10 20 6 24 30
ham 3 67 20 50 70
total 13 87 26 74 100
\(P(spam \mid cash=yes \cap furniture=no) = \frac{P(cash=yes \cap furniture=no \mid spam) \times P(spam)}{P(cash=yes \cap furniture=no)}\)
Exercise
: \(P(ham \mid cash=yes \cap furniture=no) = ?\)
cash_yes cash_no furniture_yes furniture_no party_yes party_no total
spam 10 20 6 24 3 27 30
ham 3 67 20 50 0 70 70
total 13 87 26 74 3 97 100
\(P(ham \mid cash=yes \cap party=yes) = \frac{P(cash=yes \mid ham) \times P(party=yes \mid ham) \times P(ham)}{P(cash=yes) \times P(party=yes)} = ?\)
cash_yes cash_no furniture_yes furniture_no party_yes party_no total
spam 10 20 6 24 3 27 30
ham 3 67 20 50 0 70 70
total 13 87 26 74 3 97 100
\(P(ham \mid cash=yes \cap party=yes) = \frac{P(cash=yes \mid ham) \times P(party=yes \mid ham) \times P(ham)}{P(cash=yes) \times P(party=yes)} = ?\) \(\frac{\frac{3}{70} \times \frac{0}{70} \times \frac{70}{100}}{\frac{13}{100} \times \frac{3}{100}} = 0\)
\(P(ham \mid cash=yes \cap party=yes) =\) \(\frac{P(cash=yes \mid ham) \times P(party=yes \mid ham) \times P(ham)}{P(cash=yes) \times P(party=yes)} =\) \(\frac{\frac{3}{70} \times \frac{0}{70} \times \frac{70}{100}}{\frac{13}{100} \times \frac{3}{100}} = 0\)
sms_data <- read.table('SMSSpamCollection.txt',stringsAsFactors=FALSE,sep='\t',quote="", col.names=c("type","text"))
str(sms_data)
'data.frame': 5574 obs. of 2 variables:
$ type: chr "ham" "ham" "spam" "ham" ...
$ text: chr "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..." "Ok lar... Joking wif u oni..." "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C"| __truncated__ "U dun say so early hor... U c already then say..." ...
sms_data$type <- factor(sms_data$type)
and
, the
, or
library(tm)
# create collection of text documents, a corpus
sms_corpus <- Corpus(VectorSource(sms_data$text))
sms_corpus
A corpus with 5574 text documents
# look at first few text messages
for (i in 1:5){
print(sms_corpus[[i]])
}
Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
Ok lar... Joking wif u oni...
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
U dun say so early hor... U c already then say...
Nah I don't think he goes to usf, he lives around here though
# clean the data using helpful functions
corpus_clean <- tm_map(sms_corpus,tolower)
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords())
corpus_clean <- tm_map(corpus_clean, removePunctuation)
corpus_clean <- tm_map(corpus_clean, stripWhitespace)
# now make each word in the corpus into it's own token
# each row is a message and each column is a word. Cells are frequency counts.
sms_dtm <- DocumentTermMatrix(corpus_clean)
# create training and testing set
total_n <- nrow(sms_data)
train_ind <- sample(total_n,total_n*2/3)
dtm_train_set <- sms_dtm[train_ind,]
dtm_test_set <- sms_dtm[-train_ind,]
corpus_train_set <- corpus_clean[train_ind]
corpus_test_set <- corpus_clean[-train_ind]
raw_train_set <- sms_data[train_ind,1:2]
raw_test_set <- sms_data[-train_ind,1:2]
# remove infrequent terms - not useful for classification
freq_terms <- c(findFreqTerms(dtm_train_set,7))
corpus_train_set <- DocumentTermMatrix(corpus_train_set, list(dictionary = freq_terms))
corpus_test_set <- DocumentTermMatrix(corpus_test_set, list(dictionary = freq_terms))
# convert frequency counts to "yes" or "no"
# implicitly weighing each term the same
convert <- function(x) {
x <- ifelse(x > 0, 1, 0)
x <- factor(x, levels = c(0,1), labels=c('No','Yes'))
return(x)
}
corpus_train_set <- apply(corpus_train_set, MARGIN = 2 , FUN = convert)
corpus_test_set <- apply(corpus_test_set, MARGIN = 2 , FUN = convert)
# use naiveBayes() function from e1071 package
library(e1071)
naive_model <- naiveBayes(x = corpus_train_set, y = raw_train_set$type)
predict_naive <- predict(naive_model, corpus_test_set)
naive_conf <- confusionMatrix(predict_naive,raw_test_set$type)$table
naive_conf
Reference
Prediction ham spam
ham 1620 37
spam 3 198
# use naiveBayes() function from e1071 package
library(e1071)
naive_model <- naiveBayes(x = corpus_train_set, y = raw_train_set$type)
predict_naive <- predict(naive_model, corpus_test_set)
naive_conf <- confusionMatrix(predict_naive,raw_test_set$type)$table
naive_conf
Reference
Prediction ham spam
ham 1620 37
spam 3 198
Exercise:
1) Calculate the true positive and false positive rate.
2) Calculate the error rate (hint: error rate = 1 - accuracy)
3) Set the Laplace = 1 and rerun the model and confustion matrix. Does this improve the model?
predict()
function predict()
to probability
, prob
, raw
, ...# estimate a probability for each class
confidence <- predict(naive_model, corpus_test_set, type='raw')
as.data.frame(format(head(confidence),digits=2,scientific=FALSE))
ham spam
1 0.9999975488976 0.0000024511024
2 0.9999996374299 0.0000003625701
3 0.9997485224034 0.0002514775966
4 0.0000000000465 0.9999999999535
5 0.0000000000043 0.9999999999957
6 0.9999998601364 0.0000001398636
# estimate a probability for each class
spam_conf <- confidence[,2]
comparison <- data.frame(predict=predict_naive,
actual=raw_test_set[,1],
prob_spam=spam_conf)
comparison[,3] <- format(comparison[,3],digits=2,scientific=FALSE)
head(comparison)
predict actual prob_spam
1 ham ham 0.000002451102442
2 ham ham 0.000000362570063
3 ham ham 0.000251477596598
4 spam spam 0.999999999953453
5 spam spam 0.999999999995699
6 ham ham 0.000000139863590
head(comparison[with(comparison,predict!=actual),])
predict actual prob_spam
81 ham spam 0.199053484775318
104 ham spam 0.075148234687662
144 ham spam 0.234883419684168
186 ham spam 0.475632786648641
232 ham spam 0.115073916786998
237 ham spam 0.002081284191925
head(comparison[with(comparison,predict==actual),])
predict actual prob_spam
1 ham ham 0.000002451102442
2 ham ham 0.000000362570063
3 ham ham 0.000251477596598
4 spam spam 0.999999999953453
5 spam spam 0.999999999995699
6 ham ham 0.000000139863590
mean(as.numeric(comparison[with(comparison,predict!='spam'),]$prob_spam))
[1] 0.005393
mean(as.numeric(comparison[with(comparison,predict=='spam'),]$prob_spam))
[1] 0.9796
predicted <- sample(c("A","B","C"),1000,TRUE)
actual <- sample(c("A","B","C"),1000,TRUE)
fabricated <- table(predicted,actual)
# for our Naive Bayes classifier
table(comparison$predict,comparison$actual)
ham spam
ham 1620 37
spam 3 198
Exercise: Calculate the kappa statistic for the naive classifier.
Exercise: Find the specificity, sensitivity, precision and recall for the Naive classifier.
Exercise: Calculate the F-score for the Naive classifier.
library(ROCR)
# create a prediction function
pred <- prediction(predictions = as.numeric(comparison$predict), labels= raw_test_set[,1])
# create a performance function
perf <- performance(pred,measure='tpr',x.measure='fpr')
# create the ROC curve
plot(perf, main = "ROC Curve for Naive classifier", col = 'blue', lwd = 3)
abline(a = 0, b = 1, lwd = 2, lty = 2)
text(0.75,0.4, labels = "<<< Classifier with\n no predictive power")
auc <- performance(pred, measure='auc')
str(auc)
Formal class 'performance' [package "ROCR"] with 6 slots
..@ x.name : chr "None"
..@ y.name : chr "Area under the ROC curve"
..@ alpha.name : chr "none"
..@ x.values : list()
..@ y.values :List of 1
.. ..$ : num 0.92
..@ alpha.values: list()
auc@y.values
[[1]]
[1] 0.9204
library(caret)
new_data <- createDataPartition(sms_data$type,p=0.1, list=FALSE)
table(sms_data[new_data,1])
ham spam
483 75
folds <- createFolds(sms_data$type, k = 10)
str(folds)
List of 10
$ Fold01: int [1:558] 4 13 36 40 51 74 77 79 83 91 ...
$ Fold02: int [1:556] 11 17 53 59 60 87 98 108 109 110 ...
$ Fold03: int [1:558] 16 20 28 30 33 52 54 56 63 64 ...
$ Fold04: int [1:558] 9 18 27 45 50 65 66 67 70 73 ...
$ Fold05: int [1:558] 38 39 46 61 76 81 101 111 130 161 ...
$ Fold06: int [1:557] 8 21 22 25 29 44 47 55 71 80 ...
$ Fold07: int [1:558] 10 14 15 19 31 41 48 49 62 86 ...
$ Fold08: int [1:557] 2 26 32 42 43 57 69 92 94 96 ...
$ Fold09: int [1:557] 1 3 5 6 12 23 24 34 35 37 ...
$ Fold10: int [1:557] 7 58 78 82 88 97 104 117 124 128 ...
lm()
function does this)
x <- cbind(1,x) #Add ones to x
theta<- c(0,0) # initalize theta vector
m <- nrow(x) # Number of the observations
grad_cost <- function(X,y,theta) return(sum(((X%*%theta)- y)^2))
gradDescent<-function(X,y,theta,iterations,alpha){
m <- length(y)
grad <- rep(0,length(theta))
cost.df <- data.frame(cost=0,theta=0)
for (i in 1:iterations){
h <- X%*%theta
grad <- (t(X)%*%(h - y))/m
theta <- theta - alpha * grad
cost.df <- rbind(cost.df,c(grad_cost(X,y,theta),theta))
}
return(list(theta,cost.df))
}
## initialize X, y and theta
X1<-matrix(ncol=1,nrow=nrow(df),cbind(1,df$X))
Y1<-matrix(ncol=1,nrow=nrow(df),df$Y)
init_theta<-as.matrix(c(0))
grad_cost(X1,Y1,init_theta)
[1] 5389
iterations = 10000
alpha = 0.1
results <- gradDescent(X1,Y1,init_theta,iterations,alpha)
grad_cost(X1,Y1,theta[[1]])
[1] 356.4
## Make some predictions
intercept <- df[df$X==0,]$Y
pred <- function (x) return(intercept+c(x)%*%theta)
new_points <- c(0.1,0.5,0.8,1.1)
new_preds <- data.frame(X=new_points,Y=sapply(new_points,pred))
ggplot(data=df,aes(x=X,y=Y))+geom_point(size=2)
ggplot(data=df,aes(x=X,y=Y))+geom_point()+geom_point(data=new_preds,aes(x=X,y=Y,color='red'),size=3)+scale_colour_discrete(guide = FALSE)
[1] 0.04867
df <- transform(df, X2=X^2, X3=X^3)
summary(lm(Y~X+X2+X3,df))$coef[,1]
(Intercept) X X2 X3
0.3202 6.2780 -25.7736 21.0211
summary(lm(Y~X + X2 + X3,df))$adj.r.squared
[1] 0.799
(Intercept) X X2 X3 X4 X5
2.613e-03 1.070e+01 -1.093e+02 1.634e+03 -1.546e+04 8.849e+04
X6 X7 X8 X9 X10 X11
-3.320e+05 8.393e+05 -1.436e+06 1.646e+06 -1.222e+06 5.415e+05
X12 X14
-1.142e+05 2.690e+03
[1] 0.09883
poly()
functionortho.coefs <- with(df,cor(poly(X,degree=3)))
sum(ortho.coefs[upper.tri(ortho.coefs)]) # polynomials are uncorrelated
[1] -1.415e-16
linear.fit <- lm(Y~poly(X,degree=15),df)
summary(linear.fit)$coef[,1]
(Intercept) poly(X, degree = 15)1 poly(X, degree = 15)2
0.128085 -2.242253 6.145309
poly(X, degree = 15)3 poly(X, degree = 15)4 poly(X, degree = 15)5
5.716072 -3.668778 -1.652848
poly(X, degree = 15)6 poly(X, degree = 15)7 poly(X, degree = 15)8
0.645181 0.206527 -0.059154
poly(X, degree = 15)9 poly(X, degree = 15)10 poly(X, degree = 15)11
0.105895 0.046219 -0.029200
poly(X, degree = 15)12 poly(X, degree = 15)13 poly(X, degree = 15)14
-0.056042 0.005958 -0.020484
poly(X, degree = 15)15
-0.054331
summary(linear.fit)$adj.r.squared # R^2 is 98% and no errors
[1] 0.9775
sqrt(mean((predict(linear.fit)-df$Y)^2)) # RMSE = 0.472
[1] 0.09874
x <- seq(0,1,by=0.005)
y <- sin(3*pi*x) + rnorm(length(x),0,0.1)
indices <- sort(sample(1:length(x), round(0.5 * length(x))))
training.x <- x[indices]
training.y <- y[indices]
test.x <- x[-indices]
test.y <- y[-indices]
training.df <- data.frame(X = training.x, Y = training.y)
test.df <- data.frame(X = test.x, Y = test.y)
rmse <- function(y,h) return(sqrt(mean((y-h)^2)))
performance <- data.frame()
for (d in 1:20){
fits <- lm(Y~poly(X,degree=d),data=training.df)
performance <- rbind(performance, data.frame(Degree = d,
Data = 'Training',
RMSE = rmse(training.y, predict(fits))))
performance <- rbind(performance, data.frame(Degree = d,
Data = 'Test',
RMSE = rmse(test.y, predict(fits,
newdata = test.df))))
}