# Notes of Practical Machine Learning (Coursera PML)

2016 - 06 - 28

It has been a long time since I started using R. Recently, I found some old notes, and I prefer to put it in digital archive, this blog post is to achieve the purpose.

# DT (data.table)

This data.table (DT) instruction is also available on my github, ispiared by
this ref

# //TODO

//TODO: summarize Solve common R problems efficiently with data.table which is must-read. backup
//TODO: summarize High-performance Solution in R
//TODO: check if to summarize Data Analysis in R using data.table
//TODO: The official "Getting Started" of DT
//TODO： check tablewrangling.Rmd

See more here

# DT Join (Similar as Database)

I created a repo with example, see DT Join
backup rpubs or github. // HDD: r_data_table_start.Rproj

# DT Key Ideas

1. data.table is an extension of data.frame, it is capable of df
2. DT's syntax is similar to SQL: DT[i, j, by] = sql query [where, select, group_by] i.e. [WHERE_condition_of_rows, SELECT_of_columns, GROUP_BY_a_categorical_variable]

# DT Basic Commands

## create an empty DT with column names

# this will create an empty DT with 5 columns, each is numeric type
as.data.table(setNames(replicate(5, numeric(0), simplify = FALSE), c('col.a', 'b', 'c', '4', 'V5')))

# format:
#replicate(n, expr, simplify = "array") # in the rep() falimy, will repeat expr n times


## prepare data for commands

library(data.table)
library(dplyr)

system.time((dt = fread("D:/hNow/Dropbox/Github/test_r_dt/GB_full.csv")))

head(dt)

##    V1       V2                     V3       V4  V5            V6 V7
## 1: GB     AB10               Aberdeen Scotland SCT Aberdeenshire
## 2: GB AB10 1AA George St/Harbour Ward Scotland SCT
## 3: GB AB10 1AB George St/Harbour Ward Scotland SCT
## 4: GB AB10 1AF George St/Harbour Ward Scotland SCT
## 5: GB AB10 1AG George St/Harbour Ward Scotland SCT
## 6: GB AB10 1AH George St/Harbour Ward Scotland SCT
##               V8        V9      V10       V11 V12
## 1:                               NA        NA   4
## 2: Aberdeen City S12000033 57.14823 -2.096648   6
## 3: Aberdeen City S12000033 57.14960 -2.096916   6
## 4: Aberdeen City S12000033 57.14870 -2.097806   6
## 5: Aberdeen City S12000033 57.14823 -2.096648   6
## 6: Aberdeen City S12000033 57.14808 -2.094664   6

## subset

# subset rows (delete other rows)
subset_rows = dt[V4 == "England" & V3 == "Beswick" & V10 < 53.93]

# subset cols
subset_cols = dt[ , .(V2,V4)] # by col names
subset_cols = dt[ , c('V2','V4'), with = F] # by col names
subset_cols = dt[ , c(2,4), with = F] # by col numbers


wrong :

subset_cols = dt[ , .(V2:V4)] # wrong !!!


Right: for continuous col names, dt[ , c(2:4), with = F] can be used.

## order/sort

dt_order = dt[order(V4, -V8)] # radix sort, so it is fast


## modify by column

add, update, delete, etc. tip: :=

(add / unconditional update a whole column):

# eg 1
dt[ , V18 := V10 + V11]

# eg 2
dt[ , V19 := 'this-is-NEW.col']

# eg 3
dt[ , V20 := paste(V18, V19, sep = "_")]
# expensive, system.time = 16~20s. see also 'OBS' below for func paste()

head(dt)

##    V1       V2                     V3       V4  V5            V6 V7
## 1: GB     AB10               Aberdeen Scotland SCT Aberdeenshire
## 2: GB AB10 1AA George St/Harbour Ward Scotland SCT
## 3: GB AB10 1AB George St/Harbour Ward Scotland SCT
## 4: GB AB10 1AF George St/Harbour Ward Scotland SCT
## 5: GB AB10 1AG George St/Harbour Ward Scotland SCT
## 6: GB AB10 1AH George St/Harbour Ward Scotland SCT
##               V8        V9      V10       V11 V12      V18             V19
## 1:                               NA        NA   4       NA this-is-NEW.col
## 2: Aberdeen City S12000033 57.14823 -2.096648   6 55.05158 this-is-NEW.col
## 3: Aberdeen City S12000033 57.14960 -2.096916   6 55.05269 this-is-NEW.col
## 4: Aberdeen City S12000033 57.14870 -2.097806   6 55.05090 this-is-NEW.col
## 5: Aberdeen City S12000033 57.14823 -2.096648   6 55.05158 this-is-NEW.col
## 6: Aberdeen City S12000033 57.14808 -2.094664   6 55.05341 this-is-NEW.col
##                                 V20
## 1:               NA_this-is-NEW.col
## 2:   55.05158022833_this-is-NEW.col
## 3: 55.0526863914458_this-is-NEW.col
## 4: 55.0508972975867_this-is-NEW.col
## 5:   55.05158022833_this-is-NEW.col
## 6: 55.0534126323586_this-is-NEW.col

### update:

with WHERE condition

dt[V8 == "Aberdeen City", V8 := "updated city name"]

##    V1       V2                     V3       V4  V5            V6 V7
## 1: GB     AB10               Aberdeen Scotland SCT Aberdeenshire
## 2: GB AB10 1AA George St/Harbour Ward Scotland SCT
## 3: GB AB10 1AB George St/Harbour Ward Scotland SCT
## 4: GB AB10 1AF George St/Harbour Ward Scotland SCT
## 5: GB AB10 1AG George St/Harbour Ward Scotland SCT
## 6: GB AB10 1AH George St/Harbour Ward Scotland SCT
##                   V8        V9      V10       V11 V12      V18
## 1:                                   NA        NA   4       NA
## 2: updated city name S12000033 57.14823 -2.096648   6 55.05158
## 3: updated city name S12000033 57.14960 -2.096916   6 55.05269
## 4: updated city name S12000033 57.14870 -2.097806   6 55.05090
## 5: updated city name S12000033 57.14823 -2.096648   6 55.05158
## 6: updated city name S12000033 57.14808 -2.094664   6 55.05341
##                V19                              V20
## 1: this-is-NEW.col               NA_this-is-NEW.col
## 2: this-is-NEW.col   55.05158022833_this-is-NEW.col
## 3: this-is-NEW.col 55.0526863914458_this-is-NEW.col
## 4: this-is-NEW.col 55.0508972975867_this-is-NEW.col
## 5: this-is-NEW.col   55.05158022833_this-is-NEW.col
## 6: this-is-NEW.col 55.0534126323586_this-is-NEW.col

### delete col:

# delete one col
dt[ , V18 := NULL]
# or (i.e. both string and col-name itself are ok)
dt[ , 'V18' := NULL]

# delete multi cols (an array of their names)
dt[ , c('V19', 'V20') := NULL]


wrong format of delete:

dt[ , .(V11, V12) := NULL]
dt[ , c(V11, V12) := NULL]


### combine all tasks (just need to put []s together):

dt[V8 == "Aberdeen City", V8 := "updated_city"][ , V_New := V10 + V11][ , c("V6","V7") := NULL]


# DT Aggregate

(compute functions while grouping)
Let's take a look at the data using summary() and unique() before aggregate.

summary(dt$V_New)  ## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's ## 43.55 50.45 51.30 51.10 52.04 59.99 27431 unique(dt$V4)

##  "Scotland"         "England"          "Northern Ireland"
##  "Wales"            ""

Let's do the real aggregation now.

# calculate a "func"
dt[V_New >= 50, mean(V10), by = V4]

##          V4       V1
## 1: Scotland 56.20603
## 2:  England 52.60817
## 3:    Wales 53.13679
# named calculation in "func", instead of auto-named as 'V1', 'V2' ...
# OBS:  .()  is necessary, otherwise confusion with arg of dt
dt[V_New >= 50, .(averaged_result = mean(V10)), by = V4]

##          V4 averaged_result
## 1: Scotland        56.20603
## 2:  England        52.60817
## 3:    Wales        53.13679
# multi-calculation
dt[V_New >= 50, .(sum(V10), mean(V10), .N), by = V4]

##          V4         V1       V2       N
## 1: Scotland  8720365.2 56.20603  155150
## 2:  England 63449185.9 52.60817 1206071
## 3:    Wales   365687.4 53.13679    6882
# multi-calculation with names in "func" and func in "by"
dt[V_New >= 50, .(sumTotal = sum(V10), avr = mean(V10), countNr = .N), by = substr(V4, 1,1)]

##    substr   sumTotal      avr countNr
## 1:      S  8720365.2 56.20603  155150
## 2:      E 63449185.9 52.60817 1206071
## 3:      W   365687.4 53.13679    6882
# group by multiple columns
..., by = list(V1, V4)
# or:
..., by = c('V1', 'V4')


# Performance DT vs DPLYR

# data.table
system.time((
dt[V_New >= 50, .(sumTotal = sum(V10), avr = mean(V10), countNr = .N), by = substr(V4, 1,1)]
))

##    user  system elapsed
##    0.14    0.00    0.14
# base::aggregate
system.time({
mainDataDf = subset(dt, V_New >= 50)
summary_t = as.data.frame(as.list(aggregate(
cbind(V10)
~ substr(V4, 1,1)
)))
})

##    user  system elapsed
##   13.08    0.11   13.41
# dplyr::summarise
system.time({
mainDataDf = subset(dt, V_New >= 50)
mainDataDf$t = substr(mainDataDf$V4, 1,1)
sumTotal = sum(V10),
avr = mean(V10),
countNr = n())
mainDataDf$t = NULL })  ## user system elapsed ## 0.98 0.21 1.19 # pip-ed dplyr::summarise system.time({ summary_t = dt %>% # prounced as "then" filter(V_New >= 50) %>% mutate(t = substr(V4, 1,1)) %>% group_by(t) %>% summarise(sumTotal = sum(V10), avr = mean(V10), countNr = n()) })  ## user system elapsed ## 0.28 0.08 0.36 summary_t  ## Source: local data table [3 x 4] ## ## t sumTotal avr countNr ## 1 E 63449185.9 52.60817 1206071 ## 2 S 8720365.2 56.20603 155150 ## 3 W 365687.4 53.13679 6882 Note: the results are the same, but rwos in different order. To make exactly the same, we can use base::order() and dplyr::arrange(). ## io performance • dt = fread("data.csv") 10x faster than read.csv("data.csv"). • fwrite() is faster than write.csv(). # DT Other Useful Features • automatic indexing • rolling joins • overlapping range joins • and a lot more ... # DT OBS Notice ## cat/paste in [] paste works fine. wrong usage of cat: system.time(dt[ , V20 := cat(V18, V19)]) # this will print all cat-results in console, and NOT adding/updating any column  # DPLYR # R Performance Profiling • system.time(cmd) • system.time({ multi-line cmd }) # OBS: need to explicate print the result • // to be cont. # PML Course Points and Scoring There are 100 available points for the course. They are broken down as follows Quiz 1 = 15 points Quiz 2 = 15 points Quiz 3 = 15 points Quiz 4 = 15 points Course Project Part 1: Writeup (peer-assessed) = 20 points Course Project Part 2: Submission (programming) = 20 points You must receive 70 points to pass the course and achieve the certificate and 90 points to earn distinction. # Week 1 ~ 2 & R Basics ## io redirection direct output to a file sink("myfile", append=FALSE, split=FALSE) return output to the terminal sink() ## remove row.names row.names(your.data.frame) = NULL ## grammer symbols = is the same as <-, assignment. ;cmd seperation, if no new line. will not influence results. diff in matlab: ; will suppress output ## variable names $表示隶属，例如：dataFrameName$columnName 表示提取名为dataFrameName的数据中的columnName列。 . (dot) is one of legal var name characters. x and x.df are two variables with no relationship. ## logic symbols logic equal == (double chars) logic or & (single char) ## data.frame subset (deprecated, plz use data.table) see also row/col manipulate in R/py subset(myDataframe, select = c(some columns)) arg: select argument lets you subset variables (columns). ref: R FAQ  比较下面三种用 列名 进行select 的过程： Subsetting rows using indices: ← 完全错误. 正确 ↓ ,之前有条件表达式， 表示select row； 选择标准是y == 1,之后没东西，表示所有列。 删除某列（__使用列名时，只能用subset ()函数，[ ]不支持列名__）： ## fit scattered-feature-matrix plot: featurePlot() ref featurePlot(x=training[,c("age","education","jobclass")], y = training$wage, plot="pairs")   lib carettrain()自动将nominal的变量分离成多个dummy variables （one-hot encoding）。
cmd for fit： OBS: the usage of +

modFit = train(wage ~ age + jobclass + education, method = "lm", data=training)
# 这里选了三个变量（age + jobclass + education）来预测wage.
# lm表示linear mod。
finMod = modFit$finalModel plot(finMod, 1, pch=19, cex=0.5 ,col="#00000010") # 有好几个图，下图只plot第一个。outliers被自动标出。 plot(finMod$residuals) # see below qplot(finMod$fitted,finMod$residuals) # w/o colors
qplot(finMod$fitted,finMod$residuals,colour=race,data=training) # with colors
# see below  If you want to use all covariates (i.e. use all attributes to predict wage) OBS: the usage of .

modFitAll = train(wage ~ .,data=training, method="lm")
pred = predict(modFitAll, testing)
qplot(wage,pred,data=testing)


## plot cex： _c_haracter _ex_pansion. zoom ratio of char & symbols in plot, default: 1 (no zoom in neither out)
pch： _p_lotting _ch_aracter： the index of character/symbol. # Week3

Trees, Assembled Methods, (Bagging, Random Forests, Boosting), Model Based Prediction.

## 019 predictingWithTrees, week 3.1

### pros:

• Easy to interpret
• Better performance in nonlinear settings

### cons:

• Without pruning/cross-validation can lead to overfitting
• Harder to estimate uncertainty
• Results may be variable
data(iris); library(ggplot2); library(caret);
inTrain  =  createDataPartition(y=iris$Species, p=0.7, list=FALSE) training = iris[inTrain, ] testing = iris[-inTrain, ] modFit = train(Species ~ .,method="rpart",data=training) # R part print(modFit$finalModel)

library(rattle)
fancyRpartPlot(modFit$finalModel) # see below predict(modFit, newdata=testing) ### notes and further resources: • Classification trees are non-linear models • They use interactions between variables • Data transformations may be less important (monotone transformations) • Trees can also be used for regression problems (continuous outcome) • There are multiple tree building options in R, both in the caret package ( party, rpart ) and out of the caret package ( tree ) ## 020 bagging, week 3.2 Bagging (= Bootstrap Aggregating) ### basic idea: 1. Resample cases and recalculate predictions 2. Average or majority vote ### notes (pros): • Similar bias • Reduced variance • More useful for non-linear functions Bagging example: loess(), which is a Local Polynomial Regression Fitting function. Manual: ll = matrix(NA,nrow=10,ncol=155) for(i in 1:10){ ss = sample(1:dim(ozone)[1 ],replace=T) ozone0 = ozone[ss, ]; ozone0 = ozone0[order(ozone0$ozone), ]
loess0  =  loess(temperature ~ ozone,data=ozone0,span=0.2)
ll[i, ]  =  predict(loess0,newdata=data.frame(ozone=1:155))
}
plot(ozone$ozone,ozone$temperature,pch=19,cex=0.5)
for(i in 1:10){lines(1:155,ll[i, ],col="grey",lwd=2)}
lines(1:155,apply(ll,2,mean),col="red",lwd=2) Figure 4 Bagging loess(), Reduces Variance

Bagging in caret:
In train() function consider method options: bagEarth, treebag, bagFDA.
More bagging in caret (need to build function by ourselves):
Doc

### notes and further resources:

Notes:

• Bagging is most useful for nonlinear models
• Often used with trees - an extension is random forests
• Several models use bagging in caret's train function

Further resources:

## 021 randomForests, week 3.3

### idea:

1. Bootstrap samples
2. At each split, bootstrap variables
3. Grow multiple trees and vote/average • Accuracy

### cons:

1. Speed
2. Interpretability
3. Overfitting

Thus, it is important to use cross validation.

### example:

data(iris); library(ggplot2)
inTrain = createDataPartition(y=iris$Species, p=0.7, list=FALSE) training = iris[inTrain, ] testing = iris[-inTrain, ] library(caret) modFit = train(Species~ .,data=training,method="rf",prox=TRUE) # prox will be used for visualizing class centers. modFit getTree(modFit$finalModel,k=2) # get the 2nd tree

# Class centers:
irisP = classCenter(training[,c(3,4) ], training$Species, modFit$finalModel$prox) irisP = as.data.frame(irisP); irisP$Species = rownames(irisP)
p = qplot(Petal.Width, Petal.Length, col=Species,data=training)
p + geom_point(aes(x=Petal.Width,y=Petal.Length,col=Species),size=5,shape=4,data=irisP)

# Predict:
pred = predict(modFit,testing);
table(pred, testing$Species); plot(pred, testing$Species)

# 彩图显示错误:
testing$predRight = pred==testing$Species
qplot(Petal.Width,Petal.Length,colour=predRight,data=testing,main="newdata Predictions")


### notes and further resources:

Notes:

• Random forests are usually one of the two top performing algorithms along with boosting in prediction contests.
• Random forests are difficult to interpret but often very accurate.
• Care should be taken to avoid overfitting (see rfcv funtion) Random Forest Cross-Valdidation for feature selection

Further resources:

## 022 boosting, week 3.4

### basic idea:

1. Take lots of (possibly) weak predictors
2. Weight them and add them up
3. Get a stronger predictor

Create a classifier that combines a set of classifiers. (Examples: All possible trees, all possible regression models …) ### boosting in r:

• Boosting can be used with any subset of classifiers
• One large subclass is gradient boosting

• R has multiple boosting libraries. Differences include the choice of basic classification functions and combination rules.
• lib gbm - boosting with trees.

• lib mboost - model based boosting

• lib gamBoost for boosting generalized additive models

• Most of these are available in the caret package

## 023 modelBasedPrediction, week 3.5

### basic idea:

1. Assume the data follow a probabilistic model
2. Use Bayes' theorem to identify optimal classifiers

Pros:

• Can take advantage of structure of the data (distribution)
• May be computationally convenient
• Are reasonably accurate on real problems

Cons:

• When the model is incorrect you may get reduced accuracy

Our goal is to build parametric model for conditional distribution P(Y=k|X=x).
Typically prior probabilities πk are set in advance.

A range of models use this approach:
Linear discriminant analysis assumes fk(x) is multivariate Gaussian with same covariances.
Quadratic discrimant analysis assumes fk(x) is multivariate Gaussian with different covariances.
Model based prediction assumes more complicated versions for the covariance matrix.
Naive Bayes assumes independence between features for model building.
ref

# Week 4

## 024 regularizedRegression, week 4.1

### basic idea:

1. Fit a regression model
2. Penalize (or shrink) large coefficients

Pros:

• Can help with the bias/variance tradeoff
• Can help with model selection

Cons:

• May be computationally demanding on large data sets
• Does *not perform as well as RF and boosting??? *

### decomposing expected prediction error (???)

Assume \begin{equation}(Y_i = f(X_i) + \epsilon_i)
(EPE(\lambda) = E\left[{Y - \hat{f}_{\lambda}(X)}^2\right])\end{equation}
Suppose \begin{equation} (\hat{f}_{\lambda}) \end{equation} is the estimate from the training data and look at a new data point \begin{equation} (X = x^*) \end{equation}
\begin{equation} [E\left[{Y - \hat{f}_{\lambda}(x^*)}^2\right] = \sigma^2 + {E[\hat{f}_{\lambda}(x^*)] - f(x^*)}^2 + var[\hat{f}_\lambda(x_0)]] = Irreducible error + Bias(^2) + Variance\end{equation}
ppt

### lasso (???)

Lasso:

1. shrinks all the coefficients and
2. set some of them to 0 (model selection)

• In caret methods are: ridge, lasso, relaxo

## 025 combiningPredictors, week 4.2

### key idea:

• You can combine classifiers by averaging/voting
• Combining classifiers improves accuracy
• Combining classifiers reduces interpretability
• Boosting, bagging, and random forests are variants on this theme

### approaches for combining classifiers:

1. Combining similar classifiers
• Bagging, boosting, random forests
2. Combining different classifiers
• Model stacking
• Model ensembling

### example

mod1  =  train(wage ~.,method="glm",data=training)
mod2  =  train(wage ~.,method="rf", data=training, trControl = trainControl(method="cv"),number=3)
pred1  =  predict(mod1,testing); pred2  =  predict(mod2,testing)
middleData  =  data.frame(pred1, pred2, wage=*testing*$wage) modComb = train(wage ~., method="gam", data=middleData) predComb = predict(modComb, middleData) # Testing errors: sqrt(sum((pred1-testing$wage)^2)); sqrt(sum((pred2-testing$wage)^2)); sqrt(sum((combPred-testing$wage)^2))


### notes and further resources:

• Even simple blending can be useful
• Typical model for binary/multiclass data
• Build an odd number of models
• Predict with each model
• Predict the class by majority vote
• This can get dramatically more complicated

## 027 forecasting, week 4.3

### example 1

#### Data
library(quantmod)
from.dat  =  as.Date("01/01/08", format="%m/%d/%y")
to.dat  =  as.Date("12/31/13", format="%m/%d/%y")

#### Summarize monthly and store as time series
mGoog  =  to.monthly(GOOG)
googOpen  =  Op(mGoog)
ts1  =  ts(googOpen,frequency=12)
plot(ts1,xlab="Years+1", ylab="GOOG")

#### Decompose a time series into parts
plot(decompose(ts1),xlab="Years+1")

#### Training and testing
ts1Train  =  window(ts1,start=1,end=5)
ts1Test  =  window(ts1,start=5,end=(7-0.01))
plot(ts1Train)
lines(ma(ts1Train,order=3),col="red")

#### Exponential
ets1  =  ets(ts1Train,model="MMM")
fcast  =  forecast(ets1)
plot(fcast); lines(ts1Test,col="red")
accuracy(fcast,ts1Test)


### example 2: quiz 4, Q5

ibrary(lubridate) # For year() function below
training = data[year(data$date) < 2012, ] testing = data[(year(data$date)) > 2011, ]
tstrain = ts(training$visitsTumblr) tstest=ts(testing$visitsTumblr)
mod = bats(tstrain)
pred = forecast(mod, h=235)
num95 = sum(tstest >= pred$lower[, 2] & tstest <= pred$upper[,2])
numTotal = length(tstest)
acc = num95 / numTotal
acc # 0.9617021


## 026 unsupervisedPrediction, week4.4

### iris example ignoring species labels:

ata(iris); library(ggplot2)
inTrain  =  createDataPartition(y=iris$Species, p=0.7, list=FALSE) training = iris[inTrain, ] testing = iris[-inTrain, ] trainingNoSp = subset(training, select=-c(Species)) kMeans1 = kmeans(trainingNoSp, centers=3) training$clusters  =  as.factor(kMeans1$cluster) qplot(Petal.Width, Petal.Length, colour=clusters, data=training) table(kMeans1$cluster,training$Species) modFit = train(clusters ~.,data=subset(training, select=-c(Species)), method="rpart") table(predict(modFit,training),training$Species)


### notes and further resources:

• The cl_predict function in the clue package provides similar functionality
• Beware over-interpretation of clusters!
• This is one basic approach to recommendation engines

# Errors

## error - atomic for sort

table(predictTrainingClean, trainingClean\$classe)
error in sort.list(y) : 'x' must be atomic for 'sort.list'. Have you called 'sort' on a list?
table(predictTrainingClean[], trainingClean[])

## warning - row nr doesn't match

warning: from predict(): 'newdata' had 246 rows but variables found have 251 rows.
solution:
Check the modFit column names, predictors' names should be the same as newdata. 