#Load libraries
library(knitr)
library(markdown)
library(rmarkdown)
library(leaps)
library(glmnet)
#set working directory
setwd("~/Fall 2016/IS 777/Project")
#Fetching and reading data file
LowerBack=read.csv("lowerback_data.csv")
Check for missing values:
sum(is.na(LowerBack))
## [1] 0
BEST SUBSET REGRESSION
Best subset regression looks through all possible regression models of every subset size and finds the best model of each size, producing a sequence of models: the best subset for each particular size. By default, regsubsets() only goes up to subsets of size 8, so we raise nvmax to cover all 12 variables:
LowerBack.full <- regsubsets(Class ~ ., data = LowerBack, nvmax = 12)
reg.summary <- summary(LowerBack.full)
reg.summary
## Subset selection object
## Call: regsubsets.formula(Class ~ ., data = LowerBack, nvmax = 12)
## 12 Variables (and intercept)
## Forced in Forced out
## pelvic_incidence FALSE FALSE
## pelvic_tilt FALSE FALSE
## lumbar_lordosis_angle FALSE FALSE
## pelvic_radius FALSE FALSE
## degree_spondylolisthesis FALSE FALSE
## pelvic_slope FALSE FALSE
## Direct_tilt FALSE FALSE
## thoracic_slope FALSE FALSE
## cervical_tilt FALSE FALSE
## sacrum_angle FALSE FALSE
## scoliosis_slope FALSE FALSE
## sacral_slope FALSE FALSE
## 1 subsets of each size up to 11
## Selection Algorithm: exhaustive
## pelvic_incidence pelvic_tilt lumbar_lordosis_angle sacral_slope
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " "*" " " " "
## 4 ( 1 ) "*" "*" " " " "
## 5 ( 1 ) "*" "*" "*" " "
## 6 ( 1 ) "*" "*" "*" " "
## 7 ( 1 ) "*" "*" "*" " "
## 8 ( 1 ) " " "*" "*" "*"
## 9 ( 1 ) "*" "*" "*" " "
## 10 ( 1 ) "*" "*" "*" " "
## 11 ( 1 ) "*" "*" "*" " "
## pelvic_radius degree_spondylolisthesis pelvic_slope Direct_tilt
## 1 ( 1 ) " " "*" " " " "
## 2 ( 1 ) "*" "*" " " " "
## 3 ( 1 ) "*" "*" " " " "
## 4 ( 1 ) "*" "*" " " " "
## 5 ( 1 ) "*" "*" " " " "
## 6 ( 1 ) "*" "*" " " " "
## 7 ( 1 ) "*" "*" " " " "
## 8 ( 1 ) "*" "*" "*" " "
## 9 ( 1 ) "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" "*" "*"
## thoracic_slope cervical_tilt sacrum_angle scoliosis_slope
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " " "
## 4 ( 1 ) " " " " " " " "
## 5 ( 1 ) " " " " " " " "
## 6 ( 1 ) " " "*" " " " "
## 7 ( 1 ) " " "*" " " "*"
## 8 ( 1 ) " " "*" " " "*"
## 9 ( 1 ) " " "*" " " "*"
## 10 ( 1 ) " " "*" "*" "*"
## 11 ( 1 ) "*" "*" "*" "*"
names(reg.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
plot(reg.summary$cp,xlab = "Number of Variables", ylab = "Cp")
which.min(reg.summary$cp)
## [1] 5
points(5,reg.summary$cp[5],pch=13,col="red")
Plot method for regsubsets() object
plot(LowerBack.full,scale = "Cp")
LowerBack.full$rss
## [1] 67.74194 67.41646 67.25583 60.30911 52.45674 52.06499 45.78566
## [8] 45.77848 45.75386 44.83162 44.80798 44.26195 44.26195
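Since reg.summary also contains bic and adjr2 (see names(reg.summary) above), it can be useful to look at the three criteria side by side. A minimal sketch (the layout choices here are ours, not part of the original analysis):
par(mfrow = c(1, 3))
# Cp and BIC: lower is better
plot(reg.summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "b")
points(which.min(reg.summary$cp), min(reg.summary$cp), col = "red", pch = 19)
plot(reg.summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "b")
points(which.min(reg.summary$bic), min(reg.summary$bic), col = "red", pch = 19)
# Adjusted R-squared: higher is better
plot(reg.summary$adjr2, xlab = "Number of Variables", ylab = "Adjusted R-squared", type = "b")
points(which.max(reg.summary$adjr2), max(reg.summary$adjr2), col = "red", pch = 19)
par(mfrow = c(1, 1))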
FORWARD STEPWISE SELECTION uses the same regsubsets() function that did best subset, but we tell it method = "forward" and again specify the maximum number of variables:
LowerBack.fwd <- regsubsets(Class ~ ., data = LowerBack, nvmax = 12, method = "forward")
There is a summary() for this just as there was for best subset. Looking through the models that were selected, we find that they are exactly nested: each new model includes all the variables that came before, plus one new one.
summary(LowerBack.fwd)
## Subset selection object
## Call: regsubsets.formula(Class ~ ., data = LowerBack, nvmax = 12,
## method = "forward")
## 12 Variables (and intercept)
## Forced in Forced out
## pelvic_incidence FALSE FALSE
## pelvic_tilt FALSE FALSE
## lumbar_lordosis_angle FALSE FALSE
## pelvic_radius FALSE FALSE
## degree_spondylolisthesis FALSE FALSE
## pelvic_slope FALSE FALSE
## Direct_tilt FALSE FALSE
## thoracic_slope FALSE FALSE
## cervical_tilt FALSE FALSE
## sacrum_angle FALSE FALSE
## scoliosis_slope FALSE FALSE
## sacral_slope FALSE FALSE
## 1 subsets of each size up to 11
## Selection Algorithm: forward
## pelvic_incidence pelvic_tilt lumbar_lordosis_angle sacral_slope
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " "*" " " " "
## 4 ( 1 ) "*" "*" " " " "
## 5 ( 1 ) "*" "*" "*" " "
## 6 ( 1 ) "*" "*" "*" " "
## 7 ( 1 ) "*" "*" "*" " "
## 8 ( 1 ) "*" "*" "*" " "
## 9 ( 1 ) "*" "*" "*" " "
## 10 ( 1 ) "*" "*" "*" " "
## 11 ( 1 ) "*" "*" "*" " "
## pelvic_radius degree_spondylolisthesis pelvic_slope Direct_tilt
## 1 ( 1 ) " " "*" " " " "
## 2 ( 1 ) "*" "*" " " " "
## 3 ( 1 ) "*" "*" " " " "
## 4 ( 1 ) "*" "*" " " " "
## 5 ( 1 ) "*" "*" " " " "
## 6 ( 1 ) "*" "*" " " " "
## 7 ( 1 ) "*" "*" " " " "
## 8 ( 1 ) "*" "*" "*" " "
## 9 ( 1 ) "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" "*" "*"
## thoracic_slope cervical_tilt sacrum_angle scoliosis_slope
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " " "
## 4 ( 1 ) " " " " " " " "
## 5 ( 1 ) " " " " " " " "
## 6 ( 1 ) " " "*" " " " "
## 7 ( 1 ) " " "*" " " "*"
## 8 ( 1 ) " " "*" " " "*"
## 9 ( 1 ) " " "*" " " "*"
## 10 ( 1 ) " " "*" "*" "*"
## 11 ( 1 ) "*" "*" "*" "*"
We make the schematic plot for this model as well; it looks similar to the one for best subset.
plot(LowerBack.fwd,scale = "Cp")
LowerBack.bwd <- regsubsets(Class ~ ., data = LowerBack, nvmax = 12, method = "backward")
summary(LowerBack.bwd)
## Subset selection object
## Call: regsubsets.formula(Class ~ ., data = LowerBack, nvmax = 12,
## method = "backward")
## 12 Variables (and intercept)
## Forced in Forced out
## pelvic_incidence FALSE FALSE
## pelvic_tilt FALSE FALSE
## lumbar_lordosis_angle FALSE FALSE
## pelvic_radius FALSE FALSE
## degree_spondylolisthesis FALSE FALSE
## pelvic_slope FALSE FALSE
## Direct_tilt FALSE FALSE
## thoracic_slope FALSE FALSE
## cervical_tilt FALSE FALSE
## sacrum_angle FALSE FALSE
## scoliosis_slope FALSE FALSE
## sacral_slope FALSE FALSE
## 1 subsets of each size up to 11
## Selection Algorithm: backward
## pelvic_incidence pelvic_tilt lumbar_lordosis_angle sacral_slope
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " "*" " " " "
## 4 ( 1 ) "*" "*" " " " "
## 5 ( 1 ) "*" "*" "*" " "
## 6 ( 1 ) "*" "*" "*" " "
## 7 ( 1 ) "*" "*" "*" " "
## 8 ( 1 ) "*" "*" "*" " "
## 9 ( 1 ) "*" "*" "*" " "
## 10 ( 1 ) "*" "*" "*" " "
## 11 ( 1 ) "*" "*" "*" " "
## pelvic_radius degree_spondylolisthesis pelvic_slope Direct_tilt
## 1 ( 1 ) " " "*" " " " "
## 2 ( 1 ) "*" "*" " " " "
## 3 ( 1 ) "*" "*" " " " "
## 4 ( 1 ) "*" "*" " " " "
## 5 ( 1 ) "*" "*" " " " "
## 6 ( 1 ) "*" "*" " " " "
## 7 ( 1 ) "*" "*" " " " "
## 8 ( 1 ) "*" "*" "*" " "
## 9 ( 1 ) "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" "*" "*"
## thoracic_slope cervical_tilt sacrum_angle scoliosis_slope
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " " "
## 4 ( 1 ) " " " " " " " "
## 5 ( 1 ) " " " " " " " "
## 6 ( 1 ) " " "*" " " " "
## 7 ( 1 ) " " "*" " " "*"
## 8 ( 1 ) " " "*" " " "*"
## 9 ( 1 ) " " "*" " " "*"
## 10 ( 1 ) " " "*" "*" "*"
## 11 ( 1 ) "*" "*" "*" "*"
plot(LowerBack.bwd,scale = "Cp")
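A quick way to check where the three search strategies agree is to compare the coefficients each selects at a common size; the size 5 below matches the Cp minimum found earlier, though any size would do:
coef(LowerBack.full, 5) # best subset
coef(LowerBack.fwd, 5)  # forward stepwise
coef(LowerBack.bwd, 5)  # backward stepwise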
The leaps package does not supply a predict method for regsubsets objects, so we create one first:
predict.regsubsets = function(object, newdata, id, ...) {
  form = as.formula(object$call[[2]])  # recover the formula from the original call
  mat = model.matrix(form, newdata)    # design matrix for the new data
  coefi = coef(object, id = id)        # coefficients of the best model of size id
  xvars = names(coefi)                 # the variables that model actually uses
  mat[, xvars] %*% coefi               # predictions
}
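As a quick sanity check of the new method (the rows and model size below are arbitrary choices for illustration), it dispatches like any other predict():
predict(LowerBack.full, LowerBack[1:5, ], id = 5) # predictions from the best 5-variable model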
We set a random-number seed and assign each observation a fold number by sampling from the numbers 1 to 10 with replacement; that is what the sample() command does here, so the 10 folds are approximately (not exactly) equal in size.
k=10
set.seed(1)
folds=sample(1:k,nrow(LowerBack),replace=TRUE)
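Because the folds are sampled with replacement, their sizes are only approximately equal; table() shows the actual counts:
table(folds) # roughly, but not exactly, nrow(LowerBack)/10 observations per fold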
Now we make a matrix for our errors: 10 rows, one for each of the 10 folds, and 12 columns, one for each subset size.
cv.errors=matrix(NA,k,12, dimnames=list(NULL, paste(1:12)))
Now we run a double loop. For each fold j from 1 to 10, we fit a regsubsets model on the other nine folds; then, for each subset size i from 1 to 11 (regsubsets returned models only up to size 11 here, which is why column 12 stays NA), we predict on the held-out fold with our predict method, compute the mean squared error of the predictions, and store it in row j of cv.errors.
for (j in 1:k) {
  bestfit.fwd = regsubsets(Class ~ ., data = LowerBack[folds != j, ], nvmax = 12, method = "forward")
  for (i in 1:11) {
    pred = predict(bestfit.fwd, LowerBack[folds == j, ], id = i)
    cv.errors[j, i] = mean((LowerBack$Class[folds == j] - pred)^2)
  }
}
We now process our output matrix. We use apply() over the columns to take the column means, giving the cross-validated MSE for each subset size, and then plot it.
mean.cv.errors=apply(cv.errors,2,mean)
mean.cv.errors
## 1 2 3 4 5 6 7
## 0.2215691 0.2066751 0.2050530 0.2011023 0.2125963 0.2088319 0.1940067
## 8 9 10 11 12
## 0.1946315 0.1961964 0.1964634 0.2154306 NA
par(mfrow=c(1,1))
plot(mean.cv.errors,type='b')
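To confirm which subset size attains the minimum (size 7 in the output above), we can ask directly, and then refit on the full data to extract the coefficients of that model:
which.min(mean.cv.errors) # 7 in this run, motivating coef(reg.best, 7) below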
reg.best=regsubsets(Class~.,data=LowerBack, nvmax=12,method = "forward")
coef(reg.best,7)
## (Intercept) pelvic_incidence pelvic_tilt
## 0.3719882247 -0.0010669857 0.0082686392
## lumbar_lordosis_angle degree_spondylolisthesis pelvic_slope
## 0.0018183233 0.0044613160 0.0370828807
## sacrum_angle sacral_slope
## 0.0003102214 0.0000000000
Similarly, we repeat the cross-validation for backward selection.
k=10
set.seed(1)
folds=sample(1:k,nrow(LowerBack),replace=TRUE)
cv.errors=matrix(NA,k,12, dimnames=list(NULL, paste(1:12)))
for (j in 1:k) {
  bestfit.bwd = regsubsets(Class ~ ., data = LowerBack[folds != j, ], nvmax = 12, method = "backward")
  for (i in 1:11) {
    pred = predict(bestfit.bwd, LowerBack[folds == j, ], id = i)
    cv.errors[j, i] = mean((LowerBack$Class[folds == j] - pred)^2)
  }
}
mean.cv.errors=apply(cv.errors,2,mean)
mean.cv.errors
par(mfrow=c(1,1))
plot(mean.cv.errors,type='b')
reg.best=regsubsets(Class~.,data=LowerBack, nvmax=12,method = "backward")
coef(reg.best,3)
## (Intercept) pelvic_tilt degree_spondylolisthesis
## 0.392123757 0.008343006 0.004622579
## pelvic_slope
## 0.036739823
The glmnet() function has an alpha argument that determines which type of model is fit: if alpha = 0, a ridge regression model is fit, and if alpha = 1, a lasso model is fit. The glmnet package does not use the model formula language, so we set up an x matrix and a y vector ourselves.
First we fit a ridge-regression model by calling glmnet() with alpha = 0. There is also a cv.glmnet() function that will do the cross-validation.
x=model.matrix(Class~.,LowerBack)[,-1]
y=LowerBack$Class
ridge.mod=glmnet(x,y,alpha=0)
plot(ridge.mod, xvar="lambda", label=TRUE)
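To inspect the path at a particular point, coefficients can be extracted at any value of lambda; the s = 10 below is an arbitrary example, not a value chosen by the analysis:
predict(ridge.mod, type = "coefficients", s = 10) # coefficients at lambda = 10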
glmnet() returns a whole path of possible coefficient vectors, one for each value of lambda. To select a model, we can use glmnet's built-in cv.glmnet() function, which does k-fold cross-validation with a default of k = 10.
cv.ridge=cv.glmnet(x,y,alpha=0)
plot(cv.ridge)
The cross-validation plot shows the cross-validated mean squared error as a function of log(lambda). When log(lambda) is around 4, lambda is big, the coefficients are restricted to be very small, and the MSE is large. As lambda becomes very small and the coefficients grow, the MSE becomes small and stays flat, indicating that the full model is probably the best. The number 12 along the top of the plot indicates that all 12 variables are present in the model.
Finally, we refit our ridge regression model on the full data set, using the value of lambda chosen by cross-validation, and examine the coefficient estimates.
bestlam=cv.ridge$lambda.min
bestlam
## [1] 0.1605883
out=glmnet(x,y,alpha=0)
predict(out,type="coefficients",s=bestlam)[1:13,]
## (Intercept) pelvic_incidence pelvic_tilt
## 1.3525521505 0.0007221610 0.0061739656
## lumbar_lordosis_angle sacral_slope pelvic_radius
## 0.0020587635 -0.0022203073 -0.0082607491
## degree_spondylolisthesis pelvic_slope Direct_tilt
## 0.0032600863 0.0336336489 0.0002274377
## thoracic_slope cervical_tilt sacrum_angle
## -0.0010913735 0.0078157966 0.0004322943
## scoliosis_slope
## -0.0017149514
As expected, none of the coefficients are zero: ridge regression does not perform variable selection.
The lasso is similar to ridge regression; the only difference is the penalty. For the lasso, we minimize the residual sum of squares plus lambda times a subtly changed term: instead of the sum of the squares of the coefficients, we penalize the sum of their absolute values. This also controls the size of the coefficients, but the absolute-value penalty can force some coefficients to be exactly zero.
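In symbols, with n observations and p predictors, the lasso solves

$$\min_{\beta_0,\,\beta}\;\sum_{i=1}^{n}\left(y_i-\beta_0-\sum_{j=1}^{p}x_{ij}\beta_j\right)^{2}+\lambda\sum_{j=1}^{p}\lvert\beta_j\rvert,$$

whereas ridge regression uses $\lambda\sum_{j=1}^{p}\beta_j^{2}$ instead. In glmnet(), alpha = 1 (the lasso) is the default, so the call below needs no alpha argument.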
fit.lasso=glmnet(x,y)
plot(fit.lasso,xvar="lambda", label=TRUE)
The plot shows the whole fitted path: plotted against log(lambda), all the coefficients are 0 when lambda is large and they enter the model one by one as lambda decreases. Along the top of the plot we also see, as a function of lambda, how many non-zero variables are in the model. So the lasso is doing both shrinkage and variable selection.
Let's do cross-validation for the lasso as well.
cv.lasso=cv.glmnet(x,y)
plot(cv.lasso)
coef(cv.lasso)
## 13 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 0.971487638
## pelvic_incidence .
## pelvic_tilt 0.001482227
## lumbar_lordosis_angle .
## sacral_slope .
## pelvic_radius -0.003319868
## degree_spondylolisthesis 0.002715560
## pelvic_slope .
## Direct_tilt .
## thoracic_slope .
## cervical_tilt .
## sacrum_angle .
## scoliosis_slope .
It tells us that the minimum cross-validation error occurs at a model of size 3; within one standard error of the minimum we have a model of size about 2, and the curve is somewhat flat in between.
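By default, coef() on a cv.glmnet object uses lambda.1se, the largest lambda whose cross-validated error is within one standard error of the minimum. To compare it against the error-minimizing choice:
coef(cv.lasso, s = "lambda.min") # coefficients at the lambda minimizing CV error
coef(cv.lasso, s = "lambda.1se") # the default shown above; typically sparser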