Lab 9 Mar 06 2019

Goals:

Lasso and ridge regression

Work through code from Faraway 11.3 and 11.4

library(tidyverse)

Meat fat content data from the model selection examples in class:

data(meatspec, package = "faraway")
# Faraway takes the first 172 obs as training
# you shouldn't generally do that without careful
# thought
ind <- 1:172
trainmeat <- meatspec[ind, ]
testmeat <- meatspec[-ind, ]

# center all columns of x for ridge and lasso
x <- scale(meatspec[, -101], center = T, scale = F)
y <- meatspec[, 101]

testx <- x[-ind, ]
testy <- y[-ind]
trainx <- x[ind, ]
trainy <- y[ind]
rmse <- function(x,y) sqrt(mean((x-y)^2))
fit_all <- lm(fat ~ ., data = trainmeat)
(rmse_all <- rmse(testmeat$fat, predict(fit_all, newdata = testmeat)))
## [1] 3.814

Recall that the model with all predictors had an estimated root mean square prediction error of about 3.81.

fit_best <- lm(fat ~ V1+V17+V19+V20+V21+V24+V31+V35+V37+V39+V40+V41+V44+V47+V48+
    V51+V53+V57,
  data = trainmeat)
(rmse_best <- rmse(testmeat$fat, predict(fit_best, newdata = testmeat)))
## [1] 1.837097

On Monday we also found some smaller models with lower prediction error; for example, the 18-predictor model above has a test RMSE of about 1.84.
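One way a smaller model like this can be found (a sketch using backward stepwise selection by AIC, in the spirit of Faraway 11.1; not necessarily how the 18-predictor model above was chosen) is with step():

# backward stepwise selection by AIC, starting from the full fit;
# with 100 predictors this can take a little while
fit_step <- step(fit_all, trace = 0)
length(coef(fit_step)) # number of terms retained (including the intercept)
rmse(testmeat$fat, predict(fit_step, newdata = testmeat))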

Ridge regression

Estimating the coefficients for a sequence of tuning parameters:

library(MASS)
rgmod <- lm.ridge(I(trainy - mean(trainy)) ~ trainx,
                  lambda = seq(0, 5e-8, 1e-9))

matplot(rgmod$lambda, coef(rgmod), type = "l", 
  xlab = expression(lambda), 
  ylab = expression(hat(beta)), col = 1)

Choosing one tuning parameter:

(min_gcv <- which.min(rgmod$GCV))
## 1.8e-08 
##      19
matplot(rgmod$lambda, coef(rgmod), type = "l", 
  xlab = expression(lambda), 
  ylab = expression(hat(beta)), col = 1)
abline(v = rgmod$lambda[min_gcv])

Evaluating the predictive accuracy:

ypred <- scale(trainx, 
               center = F, 
               scale = rgmod$scales) %*% # scale covariates
  rgmod$coef[, min_gcv] + # multiply by estimated coefficients
  mean(trainy) # add back mean (since response was centered)

rmse(ypred, trainy)
## [1] 0.8159824
ypred <- scale(testx, 
               center = F, 
               scale = rgmod$scales) %*% # scale covariates
  rgmod$coef[, min_gcv] + # multiply by estimated coefficients
  mean(trainy) # add back mean (since response was centered)
rmse(ypred, testy)
## [1] 4.067914

One particularly bad prediction (it looks OK to me; why does Faraway get something different?):

c(ypred[13], testmeat$fat[13])
## [1] 11.32387 34.80000
rmse(ypred[-13], testmeat$fat[-13])
## [1] 1.954433
cbind(ypred, testy)
##               testy
## 173 48.047247  47.7
## 174 23.143626  25.5
## 175  8.944386  10.6
## 176  3.461404   2.0
## 177  6.141724   4.7
## 178  6.741146   6.8
## 179 10.116475   8.6
## 180 11.781558  11.2
## 181 13.553112  13.8
## 182 14.937346  17.7
## 183 28.464430  23.3
## 184 24.774611  29.0
## 185 11.323870  34.8
## 186 46.180519  47.8
## 187  9.960860  11.1
## 188  4.791950   2.9
## 189  9.148401   4.8
## 190  5.900053   5.6
## 191  6.465477   6.2
## 192  6.794227   6.4
## 193  6.004676   6.8
## 194  6.815092   7.1
## 195  7.820161   7.3
## 196  8.016253   7.9
## 197 10.328492   9.2
## 198  9.358988  10.1
## 199 11.897665  10.7
## 200  7.628819  11.2
## 201 11.506780  12.5
## 202 14.621129  14.3
## 203 11.126646  16.0
## 204 16.681274  17.0
## 205 16.686706  18.1
## 206 20.448977  19.4
## 207 23.345195  24.8
## 208 26.978289  27.2
## 209 29.363296  28.4
## 210 29.227825  29.2
## 211 29.045001  31.3
## 212 31.692496  33.8
## 213 37.691766  35.5
## 214 41.439211  42.5
## 215 46.180519  47.8
which.max(abs(ypred - testy))
## [1] 13
rmse(ypred[-13], testy[-13])
## [1] 1.954433
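As an aside (not part of Faraway's code), a similar ridge fit can be obtained with the glmnet package, which also handles the lasso below. This is a minimal sketch assuming glmnet is installed; alpha = 0 requests the ridge penalty, cv.glmnet chooses lambda by cross-validation, and the resulting test error will not exactly match the lm.ridge/GCV numbers above.

library(glmnet)
set.seed(123)
# alpha = 0 gives the ridge penalty; cv.glmnet picks lambda by
# 10-fold cross-validation on the training data
cvridge <- cv.glmnet(trainx, trainy, alpha = 0)
ridgepred <- predict(cvridge, newx = testx, s = "lambda.min")
rmse(ridgepred, testy)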

Lasso

library(lars)

Example with a smaller set of predictors (the state life expectancy data from last week).

Estimating the coefficients for a sequence of tuning parameters:

statedata <- data.frame(state.x77, row.names = state.abb)
lmod <- lars(as.matrix(statedata[, -4]), statedata$Life.Exp)

Choosing one tuning parameter:

plot(lmod)

set.seed(123)
cvlmod <- cv.lars(as.matrix(statedata[, -4]), statedata$Life.Exp)

(s <- cvlmod$index[which.min(cvlmod$cv)])
## [1] 0.6565657

Compare the coefficients in the lasso model:

predict(lmod, s = s, type = "coef", mode = "fraction")$coef
##    Population        Income    Illiteracy        Murder       HS.Grad 
##  2.345262e-05  0.000000e+00  0.000000e+00 -2.398786e-01  3.528871e-02 
##         Frost          Area 
## -1.694932e-03  0.000000e+00

to those from the least squares fit:

coef(lm(Life.Exp ~ Population + Murder + HS.Grad + Frost, statedata))
##   (Intercept)    Population        Murder       HS.Grad         Frost 
##  7.102713e+01  5.013998e-05 -3.001488e-01  4.658225e-02 -5.943290e-03
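The lasso estimates are shrunk toward zero relative to least squares. To line them up directly for the predictors the lasso keeps (a small illustrative addition, not from Faraway, reusing the objects defined above):

# non-zero lasso coefficients next to the least squares estimates
lasso_coef <- predict(lmod, s = s, type = "coef", mode = "fraction")$coef
ols_coef <- coef(lm(Life.Exp ~ Population + Murder + HS.Grad + Frost, statedata))
keep <- names(lasso_coef)[lasso_coef != 0]
cbind(lasso = lasso_coef[keep], ols = ols_coef[keep])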

Back to the meat fat example

Estimating the coefficients for a sequence of tuning parameters:

lassomod <- lars(trainx, trainy)

Choosing one tuning parameter:

set.seed(123)
cvout <- cv.lars(trainx, trainy)

(best_s <- cvout$index[which.min(cvout$cv)])
## [1] 0.01010101

Evaluating the predictive accuracy:

predlars <- predict(lassomod, testx, s = best_s, mode = "fraction")
rmse(testy, predlars$fit)
## [1] 2.132218

Examining which coefficients are non-zero:

predlars <- predict(lassomod, s = best_s, type = "coef", mode = "fraction")
plot(predlars$coef, type = "h", ylab = "Coefficient")

sum(predlars$coef != 0)
## [1] 20
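To see which predictors those are, match the coefficient vector back to the column names of the training matrix (a small addition, not part of Faraway's code):

# names of the predictors with non-zero lasso coefficients
colnames(trainx)[predlars$coef != 0]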

Sensitivity of model selection

Work through code in Faraway 10.3

library(leaps)
source("http://stat552.cwick.co.nz/lecture/fortify-leaps.R")

In class we used regsubsets() to select variables to predict life expectancy from other state level data. In particular, imagine we use an exhaustive search and choose a model with the highest adjusted R-squared:

b <- regsubsets(Life.Exp ~ ., data = statedata)
rs <- summary(b) 
rs$which[which.max(rs$adjr2),]
## (Intercept)  Population      Income  Illiteracy      Murder     HS.Grad 
##        TRUE        TRUE       FALSE       FALSE        TRUE        TRUE 
##       Frost        Area 
##        TRUE       FALSE

It’s possible this choice is sensitive both to individual points and to transformations.

Alaska is unusual:

lmod <- lm(Life.Exp ~ ., data = statedata)
broom::augment(lmod) %>% 
  arrange(desc(.hat))
## # A tibble: 50 x 16
##    .rownames Life.Exp Population Income Illiteracy Murder HS.Grad Frost
##    <chr>        <dbl>      <dbl>  <dbl>      <dbl>  <dbl>   <dbl> <dbl>
##  1 AK            69.3        365   6315        1.5   11.3    66.7   152
##  2 CA            71.7      21198   5114        1.1   10.3    62.6    20
##  3 HI            73.6        868   4963        1.9    6.2    61.9     0
##  4 NV            69.0        590   5149        0.5   11.5    65.2   188
##  5 NM            70.3       1144   3601        2.2    9.7    55.2   120
##  6 TX            70.9      12237   4188        2.2   12.2    47.4    35
##  7 NY            70.6      18076   4903        1.4   10.9    52.7    82
##  8 WA            71.7       3559   4864        0.6    4.3    63.5    32
##  9 OR            72.1       2284   4660        0.6    4.2    60      44
## 10 ND            72.8        637   5087        0.8    1.4    50.3   186
## # … with 40 more rows, and 8 more variables: Area <dbl>, .fitted <dbl>,
## #   .se.fit <dbl>, .resid <dbl>, .hat <dbl>, .sigma <dbl>, .cooksd <dbl>,
## #   .std.resid <dbl>

Without it, the model selected changes:

b <- regsubsets(Life.Exp ~ ., data = statedata, subset = (state.abb != "AK"))
rs <- summary(b) 
rs$which[which.max(rs$adjr2),]
## (Intercept)  Population      Income  Illiteracy      Murder     HS.Grad 
##        TRUE        TRUE       FALSE       FALSE        TRUE        TRUE 
##       Frost        Area 
##        TRUE        TRUE

Notice Area and Population might benefit from a transform:

stripchart(data.frame(scale(statedata)), method = "jitter", las = 2, vertical = TRUE)

If we transform them, the selected model changes:

b <- regsubsets(Life.Exp ~ log(Population) + Income + Illiteracy + Murder + HS.Grad + Frost + log(Area), statedata)
rs <- summary(b)
rs$which[which.max(rs$adjr2), ]
##     (Intercept) log(Population)          Income      Illiteracy 
##            TRUE            TRUE           FALSE           FALSE 
##          Murder         HS.Grad           Frost       log(Area) 
##            TRUE            TRUE            TRUE           FALSE