library(faraway)
library(tidyverse)
library(broom)
library(leaps)
source("http://stat552.cwick.co.nz/lecture/fortify-leaps.R")
# Load the 1977 US state statistics and coerce the matrix to a data frame
data(state)
state_data <- data.frame(state.x77)

# Exhaustively fit subsets of predictors for Life.Exp, up to `nvmax = 8`
# variables, keeping the best `nbest = 35` models of each size.
# With only 7 candidate predictors, max(choose(7, k)) = 35, so this in
# fact retains EVERY model (see note below) -- the stale "best 5" wording
# from an earlier version of this script was wrong.
# NOTE: bound to `all_models` rather than `all` so we don't mask base::all().
all_models <- regsubsets(Life.Exp ~ ., data = state_data,
  method = "exhaustive", nbest = 35, nvmax = 8)
# I fit all models here, you probably shouldn't if
# there are lots of variables.
summary(all_models)
plot(all_models)
# grab details on all models: one row per fitted model
# (fortify method sourced from fortify-leaps.R above)
models <- fortify(all_models)
head(models)
# regsubsets reports bic/cp/adjr2 but not AIC, so add it by hand:
# AIC = n * log(RSS / n) + 2 * p  (up to an additive constant, which
# doesn't matter for ranking models against each other).
n_obs <- nrow(state_data)
models <- models %>%
  mutate(aic = n_obs * log(rss / n_obs) + 2 * size)

# AIC against model size; color marks models that include Murder
# (a coefficient within 1e-6 of zero means the variable was excluded)
ggplot(models, aes(x = size, y = aic)) +
  geom_point(aes(color = abs(Murder_coef) > 1e-6))

# Small helper: scatter any one criterion against model size
plot_metric <- function(metric) {
  ggplot(models, aes(x = size, y = {{ metric }})) +
    geom_point()
}
# Plot each of the other metrics versus size
plot_metric(bic)
plot_metric(cp)
plot_metric(adjr2)
# == What are the best five models according to each metric? ====
# Lower is better for AIC, BIC, and Mallows' Cp; higher is better for
# adjusted R^2 (hence desc()).
models %>% arrange(aic) %>% slice_head(n = 5)
models %>% arrange(bic) %>% slice_head(n = 5)
models %>% arrange(cp) %>% slice_head(n = 5)
models %>% arrange(desc(adjr2)) %>% slice_head(n = 5)

# == Why do all the metrics agree on the rank of size 2 models? ====
models %>% filter(size == 2)
# for fixed p, each criterion is a monotone function of RSS alone,
# so they all order same-size models identically