library(tidyverse)
library(faraway)
set.seed(19198)
# === Multicollinearity ====
data(seatpos, package = "faraway")
lmod <- lm(hipcenter ~ ., data = seatpos)
summary(lmod)
ggplot(seatpos, aes(Ht, hipcenter)) +
geom_point()
# marginally, this seems like a really strong relationship
# correlations between explanatory variables ===
# only picks up pairwise collinearity
round(cor(seatpos[, -9]), 2)
# or look graphically
cor_tbl <- cor(seatpos[, -9]) %>%
as_tibble() %>%
add_column(variable_1 = names(.)) %>%
gather(key = "variable_2", value = "correlation", -variable_1)
cor_tbl %>%
ggplot(aes(variable_1, variable_2)) +
geom_tile(aes(fill = correlation)) +
scale_fill_gradient2()
ggplot(seatpos, aes(Ht, HtShoes)) +
geom_point() +
geom_abline()
# the most pairwise correlated
# eigenvaues of X ====
X <- model.matrix(lmod)[, -1]
e <- eigen(t(X) %*% X)
e$val
# zeros would indicate exact collinearity
# condition numbers - relative sizes of eigenvalues
e$val[1]/e$val
sqrt(e$val[1]/e$val)
# > 30 considered large
# variance inflation factors ====
# for one varible, 1/(1 - R.squared_i)
# regress on explantory on the others
lm_x1 <- lm(Age ~ . - hipcenter, data = seatpos)
summary(lm_x1)
r_sq1 <- summary(lm_x1)$r.squared
1/(1-r_sq1)
# or for all varss
vif(lmod)
# is there a cutoff? rules of thumb 5? 10?
# estimates are sensitive to noise ====
lmod_noisy <- lm(I(hipcenter + 10 * rnorm(38)) ~ ., data = seatpos)
coef(lmod_noisy)
coef(lmod)
# === Solutions ====
# test multiple related terms at once
# i.e. does height matter?
lmod3 <- lm(hipcenter ~ Age + Weight + Arm + Thigh + Leg, seatpos)
anova(lmod3, lmod)
# no evidence for relationship with height after accounting for
# length of limbs, and age and weight
lmod4 <- lm(hipcenter ~ Age + Weight + Ht + Seated + HtShoes, seatpos)
anova(lmod4, lmod)
# no evidence for a relationship between hip center and length of
# limbs after accounting for height
lmod5 <- lm(hipcenter ~ Age + Weight, seatpos)
anova(lmod5, lmod)
# evidence for at least one height or length of limbs variables
add1(lm(hipcenter ~ Age + Weight, data = seatpos), lmod, test = "F")
# this considers all models with Age and Weight and one other variable.
# Any one of the height or length of limbs terms are associated with
# seat positions after accounting for age and weight.
ss <- add1(lm(hipcenter ~ Age + Weight, data = seatpos), lmod, test = "F")
tss <- sum((seatpos$hipcenter - mean(seatpos$hipcenter))^2)
R_sq <- (tss - ss[, "RSS"])/tss
names(R_sq) <- rownames(ss)
R_sq
# and any one of HtShoes, Ht or Leg alone would get a similar
# R^2 to the full model
# "Orthogonalising" can help too...
seatpos$shoes <- with(seatpos, HtShoes - Ht)
with(seatpos, cor(Ht, HtShoes))
with(seatpos, cor(Ht, shoes))
fit_orth <- lm(hipcenter ~ . - HtShoes, data = seatpos)
# exact same fit
summary(fit_orth)
# but Ht and shoes are now not correlated, and VIFs lower.
vif(fit_orth)