library(ggplot2)
library(reshape2)
library(faraway)
# === Multicollinearity ====
data(seatpos, package = "faraway")
lmod <- lm(hipcenter ~ ., data = seatpos)
summary(lmod)
qplot(Ht, hipcenter, data = seatpos)
# marginally, this seems like a really strong relationship
# correlations between explanatory variables ===
# only picks up pairwise collinearity
round(cor(seatpos[, -9]), 2)
# or look graphically
image(as.matrix(cor(seatpos[, -9])))
# or more pretty
cors_x <- as.data.frame(cor(seatpos[, -9]))
cors_x$V1 <- factor(rownames(cors_x), levels = colnames(cors_x))
cors_xm <- melt(cors_x, id.vars = "V1")
qplot(V1, variable, data = cors_xm, geom = "tile",
fill = value) +
scale_fill_gradient2()
# the most pairwise correlated
qplot(HtShoes, Ht, data = seatpos)
# eigenvaues of X ====
x <- model.matrix(lmod)[, -1]
e <- eigen(t(x) %*% x)
e$val
# condition numbers
sqrt(e$val[1]/e$val)
# variance inflation factors ====
# for one varible, 1/(1 - R.squared_i)
# regress on explantory on the others
lm_x1 <- lm(Age ~ . - hipcenter, data = seatpos)
summary(lm_x1)
r_sq1 <- summary(lm_x1)$r.squared
1/(1-r_sq1)
# or for all varss
vif(lmod)
# is there a cutoff?
# estimates are sensitive to noise ====
lmod_noisy <- lm(I(hipcenter + 10 * rnorm(38)) ~ ., data = seatpos)
coef(lmod_noisy)
coef(lmod)
# === Solutions ====
# test multiple related terms at once
# i.e. does height matter?
lmod3 <- lm(hipcenter ~ Age + Weight + Arm + Thigh + Leg, seatpos)
anova(lmod3, lmod)
# no evidence for relationship with height after accounting for
# length of limbs, and age and weight
lmod4 <- lm(hipcenter ~ Age + Weight + Ht + Seated + HtShoes, seatpos)
anova(lmod4, lmod)
# no evidence for a relationship between hip center and length of
# limbs after accounting for height
lmod5 <- lm(hipcenter ~ Age + Weight, seatpos)
anova(lmod5, lmod)
# evidence for at least one height or length of limbs variables
add1(lm(hipcenter ~ Age + Weight, data = seatpos), lmod, test = "F")
# this considers all models with Age and Weight and one other variable.
# Any one of the height or length of limbs terms are associated with
# seat positions after accounting for age and weight.
ss <- add1(lm(hipcenter ~ Age + Weight, data = seatpos), lmod, test = "F")
tss <- sum((seatpos$hipcenter - mean(seatpos$hipcenter))^2)
R_sq <- (tss - ss[, "RSS"])/tss
names(R_sq) <- rownames(ss)
R_sq
# and any one of HtShoes, Ht or Leg alone would get a similar
# R^2 to the full model
# "Orthogonalising" can help too...
seatpos$shoes <- with(seatpos, HtShoes - Ht)
with(seatpos, cor(Ht, HtShoes))
with(seatpos, cor(Ht, shoes))
fit_orth <- lm(hipcenter ~ . - HtShoes, data = seatpos)
# exact same fit
summary(fit_orth)
# but Ht and shoes are now not correlated, and VIFs lower.
vif(fit_orth)