library(ggplot2)
data(gala, package = "faraway")
source(url("http://stat552.cwick.co.nz/code/stat_qqline.R")) # my add a
# line to a q-q plot in ggplot2 code
fit_gala <- lm(Species ~ Area + Elevation + Nearest + Scruz + Adjacent,
data = gala)
# === Residual plots ====
# we could pull out things by hand,
residuals(fit_gala)
fitted(fit_gala)
# or make use of the fortify.lm method in ggplot2
fortify(fit_gala)
# COMMENT: the augment function in the broom package
# would be my reccomendation now.
# more generally if there are columns that weren't in the model use
fortify(model = fit_gala, data = gala)
# fortify methods are designed to turn complicated objects into data
# frames that are useful for plotting
# Examine the data.frames that are output
# Save the result
gala_fit_df <- fortify(fit_gala, gala)
# Then it is easy to make residual plots
# = residuals versus fitted
qplot(.fitted, .resid, data = gala_fit_df)
# = residuals versus explanatories
qplot(Area, .resid, data = gala_fit_df)
qplot(Elevation, .resid, data = gala_fit_df)
qplot(Nearest, .resid, data = gala_fit_df)
qplot(Scruz, .resid, data = gala_fit_df)
# = q-q plot
qplot(sample = .resid, data = gala_fit_df) + stat_qqline()
# a shortcut: you don't need to explicity fortify, just put the model in
# the data argument
qplot(sample = .resid, data = fit_gala) + stat_qqline()
# What problems do you see? Which is the biggest problem?
# Remember the rough hierarchy from class
# Can you fix it?
# Try it and rexamine plots
# === FIX ME === #
fit_gala_new <- lm( ~ ,
data = gala)
qplot(.fitted, .resid, data = fit_gala_new)
qplot(sample = .resid, data = fit_gala_new) + stat_qqline()
# === Calibrating your eyes ====
# It's useful to look at plots where the assumptions are satisfied
# to help learn what can be attributable to sampling variability.
# For example, once we've transformed the Species with a square root
# the Normal q-q plot looks much better, but is there still evidence
# of Non-Normality.
# The best comparisons are made when everything else is the same: same
# sample size, same sample mean, same sample sd.
obs_res <- residuals(fit_gala_new)
fake_errs <- rnorm(length(obs_res), mean = mean(obs_res), sd = sd(obs_res))
qplot(sample = fake_errs) + stat_qqline()
# even better let's wrap that into a function
fake_qq <- function(obs_res){
fake_errs <- rnorm(length(obs_res), mean = mean(obs_res),
sd = sd(obs_res))
qplot(sample = fake_errs) + stat_qqline()
}
fake_qq(obs_res)
fake_qq(obs_res)
# Even better create a lineup with the real data in a random position
# Does the data look guilty?
# Don't worry about understanding this code, unless you want to.
library(gridExtra)
plot_list <- lapply(1:10, function(x) fake_qq(obs_res))
real_ind <- sample(10, size = 1)
plot_list[[real_ind]] <- qplot(sample = obs_res) + stat_qqline()
do.call(grid.arrange, plot_list)
# There is actually a whole package devoted to making these lineups
# see: http://cran.r-project.org/web/packages/nullabor/vignettes/nullabor.html
# install.packages(nullabor)
# install.packages("DEoptimR")
library(nullabor)
fit_sqrt_df <- fortify(fit_gala_new)
# first we can create a set of null datasets
# in this case:
# generating the contents of the column, .resid to be Normal
lineup_dat <- lineup(null_dist(".resid", dist = "normal"),
fit_sqrt_df)
head(lineup_dat)
# then use the data to make any kind of diagnostic plot we want
qplot(.fitted, .resid, data = lineup_dat, geom = 'point') +
facet_wrap(~ .sample)
qplot(Adjacent, .resid, data = lineup_dat, geom = 'point') +
facet_wrap(~ .sample)
qplot(sample = .resid, data = lineup_dat) +
stat_qqline() +
facet_wrap(~ .sample)
# find true data with decrypt
# === Practice data ====
# Examine the residual plots for the following 3 model fits
# Can you fix any problems you find?
data(cheddar, package = "faraway")
?cheddar
fit_cheddar <- lm(taste ~ Acetic + H2S + Lactic, data = cheddar)
data(teengamb, package = "faraway")
fit_teen <- lm(gamble ~ income + status + income, data = teengamb)
# I purposely left out sex from this model. Can you tell if that is a
# problem from just examing the residual plots?
data(cornnit, package = "faraway")
fit_corn <- lm(yield ~ nitrogen, data = cornnit)