# === Lab #9
# Concepts:
# * Plotting and predicting with transformations
# * Lack of fit tests
# === Interpretation after log transforms ====
library(ggplot2)
library(Sleuth3)
library(plyr)
?ex0921
# "The goal is to see whether the distribution of species' ingestion rates
# depends on the percentage of organic matter in the food, after accounting
# for the effect of species weight and to describe the association."
# exploratory analysis
library(GGally)
# Scatterplot matrix of everything except column 1 (presumably the species
# label -- confirm against ?ex0921): points in the lower triangle, marginal
# densities/bars on the diagonal, upper triangle left blank.
ggpairs(ex0921[, -1],
upper = "blank",
lower = list(continuous = "points", combo = "dot"),
diag = list(continuous = "density", discrete = "bar"),
axisLabels='show'
)
# order of magnitude differences + clumping in low end suggests log transform
# easiest way for this library is to produce logged versions of the vars
# plyr::colwise(log, is.numeric) builds a function that applies log() to each
# numeric column and leaves the non-numeric columns alone.
ex0921_log <- colwise(log, is.numeric)(ex0921)
# same pairs plot, now on the log scale
ggpairs(ex0921_log,
upper = "blank",
lower = list(continuous = "points", combo = "dot"),
diag = list(continuous = "density", discrete = "bar"),
axisLabels='show'
)
# a tentative model: additive in log(Organic) and log(Weight) on the log
# scale, i.e. multiplicative on the original scale
fit_ingest <- lm(log(Ingestion) ~ log(Organic) + log(Weight), data = ex0921)
summary(fit_ingest)
# Residuals vs fitted. fortify() turns the lm fit into a data frame with
# .fitted/.resid columns; passing the model object itself as `data` relied
# on ggplot2's automatic fortification, which is deprecated (and is already
# done explicitly for the histogram below).
qplot(.fitted, .resid, data = fortify(fit_ingest))
# looks ok, check other diagnostics too
summary(fit_ingest)
# Interpret the coefficient on log(Weight) on the log scale
# Interpret the coefficient on log(Weight) on the original scale
# Interpret the effect of Weight on the original scale
# histogram of residuals, assuming response is symmetric on log scale
qplot(.resid, data = fortify(fit_ingest))
# ? hard to tell, maybe suspicious here...
# (check later: box-cox transform)
# === Plotting predictions with transforms ====
# Let's say we want a plot of Ingestion rates as a function
# of Organic material for an average weight species
# our previous approach works just as well here:
# a prediction grid holding Weight at its mean, Organic from 1 to 100
pred_df <- with(ex0921, expand.grid(Weight = mean(Weight),
Organic = seq(1, 100, 1)))
predict(fit_ingest, pred_df)
# but these are predictions of log(Ingestion)
# let's grab upper and lower limits too
# interval = "confidence" adds fit/lwr/upr columns, used by the plots below
pred_df <- cbind(pred_df, predict(fit_ingest, pred_df, interval = "confidence"))
qplot(log(Organic), fit, data = pred_df, geom = "line") +
ylab("Estimated mean log(Ingestion)")
# linear on the log-log scale
# just back transform the estimate to put it on the original scale
# NOTE(review): exp() of an estimated mean of log(Ingestion) strictly
# estimates the median Ingestion, not the mean -- worth flagging in writeups
qplot(log(Organic), exp(fit), data = pred_df, geom = "line") +
ylab("Estimated mean Ingestion")
# we can also backtransform confidence limits
qplot(log(Organic), exp(fit), data = pred_df, geom = "line") +
geom_ribbon(aes(ymax = exp(upr), ymin = exp(lwr)), alpha = 0.2) +
ylab("Estimated mean Ingestion")
# and plot against untransformed Organic
qplot(Organic, exp(fit), data = pred_df, geom = "line") +
geom_ribbon(aes(ymax = exp(upr), ymin = exp(lwr)), alpha = 0.2) +
ylab("Estimated mean Ingestion")
# === Lack of fit F-test example ====
?ex1030
# some exploration
qplot(Age, WeeklyEarnings, data = ex1030, alpha = I(0.2))
qplot(factor(EducationCode), WeeklyEarnings,
data = ex1030, geom = "boxplot")
qplot(Race, WeeklyEarnings, data = ex1030, geom = "boxplot") +
facet_wrap(~ Region)
# What else?
# main-effects model on log earnings, plus a Race x Region interaction
fit_earn <- lm(log(WeeklyEarnings) ~ Race*Region + Age +
EducationCategory + MetropolitanStatus, data = ex1030)
# residuals vs fitted; fortify() converts the lm fit to a data frame
# (passing the model object as `data` relied on deprecated auto-fortification)
qplot(.fitted, .resid, data = fortify(fit_earn), alpha = I(0.2))
# some bad low outliers (consequence of the log transform) hopefully
# not too influential (should check...)
# == saturated model
# like one mean per group, where group is every combination of vars
ex1030$groups <- with(ex1030, interaction(Race, Region, Age, EducationCategory, MetropolitanStatus))
length(unique(ex1030$groups)) # 2224 combinations
table(tapply(ex1030$WeeklyEarnings, ex1030$groups, length))
# 1241 with only one observation in them
# fit_earn <- lm(log(WeeklyEarnings) ~ Race*Region*factor(Age)*EducationCategory*MetropolitanStatus, data = ex1030)
# Takes too long
# rather than fitting with lm (it takes awhile), I'm going to find the SS manually
# ave() replaces each observation with its group's mean of log(WeeklyEarnings)
ex1030$grp_avg <- with(ex1030, ave(log(WeeklyEarnings), groups))
# residual SS of the saturated (one-mean-per-group) model
RSS_sat <- with(ex1030, sum((log(WeeklyEarnings) - grp_avg)^2))
RSS_earn <- sum(residuals(fit_earn)^2)
n <- nrow(ex1030)
p_sat <- length(unique(ex1030$groups))
p_earn <- length(coef(fit_earn))
# lack of fit F-test, check my work!
df_top <- (p_sat - p_earn)
df_bottom <- (n - p_sat)
# named F_stat rather than F: assigning to F would mask the logical
# constant FALSE and silently break any later code using bare F
F_stat <- ((RSS_earn - RSS_sat) / df_top) / (RSS_sat / df_bottom)
# upper-tail p-value; lower.tail = FALSE avoids the cancellation error
# that 1 - pf(...) suffers for very small p-values
pf(F_stat, df_top, df_bottom, lower.tail = FALSE)
# I'm surprised by this a little, I suspect because df_bottom is small (relative to n)
# we have very little power to detect problems (but also check my work)
# It's probably more common to use the lack-of-fit to examine more directed questions
# like: is it OK to treat Age as linear?
# richer model: one mean per Age level instead of a linear Age term
fit_earnAge <- lm(log(WeeklyEarnings) ~ Race*Region + factor(Age) +
EducationCategory + MetropolitanStatus, data = ex1030)
# F-test of linear Age (reduced) against factor Age (full)
anova(fit_earn, fit_earnAge)
# nope
# try a quadratic in Age instead
fit_earnAge2 <- lm(log(WeeklyEarnings) ~ Race*Region + Age + I(Age^2) +
EducationCategory + MetropolitanStatus, data = ex1030)
# quadratic (reduced) against factor Age (full)
anova(fit_earnAge2, fit_earnAge)
# but quadratic might be enough
# prediction grid: one demographic profile, Age swept over 16-85
pred_df <- data.frame(Race = "White", Region = "South",
MetropolitanStatus = "Metropolitan", EducationCategory = "BachelorsDegree",
Age = seq(16, 85, 1))
pred_df_q <- cbind(pred_df, predict(fit_earnAge2, pred_df, interval = "confidence"))
# back-transformed fit and confidence band from the quadratic model
qplot(Age, exp(fit), data = pred_df_q, geom = "line") +
geom_ribbon(aes(ymax = exp(upr), ymin = exp(lwr)), alpha = 0.2) +
ylab("Estimated mean weekly earnings")
# I might lean towards something slightly less parametric
library(splines)
# bs(Age, 4): a 4-df B-spline basis for Age in place of the quadratic
fit_earnAge3 <- lm(log(WeeklyEarnings) ~ Race*Region + bs(Age, 4) +
EducationCategory + MetropolitanStatus, data = ex1030)
# same prediction grid as before, now through the spline model
pred_df_sp <- cbind(pred_df, predict(fit_earnAge3, pred_df, interval = "confidence"))
qplot(Age, exp(fit), data = pred_df_sp, geom = "line") +
geom_ribbon(aes(ymax = exp(upr), ymin = exp(lwr)), alpha = 0.2) +
ylab("Estimated mean weekly earnings")
# but don't forget about where the data is:
# overlay a jittered strip of the observed Ages (at a fixed y of 500)
last_plot() + geom_jitter(aes(Age, 500), data = ex1030,
position = position_jitter(height = 50), alpha = 0.2)
# using nrow & length is lazy, because there might be missing values
# really I should use something like sum(!is.na( )), but I get away with it here since
any(is.na(ex1030))