library(tidyverse) data(chredlin, package = "faraway") head(chredlin) # involact = response # race, fires, everything else except zipcode # Is race a significant predictor of refusal rate (through our proxy of # applicants to FAIR program), after controlling for other variables that may impact insurance company decisions? library(GGally) ggpairs(chredlin) # or go through individually, e.g. ggplot(chredlin, aes(race, fire)) + geom_point() # a numeric summary summary(chredlin) # I like this package for summaries too library(skimr) skim(chredlin) # Verify observation: positive association between race and applications # to FAIR # Graphically? ggplot(chredlin, aes(race, involact)) + geom_point() + geom_smooth(method = "lm") # With a model? lm(involact ~ race, data = chredlin) %>% summary() # Ecological correlation? # We have measurements at an aggregate level: zip code. Any associations # we find may not be accurate of indivdual level relationships. # The insurance company might argue: # this is just that zips with high minority happen to be # areas with lots of theft and fires ggplot(chredlin, aes(race, theft0)) + geom_point() ggplot(chredlin, aes(race, fire)) + geom_point() # We might address their argument by # "adjusting" for these "risk" factors # Which ones should be adjusted for? fit <- lm(involact ~ log(income) + fire + theft + age, data = chredlin) # should we inlude income? fit_race <- lm(race ~ log(income) + fire + theft + age, data = chredlin) # zipcode income is correlated with zipcode race # a partial regression plot ggplot(mapping = aes(residuals(fit_race), residuals(fit))) + geom_point() # One possible model fit_2 <- lm(involact ~ race + log(income) + fire + theft + age, data = chredlin) summary(fit_2) # race is significant # diagnostics fit_2_diag <- broom::augment(fit_2) ggplot(fit_2_diag, aes(.fitted, .resid)) + geom_point() ggplot(fit_2_diag, aes(sample = .resid)) + geom_qq() + geom_qq_line() # sensitivity to specification of model listcombo <- unlist(sapply(0:4, function(x) combn(4, x, simplify = FALSE)), recursive = FALSE) predterms <- lapply(listcombo, function(x) paste(c("race", c("fire","theft","age","income")[x]), collapse="+")) coefm <- matrix(NA, 16, 2) for(i in 1:16){ lmi <- lm(as.formula(paste("involact ~ ", predterms[[i]])), data = chredlin) coefm[i, ] <- summary(lmi)$coef[2, c(1, 4)] } rownames(coefm) <- predterms colnames(coefm) <- c("beta","pvalue") round(coefm,4) # influence ggplot(fit_2_diag, aes(reorder(rownames(chredlin), .cooksd), .cooksd)) + geom_point() + coord_flip() faraway::vif(fit_2) ggplot(fit_2_diag, aes(reorder(rownames(chredlin), .hat), .hat)) + geom_point() + coord_flip() 2*length(coef(fit))/nrow(chredlin) # coefficient changes due to leaving out obs coef_change <- data.frame(lm.influence(fit_2)$coef) ggplot(coef_change, aes(row.names(coef_change), race)) + geom_linerange(aes(ymax = 0, ymin = race)) + xlab("ZIP") + geom_hline(yintercept = 0) + ggtitle("Leave-one-out change in Race coefficient") + theme(axis.text.x = element_text(angle = 90)) ggplot(coef_change, aes(x = fire, y = theft)) + geom_text(label = row.names(coef_change)) chredlin[c("60607", "60610"),] match(c("60607", "60610"), row.names(chredlin)) lmode <- lm(involact ~ race + fire + theft + age + log(income), chredlin, subset = -c(6, 24)) summary(lmode)