library(tidyverse)
data(chredlin, package = "faraway")
head(chredlin)

# involact = response
# race, fires, everything else except zipcode

# Is race a significant predictor of refusal rate (through our proxy of 
# applicants to FAIR program), after controlling for other variables that may impact insurance company decisions?
library(GGally)
ggpairs(chredlin)

# or go through individually, e.g.
ggplot(chredlin, aes(race, fire)) +
  geom_point()

# a numeric summary
summary(chredlin)

# I like this package for summaries too
library(skimr)
skim(chredlin)

# Verify observation: positive association between race and applications 
# to FAIR 
# Graphically?
ggplot(chredlin, aes(race, involact)) +
  geom_point() +
  geom_smooth(method = "lm")

# With a model?
lm(involact ~ race, data = chredlin) %>% 
  summary()

# Ecological correlation?
# We have measurements at an aggregate level: zip code.  Any associations
# we find may not be accurate of indivdual level relationships.

# The insurance company might argue:
# this is just that zips with high minority happen to be
# areas with lots of theft and fires
ggplot(chredlin, aes(race, theft0)) + 
  geom_point()
ggplot(chredlin, aes(race, fire)) + 
  geom_point()

# We might address their argument by
# "adjusting" for these "risk" factors
# Which ones should be adjusted for?
fit <- lm(involact ~ log(income) + fire + theft + age, data = chredlin)

# should we inlude income?
fit_race <- lm(race  ~ log(income) + fire + theft + age, data = chredlin)
# zipcode income is correlated with zipcode race 

# a partial regression plot
ggplot(mapping = aes(residuals(fit_race), residuals(fit))) +
  geom_point()


# One possible model
fit_2 <- lm(involact ~ race + log(income) + fire + theft + age, 
  data = chredlin)
summary(fit_2)
# race is significant

# diagnostics
fit_2_diag <- broom::augment(fit_2)

ggplot(fit_2_diag, aes(.fitted, .resid)) +
  geom_point()

ggplot(fit_2_diag, aes(sample = .resid)) +
  geom_qq() +
  geom_qq_line()

# sensitivity to specification of model
listcombo <- unlist(sapply(0:4, 
  function(x) combn(4, x, simplify = FALSE)), recursive = FALSE)
predterms <- lapply(listcombo, 
  function(x) paste(c("race", c("fire","theft","age","income")[x]), collapse="+"))

coefm <- matrix(NA, 16, 2)
for(i in 1:16){
  lmi <- lm(as.formula(paste("involact ~ ", predterms[[i]])), data = chredlin)
  coefm[i, ] <- summary(lmi)$coef[2, c(1, 4)]
}
rownames(coefm) <- predterms
colnames(coefm) <- c("beta","pvalue")
round(coefm,4)

# influence
ggplot(fit_2_diag, aes(reorder(rownames(chredlin), .cooksd), .cooksd)) + 
  geom_point() +
  coord_flip()
faraway::vif(fit_2)

ggplot(fit_2_diag, aes(reorder(rownames(chredlin), .hat), .hat)) +
  geom_point() +
  coord_flip()
2*length(coef(fit))/nrow(chredlin)

# coefficient changes due to leaving out obs
coef_change <- data.frame(lm.influence(fit_2)$coef)
ggplot(coef_change, aes(row.names(coef_change), race)) + 
  geom_linerange(aes(ymax = 0, ymin = race)) +
  xlab("ZIP") + geom_hline(yintercept = 0) +
  ggtitle("Leave-one-out change in Race coefficient") +
  theme(axis.text.x = element_text(angle = 90))  

ggplot(coef_change, aes(x = fire, y = theft)) + 
  geom_text(label = row.names(coef_change))

chredlin[c("60607", "60610"),]
match(c("60607", "60610"), row.names(chredlin))
lmode <- lm(involact ~ race + fire + theft + age + log(income), chredlin, 
  subset = -c(6, 24))
summary(lmode)