## Adpated from: 4.7, 5.1 & 5.2 R code
# available at: http://www.stat.columbia.edu/~gelman/arm/software/

library(tidyverse)
library(arm)

# == Data loading and cleaning ====
library(foreign)

brdata <- read.dta("https://stat552.cwick.co.nz/lecture/nes5200_processed_voters_realideo.dta",
  convert.factors = F)

pres <- brdata %>%
  filter(complete.cases(black, female, educ1, age,
    income, state)) %>%
  dplyr::select(income, presvote, year) %>% 
  filter(presvote %in% 1:2) %>% # Bush or Clinton
  mutate(vote = presvote - 1,
    vote_f = factor(vote, levels = c(0, 1), 
      labels =  c("Clinton", "Bush"))) # make 0,1
pres_1992 <- filter(pres, year == 1992)


# == Exploratory plots =====
ggplot(pres_1992, aes(income, vote)) +
  geom_point()


ggplot(pres_1992, aes(income, vote)) +
  geom_jitter()


ggplot(pres_1992, aes(income)) +
  geom_bar(aes(fill = vote_f), position = position_dodge())

ggplot(pres_1992, aes(income)) +
  geom_bar(aes(fill = vote_f), position = "fill") 


# linear regression doesn't do too badly here
# since x is retristricted in range and fitted values
# fall in [0, 1]
ggplot(pres_1992, aes(income, vote)) +
  geom_jitter(position = position_jitter(height = 0.1)) +
  geom_smooth(method = "lm")

ggplot(pres_1992, aes(income, vote)) +
  geom_jitter(position = position_jitter(height = 0.1)) +
  geom_smooth(method = "lm", fullrange = TRUE) +
  xlim(-4, 9) 
# but we know we are violating assumptions

# install.packages("productplots")
library(productplots)
prodplot(pres_1992, ~ vote_f | income) + list(aes(fill = vote_f)) +
  scale_fill_manual(
    values = c("Bush" = "#E41A1C", "Clinton" = "#377EB8")) +
  scale_y_continuous("Proportion") +
  scale_x_continuous("Income", 
    breaks = seq(0, 0.8, length = 5) + 0.1, 
    labels = 1:5)

# == Simple logistic regression fit ====
fit_1 <- glm(vote ~ income, family = binomial(link = "logit"),
  data = pres_1992)
display(fit_1) # function in arm library
summary(fit_1) # alternative

# add fit - for Simple model only
ggplot(pres_1992, aes(income, vote)) + 
  geom_jitter(position = position_jitter(height = 0.1))  +
  geom_smooth(method = "glm", method.args = list(family = "binomial"))

ggplot(pres_1992, aes(income, vote)) + 
  geom_jitter(position = position_jitter(height = 0.1))  +
  geom_smooth(method = "glm", method.args = list(family = "binomial"), 
    fullrange = TRUE) + 
  xlim(-4, 9)

# ==  plots in book ====

# Graph figure 5.1 (a) 
curve(invlogit(fit_1$coef[1] + fit_1$coef[2]*x), 1, 5, 
  ylim = c(-.01, 1.01),
  xlim = c(-2, 8), xaxt = "n", xaxs = "i", mgp = c(2,.5,0),
  ylab = "Pr (Republican vote)", xlab = "Income", lwd = 4)
curve(invlogit(fit_1$coef[1] + fit_1$coef[2]*x), -2, 8, 
  lwd = .5, add = T)
axis (1, 1:5, mgp = c(2,.5,0))
mtext("(poor)", 1, 1.5, at = 1, adj = .5)
mtext("(rich)", 1, 1.5, at = 5, adj = .5)
with(pres_1992, points(jitter(income, .5), jitter(vote, .08), pch = 20, cex = .1))

# Graph figure 5.1 (b)
sim.1 <- sim(fit_1)
curve(invlogit(fit_1$coef[1] + fit_1$coef[2]*x), .5, 5.5, 
  ylim = c(-.01, 1.01),
  xlim = c(.5, 5.5), xaxt = "n", xaxs = "i", mgp = c(2, .5, 0),
  ylab = "Pr (Republican vote)", xlab = "Income", lwd = 1)
for (j in 1:20){
  curve (invlogit(coef(sim.1)[j,1] + coef(sim.1)[j,2]*x), 
    col = "gray", lwd = .5, add = T)
}
curve(invlogit(fit_1$coef[1] + fit_1$coef[2]*x), add = T)
axis(1, 1:5, mgp = c(2, .5, 0))
mtext("(poor)", 1, 1.5, at = 1, adj = .5)
mtext("(rich)", 1, 1.5, at = 5, adj = .5)
with(pres_1992, points(jitter(income, .5), 
  jitter(vote, .08), pch = 20, cex = .1))

# == Interpreting coefficients =====
# backtransform from linear predictor scale to probabilities
invlogit <- function(x) 1/(1 + exp(-x))

# = Interpret at some x =
mean_inc <- with(pres_1992, mean(income, na.rm=T))
invlogit(-1.40 + 0.33*mean_inc)
# Estimated P(support Bush | avg income) = 0.4

# = Interpret change in P for 1 unit change in x, at some x =
invlogit(-1.40 + 0.33*3) - invlogit(-1.40 + 0.33*2)
# An increase in income from category 2 to category 3 is
# associated with an increase in the estimated probability
# of supporting Bush of 0.08

invlogit(-1.40 + 0.33*4) - invlogit(-1.40 + 0.33*3)
# An increase in income from category 3 to category 4 is
# associated with an increase in the estimated probability
# of supporting Bush of 0.08

# = Interpret derivative of P at some x =
# derivative at x = 3.1
logit_p <- (-1.40 + 0.33*3.1)
0.33*exp(logit_p)/(1 + exp(logit_p))^2
# Book says = 0.13 - must be a typo!
# Each "small" unit of increase in income 
# associated with an increase in the estimated probability
# of supporting Bush of 0.08

# = Interpret bound on change in P =
coef(fit_1)[2]/4
# At most a one unit change in income is associated with
# a change in P(Bush) of 0.08

# = Interpret coefficients as odds ratios
coef(fit_1)[2]
# A one unit increase in income is associated with
# a change in the log odds ratio of 0.08

# === Several logistic regressions ====

# fit a model to each year
many_fits <- pres %>% 
  group_by(year) %>% 
  nest() %>% 
  mutate(
    model = map(data, glm, formula = vote ~ income, 
      family = binomial(link = "logit")),
    coef = map(model, broom::tidy)
  ) %>% 
  unnest(coef)

many_fits %>% 
  filter(term == "income") %>% 
  ggplot(aes(year, estimate)) +
    geom_pointrange(aes(ymin = estimate - 2*std.error, 
      ymax = estimate + 2*std.error)) +
    geom_hline(yintercept = 0, colour = "grey40", linetype = "dashed") +
    ylab("Estimated slope on Income")

# == Other thoughts ====
# could treat income as categorical
fit_2 <- glm(vote ~ factor(income) - 1, 
  family=binomial(link="logit"),
  data = pres_1992)
summary(fit_2)

pred_df <- data.frame(income = 1:5)
preds <- predict(fit_2, newdata = pred_df, type = "response", se = TRUE)
pred_df <- cbind(pred_df, preds)

ggplot(pres_1992, aes(income, vote)) +
  geom_jitter(position = position_jitter(height = 0.1))  +
  geom_pointrange(
    aes(y = fit, ymin = fit - 2*se.fit, ymax = fit + 2*se.fit), 
    colour = "red", data = pred_df) +
  ylab("P(support Bush)")