## Adpated from: 4.7, 5.1 & 5.2 R code # available at: http://www.stat.columbia.edu/~gelman/arm/software/ library(tidyverse) library(arm) # == Data loading and cleaning ==== library(foreign) brdata <- read.dta("https://stat552.cwick.co.nz/lecture/nes5200_processed_voters_realideo.dta", convert.factors = F) pres <- brdata %>% filter(complete.cases(black, female, educ1, age, income, state)) %>% dplyr::select(income, presvote, year) %>% filter(presvote %in% 1:2) %>% # Bush or Clinton mutate(vote = presvote - 1, vote_f = factor(vote, levels = c(0, 1), labels = c("Clinton", "Bush"))) # make 0,1 pres_1992 <- filter(pres, year == 1992) # == Exploratory plots ===== ggplot(pres_1992, aes(income, vote)) + geom_point() ggplot(pres_1992, aes(income, vote)) + geom_jitter() ggplot(pres_1992, aes(income)) + geom_bar(aes(fill = vote_f), position = position_dodge()) ggplot(pres_1992, aes(income)) + geom_bar(aes(fill = vote_f), position = "fill") # linear regression doesn't do too badly here # since x is retristricted in range and fitted values # fall in [0, 1] ggplot(pres_1992, aes(income, vote)) + geom_jitter(position = position_jitter(height = 0.1)) + geom_smooth(method = "lm") ggplot(pres_1992, aes(income, vote)) + geom_jitter(position = position_jitter(height = 0.1)) + geom_smooth(method = "lm", fullrange = TRUE) + xlim(-4, 9) # but we know we are violating assumptions # install.packages("productplots") library(productplots) prodplot(pres_1992, ~ vote_f | income) + list(aes(fill = vote_f)) + scale_fill_manual( values = c("Bush" = "#E41A1C", "Clinton" = "#377EB8")) + scale_y_continuous("Proportion") + scale_x_continuous("Income", breaks = seq(0, 0.8, length = 5) + 0.1, labels = 1:5) # == Simple logistic regression fit ==== fit_1 <- glm(vote ~ income, family = binomial(link = "logit"), data = pres_1992) display(fit_1) # function in arm library summary(fit_1) # alternative # add fit - for Simple model only ggplot(pres_1992, aes(income, vote)) + geom_jitter(position = position_jitter(height = 0.1)) + geom_smooth(method = "glm", method.args = list(family = "binomial")) ggplot(pres_1992, aes(income, vote)) + geom_jitter(position = position_jitter(height = 0.1)) + geom_smooth(method = "glm", method.args = list(family = "binomial"), fullrange = TRUE) + xlim(-4, 9) # == plots in book ==== # Graph figure 5.1 (a) curve(invlogit(fit_1$coef[1] + fit_1$coef[2]*x), 1, 5, ylim = c(-.01, 1.01), xlim = c(-2, 8), xaxt = "n", xaxs = "i", mgp = c(2,.5,0), ylab = "Pr (Republican vote)", xlab = "Income", lwd = 4) curve(invlogit(fit_1$coef[1] + fit_1$coef[2]*x), -2, 8, lwd = .5, add = T) axis (1, 1:5, mgp = c(2,.5,0)) mtext("(poor)", 1, 1.5, at = 1, adj = .5) mtext("(rich)", 1, 1.5, at = 5, adj = .5) with(pres_1992, points(jitter(income, .5), jitter(vote, .08), pch = 20, cex = .1)) # Graph figure 5.1 (b) sim.1 <- sim(fit_1) curve(invlogit(fit_1$coef[1] + fit_1$coef[2]*x), .5, 5.5, ylim = c(-.01, 1.01), xlim = c(.5, 5.5), xaxt = "n", xaxs = "i", mgp = c(2, .5, 0), ylab = "Pr (Republican vote)", xlab = "Income", lwd = 1) for (j in 1:20){ curve (invlogit(coef(sim.1)[j,1] + coef(sim.1)[j,2]*x), col = "gray", lwd = .5, add = T) } curve(invlogit(fit_1$coef[1] + fit_1$coef[2]*x), add = T) axis(1, 1:5, mgp = c(2, .5, 0)) mtext("(poor)", 1, 1.5, at = 1, adj = .5) mtext("(rich)", 1, 1.5, at = 5, adj = .5) with(pres_1992, points(jitter(income, .5), jitter(vote, .08), pch = 20, cex = .1)) # == Interpreting coefficients ===== # backtransform from linear predictor scale to probabilities invlogit <- function(x) 1/(1 + exp(-x)) # = Interpret at some x = mean_inc <- with(pres_1992, mean(income, na.rm=T)) invlogit(-1.40 + 0.33*mean_inc) # Estimated P(support Bush | avg income) = 0.4 # = Interpret change in P for 1 unit change in x, at some x = invlogit(-1.40 + 0.33*3) - invlogit(-1.40 + 0.33*2) # An increase in income from category 2 to category 3 is # associated with an increase in the estimated probability # of supporting Bush of 0.08 invlogit(-1.40 + 0.33*4) - invlogit(-1.40 + 0.33*3) # An increase in income from category 3 to category 4 is # associated with an increase in the estimated probability # of supporting Bush of 0.08 # = Interpret derivative of P at some x = # derivative at x = 3.1 logit_p <- (-1.40 + 0.33*3.1) 0.33*exp(logit_p)/(1 + exp(logit_p))^2 # Book says = 0.13 - must be a typo! # Each "small" unit of increase in income # associated with an increase in the estimated probability # of supporting Bush of 0.08 # = Interpret bound on change in P = coef(fit_1)[2]/4 # At most a one unit change in income is associated with # a change in P(Bush) of 0.08 # = Interpret coefficients as odds ratios coef(fit_1)[2] # A one unit increase in income is associated with # a change in the log odds ratio of 0.08 # === Several logistic regressions ==== # fit a model to each year many_fits <- pres %>% group_by(year) %>% nest() %>% mutate( model = map(data, glm, formula = vote ~ income, family = binomial(link = "logit")), coef = map(model, broom::tidy) ) %>% unnest(coef) many_fits %>% filter(term == "income") %>% ggplot(aes(year, estimate)) + geom_pointrange(aes(ymin = estimate - 2*std.error, ymax = estimate + 2*std.error)) + geom_hline(yintercept = 0, colour = "grey40", linetype = "dashed") + ylab("Estimated slope on Income") # == Other thoughts ==== # could treat income as categorical fit_2 <- glm(vote ~ factor(income) - 1, family=binomial(link="logit"), data = pres_1992) summary(fit_2) pred_df <- data.frame(income = 1:5) preds <- predict(fit_2, newdata = pred_df, type = "response", se = TRUE) pred_df <- cbind(pred_df, preds) ggplot(pres_1992, aes(income, vote)) + geom_jitter(position = position_jitter(height = 0.1)) + geom_pointrange( aes(y = fit, ymin = fit - 2*se.fit, ymax = fit + 2*se.fit), colour = "red", data = pred_df) + ylab("P(support Bush)")