### Soc 6708 ### Advanced Statistics ### Robert Andersen, University of Toronto ### Thursday, May 24, 2012 ######################################## # Logit and Probit Models II # Multinomial, Ordered, and GLMs ######################################## library(car)#Data are here data(Womenlf) attach(Womenlf) #Creating a two category factor working <- recode(partic, " 'not.work' = 'no'; else = 'yes' ") working[1:10] # binary logit model #binary logit models #can be fit with glm in base pacakage mod.working <- glm(working ~ hincome*children + region, family=binomial) summary(mod.working) Anova(mod.working) mod.working.1 <- glm(working ~ hincome + children, family=binomial) summary(mod.working.1) predictors <- data.frame(expand.grid(list(hincome=1:45, children=c('absent', 'present')))) p.work <- predict(mod.working.1, predictors, type='response') plot(c(1,45), c(0,1), type='n', xlab="Husband's Income", ylab='Fitted Probability(Working)') lines(1:45, p.work[1:45], lty=1, lwd=3) text(locator(1), "Children Absent") lines(1:45, p.work[46:90], lty=2, lwd=3) text(locator(1), "Children Present") # diagnostics #Looking for outliers plot(hatvalues(mod.working.1), rstudent(mod.working.1)) abline(v=(3*mean(hatvalues(mod.working.1))), lty=2) abline(v=(2*mean(hatvalues(mod.working.1))), lty=2) #Index plot of Cook's D #Assesses influence plot(cookd(mod.working.1)) identify(1:length(partic), cookd(mod.working.1)) #looking for joint influence #partial residual plot cr.plots(mod.working.1) #Assessing for nonlinearity #if my model had more than one quantitative #predictor, cr.plots and av.plots #could also be checked in eaxctly the #same way as a linear model #partial regression plot av.plots(mod.working.1) # Multinomial logit model #creating an ordered variable for the response participation <- ordered(partic, levels=c('not.work', 'parttime', 'fulltime')) #nnet and MASS libraries needed #for multinomial logit models library(nnet) library(MASS) multinomial.model <- multinom(participation ~ hincome + children) summary(multinomial.model, cor=F, Wald=T) #Creating effect display p.fit <- predict(multinomial.model, predictors, type='probs') data.frame(predictors, p.fit)[1:5,] par(mfrow=c(1,2)) plot(c(1,45), c(0,1), type='n', xlab="Husband's Income", ylab='Fitted Probability', main="Children Absent") lines(1:45, p.fit[1:45, 'not.work'], lty=1, lwd=3) lines(1:45, p.fit[1:45, 'parttime'], lty=2, lwd=3) lines(1:45, p.fit[1:45, 'fulltime'], lty=3, lwd=3) legend(locator(1), lty=1:3, lwd=3, legend=c('not working', 'part-time', 'full-time')) plot(c(1,45), c(0,1), type='n', xlab="Husband's Income", ylab='Fitted Probability', main="Children Present") lines(1:45, p.fit[46:90, 'not.work'], lty=1, lwd=3) lines(1:45, p.fit[46:90, 'parttime'], lty=2, lwd=3) lines(1:45, p.fit[46:90, 'fulltime'], lty=3, lwd=3) # Ordered logit (proportional odds model) library(MASS) ordered.model <- polr(participation ~ hincome + children) summary(ordered.model) exp(ordered.model$coefficients) #Creating effect display p.fit <- predict(ordered.model, predictors, type='probs') plot(c(1,45), c(0,1), type='n', xlab="Husband's Income", ylab='Fitted Probability', main="Children Absent") lines(1:45, p.fit[1:45, 'not.work'], lty=1, lwd=3) lines(1:45, p.fit[1:45, 'parttime'], lty=2, lwd=3) lines(1:45, p.fit[1:45, 'fulltime'], lty=3, lwd=3) legend(locator(1), lty=1:3, lwd=3, legend=c('not working', 'part-time', 'full-time')) plot(c(1,45), c(0,1), type='n', xlab="Husband's Income", ylab='Fitted Probability', main="Children Present") lines(1:45, p.fit[46:90, 'not.work'], lty=1, lwd=3) lines(1:45, p.fit[46:90, 'parttime'], lty=2, lwd=3) lines(1:45, p.fit[46:90, 'fulltime'], lty=3, lwd=3) #Assessing whether the proportional odds assumption has been met #with an analysis of deviance for the #multinomial and ordinal models #Creating a function for the Chi-square test: parlines<-function(mod1, mod2){ dev2<-deviance(mod1) dev<-deviance(mod2) d<-abs(mod1$edf-mod2$edf) ch2<-abs(dev2-dev) p<-1-pchisq(ch2, d) output<-round(cbind(ch2, d, p),5) dimnames(output)[[2]]<-c("Chi-square", "df", "p-value") output } parlines(multinomial.model, ordered.model) #Test shows a statistically significant difference-- #parallel lines assumption doesn't hold #The AIC is also much smaller for the multinomial logit model extractAIC(multinomial.model) extractAIC(ordered.model) detach(Womenlf) ################## ######GLMs ################## ###################################### # Exploring how GLMs differ from # simply regressiosn with # transformations of Y ###################################### ###Comparing E[log(y)] with log(E[y]) library(car) data(Prestige) attach(Prestige) mean(log(income));log(mean(income)) ##Comparing regression models mod1<-glm(log(income)~education) mod2<-glm(income~education, family=Gamma(link="log")) deviance(mod1); deviance(mod2) ##Comparing the fitted values plot(exp(fitted(mod1)), fitted(mod2), xlab="OLS with log transformation", ylab="Gamma model with log link", xlim=c(0,15000), ylim=c(0,15000)) abline(0,1) ################################ #Poisson model for count data ################################ library(foreign) Assoc<-read.spss("c:/teaching/ICPSR/data/assoc.sav", use.value.labels=TRUE, to.data.frame=TRUE) summary(Assoc) attach(Assoc) #looking at the distribution hist(ASSOC, nclass=15) Mod.poiss<-glm(ASSOC~SEX+AGE+SES+COUNTRY, family=poisson) summary(Mod.poiss) exp(Mod.poiss$coefficients) ### #A model with interaction terms ### Mod.poisson<-glm(ASSOC~SEX+AGE+SES*COUNTRY, family=poisson) summary(Mod.poisson) library(car) Anova(Mod.poisson) library(effects) effect('SES*COUNTRY',Mod.poisson) effect('COUNTRY',Mod.poisson) logassoc<-log(ASSOC+1) Mod.lm<-glm(logassoc~SEX+AGE+SES*COUNTRY) hist(logassoc) summary(Mod.lm) effect('COUNTRY',Mod.lm)