### Soc 6708 ### Advanced Statistics ### Robert Andersen, University of Toronto ######################################## ## Generalized Linear Models ######################################## library(MASS) library(mgcv)#package for GAM library(car)#to get the data data(Prestige) #Removing missing cases Prestige2<-na.omit(Prestige) attach(Prestige2) ###################################### #Generalized Additive Model example ##################################### Prestige.gam<-gam(prestige~s(income)+s(education)) Prestige.gam #creating a perspective plot manually inc<-seq(min(income), max(income), len=25) ed<-seq(min(education), max(education), len=25) #dividing the range of income and education #into 25 values each data<-expand.grid(income=inc, education=ed) fit.prestige<-matrix(predict(Prestige.gam, data), 25, 25) persp(inc, ed, fit.prestige, theta=45, phi=30, ticktype='detailed', xlab='Income', ylab='Education', zlab='Prestige', expand=2/3, shade=.5) #perspective plots in mgcv package: vis.gam(Prestige.gam, theta=45) #individual plots for each #X variable #pages=1 puts them on the same page plot(Prestige.gam, pages=1) #Alternatively, putting the plots side-by-side split.screen(figs=c(1,2)) screen(1) plot(Prestige.gam, select=1) screen(2) plot(Prestige.gam, select=2) close.screen(all=TRUE) summary(Prestige.gam) ###Interpreting the effects par(mfrow=c(1,2)) #fitted value at income=10000 plot(Prestige.gam, select=1, main="Income=10000") segments(0,6,10000,6, lty=2, lwd=2, col="red") segments(10000, -20, 10000,6, lty=2, lwd=2, col="red") points(10000,6, cex=1.5, pch=19) text(6500,8.5, "(10 000,6)") #fitted value at education=10 plot(Prestige.gam, select=2, main="Education=10") segments(10,-20,10,-5, lty=2, lwd=2, col="red") segments(0, -5,10,-5, lty=2, lwd=2, col="red") points(10,-5, cex=1.5, pch=19) text(9.2,-2.5, "(10,-5)") #fitted value for income=10000 and education=10 is: mean(prestige)+6-5 #Fitting a regular linear model using gam Prestige.ols<-gam(prestige~income+education) deviance(Prestige.ols) #Could now test for linearity using an analysis #of deviance anova(Prestige.ols, Prestige.gam) #Diagnostics plots gam.check(Prestige.gam) ###Interaction between two smooths gam(prestige~s(income, education)) #Semi-parametric model with interaction #First must specify type.bc<-as.numeric(type=="bc") type.prof<-as.numeric(type=="prof") type.wc<-as.numeric(type=="wc") inter.gam<-gam(prestige~type+s(income,by=type.bc) +s(income,by=type.prof)+s(income,by=type.wc)) split.screen(figs=c(1,3)) screen(1) plot.gam(inter.gam, select=1) title("Blue Collar") screen(2) plot.gam(inter.gam, select=2) title("Professional") screen(3) plot.gam(inter.gam, select=3) title("White Collar") close.screen(all=TRUE) summary(inter.gam) anova(inter.gam) inter.glm<-glm(prestige~type+income*type) detach(Prestige2) ################################ # Example #2 ################################ Weakliem <- read.table('C:/Teaching/ICPSR/data/Weakliem.txt', header=T) summary(Weakliem) attach(Weakliem) #Multiple nonparametric regression Model.lowess<-loess(secpay~gini+gdp, span=.8, degree=1) gini2<-seq(min(gini), max(gini), len=15) gdp2<-seq(min(gdp), max(gdp), len=15) #dividing the range of income and education #into 25 values each data<-expand.grid(gini=gini2, gdp=gdp2) secpay.fitted<-matrix(predict(Model.lowess, data), 15, 15) persp(gini2, gdp2, secpay.fitted, theta=35, ph=20, ticktype='detailed', xlab='Gini Coefficient', ylab='GDP', zlab='Attitudes') #Generalized Additive Model library(mgcv) Model.gam<-gam(secpay~s(gini)+s(gdp)) gini2<-seq(min(gini), max(gini), len=15) gdp2<-seq(min(gdp), max(gdp), len=15) #dividing the range of income and education #into 25 values each data<-expand.grid(gini=gini2, gdp=gdp2) secpay.fitted2<-matrix(predict(Model.gam, data), 15, 15) persp(gini2, gdp2, secpay.fitted2, theta=35, ph=20, ticktype='detailed', xlab='Gini Coefficient', ylab='GDP', zlab='Attitudes') #Gam's with only two predictors can also be #plotted easily using the "vis.gam" function #in the mgcv package vis.gam(Model.gam) #putting perspective plots side-by-side split.screen(figs=c(1,2)) screen(1) persp(gini2, gdp2, secpay.fitted2, theta=25, ph=20, ticktype='detailed', xlab='Gini Coefficient', ylab='GDP', zlab='Attitudes', main='GAM', expand=2/3, shade=.5) screen(2) persp(gini2, gdp2, secpay.fitted, theta=25, ph=20, ticktype='detailed', xlab='Gini Coefficient', ylab='GDP', zlab='Attitudes', main='Lowess model',expand=2/3, shade=.5) close.screen(all=TRUE) #Plotting the additive GAM functions split.screen(figs=c(1,2)) screen(1) plot(Model.gam, select=1) screen(2) plot(Model.gam, select=2) close.screen(all=TRUE) #Comparing the models using "anova.gam" function Model.lm<-gam(secpay~gini+gdp) anova.gam(Model.lm, Model.gam) vis.gam(Model.gam) #Making Democrcacy a Factor #For interaction model democratic<-factor(democrat) demo<-as.numeric(democrat==1) nondem<-as.numeric(democrat==0) #Semi-parametric interaction model #Model below specifies an interaction #between democratic and gini demo.gam<-gam(secpay~democratic+s(gini,by=demo) +s(gini,by=nondem)) summary(demo.gam) #Plotting the additive GAM functions split.screen(figs=c(1,2)) screen(1) plot(demo.gam, select=1) title("Democracies") screen(2) plot(demo.gam, select=2) title("Non-democracies") close.screen(all=TRUE) demo.lm<-gam(secpay~democratic*gini) anova.gam(demo.lm, demo.gam) ####################################### # Generalized Aditive Mixed Models ####################################### library(foreign) Context<-read.spss("c:/teaching/ICPSR/data/context.sav", use.value.labels=TRUE, to.data.frame=TRUE) attach(Context) summary(Context) #Centering income to make estimates #more stable interpretation of intercept more sensible-- #they become the group means Context$INCOME2<-Context$INCOME-mean(Context$INCOME) detach(Context) attach(Context) library(nlme) library(mgcv) Mod1<-gamm(LRSCALE~s(INCOME2), random=list(PANO=~1), data=Context) ###Returns the variance components: summary(Mod1$lme) ###Returns the fixed effects, including the smooths summary(Mod1$gam) ###Plotting the curve plot(Mod1$gam, pages=1) ##Some Diagnostics plot(Mod1$lme) qqnorm(Mod1$lme) ##Testing for linearity Mod2<-gamm(LRSCALE~INCOME2, random=list(PANO=~1), data=Context) anova(Mod1$lme, Mod2$lme) #####Problem of mising data ##Creating 100 random observations x<-rnorm(80, mean=20, sd=2) x2<-rnorm(20, mean=20, sd=2)#"missing" observations xall<-c(x,x2)#ALL observations #Perfect linear relationship between y and x yall<-xall plot(xall, yall, main="No missing data") abline(lm(yall~xall)) ##Replacing "missing at random" x-values with mean imputation x3<-rep(mean(x), 20) xmean<-c(x,x3) sd(xmean);sd(xall) plot(density(xmean), main="Density of x") lines(density(xall), lty=2, col=2) legend(locator(1), lty=1:2, col=1:2, legend=c('All cases', 'Mean imputation')) ##Relationship between y (all) and x (mean imputation) plot(xmean, yall, main="20% Mean imputation (x)") abline(lm(yall~xmean)) ##Replacing "missing at random" y-values with mean imputation ymean<-c(x,x3) plot(xall, ymean, main="20% Mean imputation (y)") abline(lm(ymean~xall)) ##Comparing numerical output summary(lm(yall~xmean)) summary(lm(ymean~xall))