### Soc 6708 ### Advanced Statistics ### Robert Andersen, University of Toronto ### Wednesday, June 7, 2012 ######################################## ## Robust and Resistant Regression ######################################## library(car) library(MASS) library(lqs) Nondemo <- read.table('C:/data/Weakliem2.txt', header=T) summary(Nondemo) attach(Nondemo) #scatterplot with case names plot(gini, secpay, pch=16, xlim=c(20,63), xlab="Gini Coefficient", ylab="Attitudes towards inequality") chw <- par()$cxy[1] #adds space from data point text(gini+chw, secpay, labels=row.names(Nondemo), adj=0) #adj=0 adjusts the text on the x axis abline(lm(secpay~gini)) Nondemo.ls<-lm(secpay~gini) summary(Nondemo.ls) #diagnostics for the distribution of the residuals qq.plot(Nondemo.ls, simulate=T, main="Studentized residuals for Inequality model") row.names(Nondemo)[c(7, 16, 24, 26)] outlier.test(Nondemo.ls) #Bubble-plot displaying the studentized residuals, #hat-values, and Cook's D plot(hatvalues(Nondemo.ls), rstudent(Nondemo.ls), ylim=range(-1,6),type='n', main="Influence plot") cook<-sqrt(cookd(Nondemo.ls)) points(hatvalues(Nondemo.ls), rstudent(Nondemo.ls), cex=20*cook/max(cook)) abline(v=3/length(Nondemo), lty=2) #line for hatvalues abline(h=c(-2,0,2), lty=2) #lines for studentized residuals identify(hatvalues(Nondemo.ls), rstudent(Nondemo.ls), row.names(Nondemo)) Nondemo.ls2<-update(Nondemo.ls, subset=-c(7,26)) summary(Nondemo.ls) summary(Nondemo.ls2) #Deleting Czech and Slovakia #improves fit of model and #changes direction of effect for gini ################################################ # # Robust Regression Methods # ################################################ #Fitting Least Absolute Value Regression library(quantreg) Nondemo.lav<-rq(secpay~gini,.5) summary(Nondemo.lav) #Nearly same results as OLS deleting influential cases plot(residuals(Nondemo.lav)) identify(1:26, Nondemo.lav$residuals, rownames(Nondemo)) #Influential cases have largest residual ################################ #M-Estimation with Huber weight ################################ library(MASS) Nondemo.huber<-rlm(secpay~gini) summary(Nondemo.huber) #Again similar findings to OLS without influential cases plot(Nondemo.huber$residuals) identify(1:26, Nondemo.huber$residuals, rownames(Nondemo)) #Largest residuals indicate most influential cases #in regular OLS plot(Nondemo.huber$w, ylab="Huber Weight") identify(1:26, Nondemo.huber$w, rownames(Nondemo)) #As we would expect, Czech and Slovakia #are given the least weight ################################### #M-Estimation with bisquare weight ################################### Nondemo.bisq<-rlm(secpay~gini, data=Nondemo, psi=psi.bisquare) summary(Nondemo.bisq) #Again similar to Huber and LAV plot(Nondemo.bisq$residuals) identify(1:26, Nondemo.bisq$residuals, rownames(Nondemo)) plot(Nondemo.bisq$w, ylab="Bisquare Weight") identify(1:26, Nondemo.bisq$w, rownames(Nondemo)) #Again Czech Rep and Slovakia have lowest weight #Plot comparing Bisquare and Huber weights plot(Nondemo.bisq$w, Nondemo.huber$w, xlim=c(0,1), ylim=c(0,1), xlab="Bisquare weight", ylab="Huber weight", main="Comparison of weights") abline(0,1) identify(1:26, Nondemo.bisq$w,Nondemo.huber$w, row.names(Nondemo)) ########################################## # # Resistant Regression Methods # (Bounded Influence Methods) # ########################################## #Fitting Least Trimmed Squares Regression library(lqs) Nondemo.lts<-ltsreg(secpay~gini, data=Nondemo) Nondemo.lts #No summary function for ltsreg or lmsreg #Fitting least Median Squares Regression Nondemo.lms<-lmsreg(secpay~gini, data=Nondemo) Nondemo.lms #Fitting S-estimation regression Nondemo.S<-lmsreg(secpay~gini, method="S", data=Nondemo) Nondemo.S #Plot of the residuals shows influential cases split.screen(f=c(1,2)) screen(1) plot(residuals(Nondemo.lts)) identify(1:26, Nondemo.lts$residuals, rownames(Nondemo)) #Plot of the residuals shows influential cases screen(2) plot(residuals(Nondemo.lms)) identify(1:26, Nondemo.lms$residuals, rownames(Nondemo)) close.screen(all=TRUE) plot(Nondemo.lts$residuals, Nondemo.lms$residuals) abline(0,1) ####################################### ## MM-estimation (Huber weights) ####################################### library(MASS) Nondemo.MM<-rlm(secpay~gini, method="MM") summary(Nondemo.MM) #A plot comparing the fit of the different #models fit above plot(gini,secpay, cex=1.5) identify(gini, secpay, row.names(Nondemo)) abline(Nondemo.ls, lwd=2, col=1, lty=1) abline(Nondemo.lts, lwd=2, col=2, lty=2) abline(Nondemo.bisq, lwd=2, col=3, lty=3) abline(Nondemo.huber, lwd=2, col=4, lty=4) abline(Nondemo.lav, lwd=2, col=5, lty=5) abline(Nondemo.MM, lwd=2, col=6, lty=6) legend(locator(1), lty=1:6, lwd=2, col=1:6, legend=c('OLS', 'LTS', 'M-Bisquare', 'M-Huber', 'LAV', 'MM-Huber')) title("Comparison of Different Models") ###################################### ## RR-plots ###################################### par(mfrow=c(2,2)) plot( residuals(Nondemo.huber), residuals(Nondemo.ls),xlim=c(-.2,.6), ylim=c(-.2, .6), ylab="OLS residual", xlab="Robust residual", main="M-estimation (Huber)", cex=1.2) abline(0,1, lty=2) abline(lm(residuals(Nondemo.ls)~residuals(Nondemo.huber))) identify(residuals(Nondemo.huber),residuals(Nondemo.ls), row.names(Nondemo)) plot( residuals(Nondemo.bisq), residuals(Nondemo.ls),xlim=c(-.2,.6), ylim=c(-.2, .6), ylab="OLS residual", xlab="Robust residual", main="M-estimation (bisquare)", cex=1.2) abline(0,1, lty=2) abline(lm(residuals(Nondemo.ls)~residuals(Nondemo.bisq))) identify(residuals(Nondemo.bisq),residuals(Nondemo.ls), row.names(Nondemo)) plot( residuals(Nondemo.lav), residuals(Nondemo.ls),xlim=c(-.2,.6), ylim=c(-.2, .6), ylab="OLS residual", xlab="Robust residual", main="LAV", cex=1.2) abline(0,1, lty=2) abline(lm(residuals(Nondemo.ls)~residuals(Nondemo.lav))) identify(residuals(Nondemo.lav),residuals(Nondemo.ls), row.names(Nondemo)) plot( residuals(Nondemo.MM), residuals(Nondemo.ls),xlim=c(-.2,.6), ylim=c(-.2, .6), ylab="OLS residual", xlab="Robust residual", main="MM-estimation (Huber)", cex=1.2) abline(0,1, lty=2) abline(lm(residuals(Nondemo.ls)~residuals(Nondemo.MM))) identify(residuals(Nondemo.MM),residuals(Nondemo.ls), row.names(Nondemo)) ################################################# # Robust Generalized linear models ################################################# ####################################################### #Poisson regression model ####################################################### #Using Canadian data from Equality, Security, and Community (ESC) survey of 2000, #concentrating only on Quebec detach(Nondemo) library(foreign) MPSA<-read.spss("C:/Research/civic/MPSA/MPSA.sav", use.value.labels=TRUE, to.data.frame=TRUE) MPSA$Region2<-recode(MPSA$REGION,"'Quebec'='Quebec'; 'Ontario'='Ontario'; 'Atlantic'='Atlantic'; else='West'") Quebec<-na.omit(subset(MPSA, Region2=="Quebec", drop=TRUE)) ###Poisson model mod1<-glm(TOTORG~MEN+CANBORN+LANG, data=Quebec, family=poisson) summary(mod1) #Diagnostics par(mfrow=c(1,2)) plot(cookd(mod1), main="Index plot of Cook's D", ylab="Cook's D") identify(1:length(Quebec$TOTORG), cookd(mod1), row.names(Quebec)) plot(dfbetas(mod1)[,5], main="DFBETAs for 'Canadian born' coefficient", ylab="DFBETAs") identify(1:length(Quebec$TOTORG), dfbetas(mod1)[,5], row.names(Quebec)) ##Robust GLM Poisson regression library(robustbase) mod2<-glmrob(TOTORG~MEN+CANBORN+LANG, data=Quebec, family=poisson) summary(mod1) summary(mod2) exp(mod1$coefficients) exp(mod2$coefficients) ########################### ## BOOTSTRAPPING ## ########################### ############################### #random x resampling of #robust regression coefficients ############################### library(MASS) library(boot) Nondemo <- read.table('C:/Teaching/ICPSR/data/Weakliem2.txt', header=T) attach(Nondemo) #scatterplot with case names plot(gini, secpay, pch=16, xlim=c(20,63), xlab="Gini Coefficient", ylab="Attitudes towards inequality") chw <- par()$cxy[1]#adds space from data point chh <- par()$cxy[2] text(gini+chw, secpay,labels=row.names(Nondemo), adj=0) abline(lm(secpay~gini)) Weakliem.ols<-lm(secpay~gini) summary(Weakliem.ols) #Bubble-plot displaying the studentized residuals, #hat-values, and Cook's D library(car) plot(hatvalues(Weakliem.ols), rstudent(Weakliem.ols), ylim=range(-1,15),type='n') cook<-sqrt(cookd(Weakliem.ols)) points(hatvalues(Weakliem.ols), rstudent(Weakliem.ols), cex=20*cook/max(cook)) abline(v=3/27, lty=2)#line for hatvalues abline(h=c(-2,0,2), lty=2) #lines for studentized residuals identify(hatvalues(Weakliem.ols), rstudent(Weakliem.ols), row.names(Nondemo)) Weakliem.huber<-rlm(secpay~gini) summary(Weakliem.huber) #Ceating function to get the #Huber M-estimates in order #to bootstrap them boot.huber<-function(data, indices, maxit=20){ data<-data[indices,] #selects the observations in bootstrap sample Weakliem.huber<-rlm(secpay~gini, data=data, maxit=maxit) coefficients(Weakliem.huber) #retuns coefficients vector } #getting bootstrap estimates Weakliem.boot<-boot(data=Nondemo, statistic=boot.huber, R=1500, maxit=1000) Weakliem.boot #Plotting bootstrap sample for gini# #gives histogram and qq-plot plot(Weakliem.boot, index=2) #Confidence intervals for the #bootstrap coefficient (gini) boot.ci(Weakliem.boot, index=2, type=c("norm", "perc", "bca")) #Jackknife-after-bootstrap jack.after.boot(Weakliem.boot, index=2) ################################### #Fixed-x Resampling #of robust regression coefficients #################################### boot.huber.fixed<-function(data, indices, maxit=20){ fit<-fitted(Weakliem.huber) e<-residuals(Weakliem.huber) X<-model.matrix(Weakliem.huber) y<-fit+e[indices] mod<-rlm(y~X-1, maxit=maxit) #y~X-1 omits the intercept-- #the line goes through the origin coefficients(mod) } Weakliem.huber.fixed<-boot(Nondemo, boot.huber.fixed, R=1500, maxit=100) Weakliem.huber.fixed plot(Weakliem.huber.fixed, index=2) jack.after.boot(Weakliem.huber.fixed, index=2) ###Crossvalidation using the Design package library(car) library(Design) data(Prestige) mod1<-ols(prestige~income+type+education, x=TRUE, y=TRUE, data=Prestige) validate(mod1, method="crossvalidation", pr=T) #The default number of samples above is 40--this "overfits" the model, #thus leading to a "negative" test. This means fewer samples #are needed. #Below I divide the sample into only 4 subsamples validate(mod1, method="crossvalidation", B=4) #pr=T displays all the test samples validate(mod1, method="crossvalidation", B=4, pr=T)