### Soc 6708 ### Advanced Statistics ### Robert Andersen, University of Toronto ### Thursday, May 31, 2012 ############################################### #Influential case example #Measured weight and height, Davis 1990 ############################################### library(car) data(Davis) attach(Davis) davis.model.1<-lm(repwt~weight) plot(height, weight, main="Davis data") Model1<-lm(weight~height) identify(height, weight, row.names(Davis)) #observation 12 returned as outlier abline(Model1, lty=1, col=1, lwd=3) Model2<-update(Model1, subset=-12) abline(Model2, lty=2, col=2, lwd=3) legend(locator(1), lty=1:2, col=1:2, lwd=3, legend=c('All cases', 'Outlier excluded')) library(xtable)#Prints model summaries in LaTeX code print(xtable(Model1)) print(xtable(Model2)) detach(Davis) #Large data example with contrived data #creating a variable with 1000 random observations x<-c(rnorm(1000,mean=4,sd=1)) #adding one case with a "miscode" of 55 that "should be" 5: x1<-c(x,55) #Y is perfectly correlated with x1 (omitting miscode): y<-c(x,5) range(x1); range (y) plot(x1,y) abline(lm(y~x1)) summary(lm(y~x1)) ##Effect Display for Jasso and Kahn Models wife.age<-matrix(c(20,30,40,50,60,70)) B.Jasso<-matrix(c(-.72, 27.61, -6.43, -1.5, -3.71, -.56, .37, -1.28)) B.mod1<-matrix(c(-.67, 13.56, 7.87, -1.56, -3.74, -.68, .23, -1.10)) B.le2<-matrix(c(-3.06, 29.49, 57.89, -1.51, -2.88, -2.91, .86, -4.11)) B.gr2<-matrix(c(-.08, -1.62, -5.23, 1.29, -3.95, -.55, .02, -.38)) x20j<-B.Jasso*(cbind(c(73.5, log(20), log(34), log(12), .06, .52, .51, .955))) x30j<-B.Jasso*(cbind(c(73.5, log(30), log(34), log(12), .06, .52, .51, .955))) x40j<-B.Jasso*(cbind(c(73.5, log(40), log(34), log(12), .06, .52, .51, .955))) x50j<-B.Jasso*(cbind(c(73.5, log(50), log(34), log(12), .06, .52, .51, .955))) x60j<-B.Jasso*(cbind(c(73.5, log(60), log(34), log(12), .06, .52, .51, .955))) x70j<-B.Jasso*(cbind(c(73.5, log(70), log(34), log(12), .06, .52, .51, .955))) Jasso.fitted<-c(sum(x20j), sum(x30j), sum(x40j), sum(x50j), sum(x60j), sum(x70j)) x20m1<-B.mod1*(cbind(c(73.5, log(20), log(34), log(12), .06, .52, .51, .955))) x30m1<-B.mod1*(cbind(c(73.5, log(30), log(34), log(12), .06, .52, .51, .955))) x40m1<-B.mod1*(cbind(c(73.5, log(40), log(34), log(12), .06, .52, .51, .955))) x50m1<-B.mod1*(cbind(c(73.5, log(50), log(34), log(12), .06, .52, .51, .955))) x60m1<-B.mod1*(cbind(c(73.5, log(60), log(34), log(12), .06, .52, .51, .955))) x70m1<-B.mod1*(cbind(c(73.5, log(70), log(34), log(12), .06, .52, .51, .955))) Mod1.fitted<-c(sum(x20m1), sum(x30m1), sum(x40m1), sum(x50m1), sum(x60m1), sum(x70m1)) newdata<-data.frame(cbind(wife.age, Jasso.fitted, Mod1.fitted)) attach(newdata) plot(V1, Jasso.fitted, type="n", ylim=c(0, 40), main="Effect Display for Marital Coital Frequency Models", xlab="Age of Wife", ylab="Fitted Frequency") lines(spline(V1, Jasso.fitted), lty=1, col=1, lwd=3) lines(spline(V1, Mod1.fitted), lty=2, col=2, lwd=3) legend(locator(1), lty=1:2, col=1:2, lwd=3, legend=c('Original Jasso model', 'Model with outliers removed')) detach(newdata) ####################### #WORLD VALUES SURVEY #Nondemocracies only ######################## Weakliem2<-read.table('C:/Teaching/ICPSR/data/Weakliem2.txt', header=T) summary(Weakliem2) attach(Weakliem2) Weakliem.model1<-lm(secpay~gini+gdp) summary(Weakliem.model1) library(xtable)#Print LaTeX code for the output table print(xtable(Weakliem.model1)) plot(gini, secpay, main='Nondemocratic countries', xlab='Gini', ylab='Attitudes towards inequality(mean)') identify(gini,secpay, row.names(Weakliem2)) #"identify" returns cases 7, 26 as outliers abline(Weakliem.model1, lwd=2, lty=1, col=1) Weakliem.model2<-update(Weakliem.model1, subset=-c(7,26)) abline(Weakliem.model2, lwd=2, lty=2, col=2) legend(locator(1), lty=1:2, col=1:2, legend=c('All cases', 'Outliers excluded')) summary(Weakliem.model2) print(xtable(Weakliem.model2)) #qqplots allow us to detect outliers qq.plot(Weakliem.model1, simulate=T, lables=row.names(Weakliem2)) #several observations are outside #the 95% Confidence envelope outlier.test(Weakliem.model1) #returns a Bonferroni t-test for largest absolute #studentized residual #index plot of hat values allows us # to assess leverage plot(hatvalues(Weakliem.model1), main="Hat Values for Inequality model") abline(h=c(2,3)*3/length(secpay), lty=2) #"h" signifies horizontal line #the average hat value=(k+1)/n. #A rule of thumb is that 2*average hat value #for large samples, and 3*average hat value #for small samples should be examined identify(1:length(secpay), hatvalues(Weakliem.model1), row.names(Weakliem2)) #Brazil has the largest hat-value-- #much larger than the Czech Republic or Slovakia ######################################### # # Assessing Influence ######################################### #dfbetas assess influence Weakliem.dfbetas<-dfbetas(Weakliem.model1) plot(Weakliem.dfbetas[,c(2,3)], main="DFBETAs for the Gini and GDP coefficients") #c(2,3) specifies the coefficients of interest identify(Weakliem.dfbetas[,2], Weakliem.dfbetas[,3], row.names(Weakliem2)) #Cook's D assesses plot(cookd(Weakliem.model1)) abline(h=4/24, lty=2) identify(1:26, cookd(Weakliem.model1), row.names(Weakliem2)) #Slovakia and Czech highest Cook's D, #But Slovenia also high #Influence OR Bubble-plot displaying the studentized residuals, #hat-values, and Cook's D plot(hatvalues(Weakliem.model1), rstudent(Weakliem.model1), ylim=c(-3,6),type='n', main="Influence Plot for Inequality data", xlab="Hat Values", ylab="Studentized Residuals") cook<-sqrt(cookd(Weakliem.model1)) points(hatvalues(Weakliem.model1), rstudent(Weakliem.model1), cex=10*cook/max(cook)) abline(v=3/25, lty=2)#line for hatvalues abline(h=c(-2,0,2), lty=2) #lines for studentized residuals identify(hatvalues(Weakliem.model1), rstudent(Weakliem.model1), row.names(Weakliem2)) ##Influence plots can also be made within Rcmdr: library(Rcmdr) R-cmdr> data(Florida, package="car") R-cmdr> attach(Florida) R-cmdr> RegModel.1 <- lm(GORE~BUSH, data=Florida) #after fitting model, use the menus as follows: #models-->graphs-->influence... R-cmdr> influence.plot(RegModel.1) #Added variable plots best for assessing joint influence av.plots(Weakliem.model1) #again cases 7 and 26 stand out #Model with Czech and Slov. deleted qq.plot(Weakliem.model2, simulate=T, lables=row.names(Weakliem2), main="QQ plot for model exluding outliers") #There are now no observations outside #the 95% Confidence envelope outlier.test(Weakliem.model2) #bubble-plot for model with #Czech Republic and Slovakia deleted remove <-which.names(c("CzechRepublic","Slovakia"), Weakliem2) Weakliem.model2 <- update(Weakliem.model1, subset=-remove) plot(hatvalues(Weakliem.model2), rstudent(Weakliem.model2), ylim=c(-3,6),type='n', main="Influence Plot minus outliers", xlab="Hat Values", ylab="Studentized Residuals") cook<-sqrt(cookd(Weakliem.model2)) points(hatvalues(Weakliem.model2), rstudent(Weakliem.model2), cex=10*cook/max(cook)) abline(v=3/25, lty=2)#line for hatvalues abline(h=c(-2,0,2), lty=2) #lines for studentized residuals identify(hatvalues(Weakliem.model2), rstudent(Weakliem.model2), row.names(Weakliem2)) #Added variable plots (partial regression plot) library(car) av.plots(Weakliem.model1) #This allows you to choose the #variables interactively leverage.plot(Weakliem.model1, "gini") #This method you choose the #variable of interest detach(Weakliem2)