### Soc 6708 ### Advanced Statistics ### Robert Andersen, University of Toronto ######################################## ## Nonlinearity ######################################## library(car) library(sm) library(xtable) ############################################### #WVS for both #democracies and non-democracies ############################################### Weakliem<-read.table('C:/Teaching/ICPSR/data/Weakliem.txt', header=T) summary(Weakliem) attach(Weakliem) Weakliem.model<-lm(secpay~gini+democrat) summary(Weakliem.model) #checking the distribution of the #residuals sm.density(rstudent(Weakliem.model), model="normal") #There is a heavy positive tail #We already know the there are some influential cases, #so we shall look for them first plot(hatvalues(Weakliem.model), rstudent(Weakliem.model), type='n') cook<-sqrt(cookd(Weakliem.model)) points(hatvalues(Weakliem.model), rstudent(Weakliem.model), cex=10*cook/max(cook)) abline(v=4/49, lty=2) abline(h=c(-2,0,2), lty=) identify(hatvalues(Weakliem.model), rstudent(Weakliem.model), row.names(Weakliem)) #refitting model without the Czech Republic #and Slovakia Weakliem.model2<-update(Weakliem.model, subset=-c(25,49)) #now checking the distribution of the residuals #for the new model split.screen(figs=c(2,2)) screen(1) sm.density(rstudent(Weakliem.model), model="normal") screen(2) sm.density(rstudent(Weakliem.model2), model="normal") screen(3) qq.plot(Weakliem.model2, simulate=T) close.screen(all=TRUE) #Checking for nonconstant error variance #for the weakliem data plot(fitted.values(Weakliem.model),rstudent(Weakliem.model), main="Plot of studentized residuals versus fitted values") abline(h=0, lty=2) spread.level.plot(Weakliem.model) #suggests that we missed an important #nonlinear pattern in the data #so we add the intercation with democracy and remove teh outliers democratic<-factor(democrat) democratic<-recode(democratic, "c(1)='Democratic'; c(0)='Non-Democratic'") summary(democratic) infl.outliers<-which.names(c('Slovakia', 'CzechRepublic'), Weakliem) Weakliem.mod2<-lm(secpay~gini*democrat+gdp, subset=-infl.outliers) summary(Weakliem.mod2) Anova(Weakliem.mod2) #Better model fit plot(fitted.values(Weakliem.mod2),rstudent(Weakliem.mod2), main="Plot of studentized residuals versus fitted values") abline(h=0, lty=2) #Nonconstant error variance no longer #a serious peoblem detach(Weakliem) ######################################## #Ornstein's interlocking directorates data #used to show how to find the variance-stablizing #transformation ##commands below come close to reproducing #figures 12.1, 12.2 in Fox (1997) library(car) data(Ornstein) attach(Ornstein) Ornstein.model<-lm(interlocks~assets+sector+nation, data=Ornstein) summary(Ornstein.model) #density plot of the residuals sm.density(rstudent(Ornstein.model)) #quantile-comparsion plot of the residuals qq.plot(Ornstein.model, sim=TRUE) #Exploring for nonconstant error variance plot(fitted.values(Ornstein.model),rstudent(Ornstein.model), main="Plot of studentized residuals versus fitted values") abline(h=0, lty=2) spread.level.plot(Ornstein.model) ncv.test(Ornstein.model) ncv.test(Ornstein.model, ~ assets+sector+nation) #weighted least squares of Ornstein data Ornstein.wls<-lm(interlocks~assets+nation+sector, weights=1/assets) summary(Ornstein.wls) summary(Ornstein.model) print(xtable(Ornstein.wls)) print(xtable(Ornstein.model)) #Robust standard errors (White correction) #Function for White's Robust Standard Errors #relying on hccm function in car library(car) robust.se <- function( model) { s <- summary( model) wse <- sqrt( diag( hccm( model, type="hc0"))) t <- model\$coefficients/wse p <- 2*pnorm( -abs( t)) results <- cbind( model\$coefficients, wse, t, p) dimnames(results) <- dimnames( s\$coefficients) results } robust.se(Ornstein.model) #An alternative way to get robust standard errors #is to use the "ols" and "robcov" functions #in the "Design" package library(Design) #fit the OLS model Ornstein.ols <- ols(interlocks~assets+sector+nation, data=Ornstein, x=TRUE, y=TRUE) #summary of OLS model Ornstein.ols #summary of model with robust standard errors: robcov(Ornstein.ols) ############################ #Nonlinearity example using #Prestige data #Component-plus-residual plots #(partial-residual plots) ############################# detach(Ornstein) data(Prestige) attach(Prestige) Prestige.model<-lm(prestige~income+education+women) cr.plots(Prestige.model) #Discrete data test detach(Prestige) data(Vocab) attach(Vocab) summary(Vocab) Vocab.model<-lm(vocabulary~education) plot(education, vocabulary) lines(lowess(education, vocabulary)) summary(Vocab.model) #making education a factor #i.e., it will no be treated #as a set of dummy regressors education2<-as.factor(education) Vocab.model2<-lm(vocabulary~education2) #Since the linear model is nested #within the factor discrete model, #we can usa an F-test for the #difference anova(Vocab.model, Vocab.model2) detach(Vocab) # ML Methods ###################### # Box-Cox regression #################### library(MASS) data(Ornstein) attach(Ornstein) Ornstein.model<-lm((interlocks+1)~assets+sector+nation, data=Ornstein) boxcox(Ornstein.model) boxcox(Ornstein.model, lambda=seq(.1,.5, by=.01)) #adding one to interlocks because #some observations score 0--- #cannot transfom a variable that #takes on values of 0 Ornstein.model.BC <- update(Ornstein.model, ~ . +box.cox.var(interlocks + 1)) summary(Ornstein.model.BC) av.plots(Ornstein.model.BC, 'box.cox.var(interlocks + 1)') hccm(Ornstein.model) Var(Ornstein.model) ########################### # Box-Tidwell regression ########################### detach(Ornstein) attach(Prestige) #attempting find the required transformation #for X's in the Prestige data data(Prestige) box.tidwell(prestige~income+education, ~ poly(women,2)) #Plotting the added-variable plot #for the box.tidwell transformation mod.prestige.bt <- lm(prestige ~ income + education + women + I(women^2) + I(income*log(income)) + I(education*log(education))) summary(mod.prestige.bt) av.plots(mod.prestige.bt, labels=row.names(Prestige))