### Soc 6708 ### Advanced Statistics ### Robert Andersen, University of Toronto ### Wednesday, May 16, 2012 ###################### ## LINEAR MODELS ###################### ############################################ # 1. Fitting Linear Models ############################################ library(car) data(Prestige) attach(Prestige) # Fitting linear model mod1<-lm(prestige~income+education, data=Prestige) summary(mod1) ## model summary anova(mod1) ## Type I F-test Anova(mod1) ## Type II F-tests (compare to 'anova') ##From 'car' library # Adding categorical predictor 'type' of occupation mod2<-lm(prestige~income+education+type, data=Prestige) anova(mod1, mod2) ## Analysis of variance ## Check the dataset to learn why there is an error mod1b<-lm(prestige~income+education, data=na.omit(Prestige)) #'na.omit' removes missing observations mod2b<-lm(prestige~income+education+type, data=na.omit(Prestige)) # Should 'type' be included in the model? summary(mod2b) anova(mod1b, mod2b) Anova(mod2b) ## compare to 'anova (mod1b, mod2b)' AIC(mod2b) ################################################ # 2. Dealing with categorical predictors ################################################ #checking reference category for 'type' contrasts(type) # reference coded '0' # alternatively: levels(type) # first category is reference # changing the reference category to 'prof' contrasts(type)<-contr.treatment(levels(type), base=2) # second row contrasts(type) # alternatively: type2<-relevel(type, 'prof') contrasts(type2) ########################################################## # 3. Quasi-variances ######################################################### library(qvcalc) qvtype<-qvcalc(mod2b, "type") summary(qvtype) ## Is there a problem with the quasi-variances'? ## see '?qvcalc' for details ?qvcalc ## Refitting model wihout 'income': mod3<-lm(prestige~type, data=na.omit(Prestige)) qvtype.mod3<-qvcalc(mod3, "type") plot(qvtype.mod3) # plotting 95% confidence intervals for each category # based on quasi-variances #Making function for plot of regular standard errors: plot.se<-function (x, width = 2, ylab = "estimate", main = "Intervals based on standard errors", ...) { frame <- x$qvframe if (is.na(frame$estimate[1])) stop("x has no parameter estimates to plot") if (any(is.nan(frame$quasiSE))) levels <- factor(row.names(frame), levels = row.names(frame)) xvalues <- seq(along = levels) est <- frame$estimate se <- frame$SE tops <- est + width * se tails <- est - width * se range <- max(tops) - min(tails) ylim <- c(min(tails) - range/10, max(tops) + range/10) plot(levels, frame$estimate, border = "transparent", ylim = ylim, xlab = x$factorname, ylab = ylab, main = main, ...) points(frame$estimate, ...) segments(xvalues, tails, xvalues, tops) } #Plotting graphs of SE ad Quasi SEs side-by-side split.screen(figs=c(1,2)) # an alternative to 'par(mfrow=c(1,2))' screen(1) plot.se(qvtype.mod3, cex=1.5, main="Regular SEs") screen(2) plot(qvtype.mod3, cex=1.5, main="Quasi SEs") close.screen(all=TRUE) ########################################## # 4. Simple nonlinear relationships ########################################## # Fitting model with quadratic effect for income # creating income-squared term: Prestige$income2<-Prestige$income^2 Prestige ## Fitting model mod.poly<-lm(prestige~education+income+income2, data=Prestige) summary(mod.poly) ## Compare model 'mod.poly' to model with orthogonal polynomial: mod.poly2<-lm(prestige~education+poly(income,2), data=Prestige) summary(mod.poly) summary(mod.poly2) # How and why do the models differ? # How and why are they similar? mod.inc2<-lm(prestige~education+income2, data=Prestige) summary(mod.inc2) #Does this model differ from the quadratic polynomial? summary(mod.poly) #################################### # 5. Effect displays #################################### library(effects) ?effect # income effect for quadratic polynomial: inc.eff<-effect("poly(income,2)", mod.poly2) inc.eff # see what is in the 'effect' object summary(inc.eff) plot(inc.eff, main="Income effect", xlab="Income", ylab="Fitted Prestige score") ######Creating my own plot from the effects # a) Creating data.frame from effects output: inc.eff2<-data.frame(inc.eff) inc.eff2 # b) Extracting the fitted values and 95% confidence bands income.eff<-inc.eff2[1:nrow(inc.eff2), 1] fit.prestige<-inc.eff2[1:nrow(inc.eff2), 2] lower.prestige<-inc.eff2[1:nrow(inc.eff2), 4] upper.prestige<-inc.eff2[1:nrow(inc.eff2), 5] # c) making the plot plot(income.eff, fit.prestige, main="Effect display for income", xlab="Income", ylab="Fitted prestige", type="l", lwd=2, ylim=c(20, 75)) lines(income.eff, lower.prestige, lty=3) lines(income.eff, upper.prestige, lty=3) ### Creating an Effect Display using the 'predict' function ?predict income<-seq(0,26000, length=10) length(income) education<-seq(0,16, length=10) length(education) new.data<-data.frame(income, education) fitted.data<-data.frame(income, education, predict(mod.poly2, new.data, se.fit=TRUE)) fitted.data plot(fitted.data$income, fitted.data$fit, type="l", main="Fitted Prestige by Income", xlab="Income", ylab="Fitted prestige") ############################################# # 6. Standardization and Relative importance ############################################# #standardizing 'income' and 'education' using scale function: mod1<-lm(prestige~scale(income)+scale(education)+type, data=Prestige) summary(mod1) # relative importance of several terms: mod1b<-lm(prestige~income+education+type, data=Prestige) summary(mod1b) library(relimp) # comparing 'income' (2nd terms in summary) # and occupation type (4th and 5th terms in summary) relimp(mod1b, set1=2, set2=c(4,5)) ############################################## # 7. Interactions ############################################## mod2a<-lm(prestige~income*type, data=Prestige) ##alternatively mod2b<-lm(prestige~income+type+income:type, data=Prestige) Anova(mod2b) library(effects) plot(allEffects(mod2b)) ## type: ?AllEffects for help