### Soc 6708 ### Advanced Statistics ### Robert Andersen, University of Toronto ### Wednesday, May 16, 2012 ###################### ##INTRODUCTION TO R ###################### ########################################## ## 1. Creating variables and data frames ########################################## ## Creating a dataset with 4 variables and 13 observations ## One observation is missing, 'NA", on test score ##variable 1: gender<-c('man', 'man', 'woman', 'woman', 'man', 'man', 'man', 'woman','woman', 'woman', 'man', 'man', 'woman') gender length(gender) ##gives number of observations (i.e., rows) #variable 2: age<-c(30,22,23,24,25,26,23,30,26,28,30,22,26) age length(age) #variable 3: subject<-c('Soc','Soc', 'Soc','Soc', 'Soc','Soc','Soc', 'Polsci','Polsci','Polsci','Polsci', 'Polsci','Polsci') subject length(subject) #variable 4: test.score<-c(NA, 82,78,78,75,82,90,68,75,82,82,75,68) test.score ##'NA' is missing data code in R length(test.score) ###Creating the data frame out of the 4 variables: My.data<-data.frame(gender, age, subject, test.score) ############################### ## 2. Exploring a data frame ############################### My.data ##displays all of the data summary(My.data) ##summaries of the variables in the data frame nrow(My.data) #number of rows (observations) ncol(My.data) #number of columns (variables) length(My.data) #number of variables names(My.data) #variable names #indexing the data frame: My.data[1] #values in first column (first variable) My.data[2] #values in second column (second variable) My.data[1,2] #value in first row (observation), #second column (variable) My.data[1:5, 2:3] #values in first five rows, #columns 2 and 3 ###Taking subsets of the data: My.data2<-subset(My.data[1:5, 2:3]) My.data2 #selecting based on values of a variable My.data3<-subset(My.data, test.score>=80) My.data3 ls() #shows objects in workspace ##removing unwanted objects: remove(age, gender, My.data2, My.data3, subject, test.score) ls() # Various ways of looking at the data # Tables for categorical variables: summary(My.data$subject) table(My.data$subject) summary(My.data$subject)/length(My.data$subject) ##Two-way table table(My.data$subject, My.data$gender) library(gmodels) ##For Crosstab function CrossTable(My.data$subject, My.data$gender) ##Gives cell percentages ##summarizing quantitative variables: summary(My.data$test.score) mean(My.data$test.score) ##won't calculate if missing data mean(na.omit(My.data$test.score)) #'na.omit' removes missing median(My.data$test.score, na.rm=TRUE) #'na.rm' also removes missing min(My.data$test.score, na.rm=TRUE) max(My.data$test.score, na.rm=TRUE) ###Removing missing observations from a data.frame My.data2<-na.omit(My.data) My.data2 sd(My.data2$test.score) var(My.data2$test.score) fivenum(My.data2$test.score) by(My.data2$test.score, My.data2$subject, mean) by(My.data2$test.score, My.data2$gender, fivenum) ################################################### ## 3. Manipulating variables ################################################### ##attaching dataset so that variables are accessible attach(My.data) library(car) # includes 'recode' function #Recoding the missing observation test.score2<-recode(test.score, "NA=75") test.score [1] #indexing first observation (first row) test.score [1:5] #first five rows test.score2 [1:5] #first five rows ##Creating a 'factor' out of a numeric variable test.factor<-as.factor(test.score2) test.factor table(test.factor) #Collapsing test.score2 into categories test.cat<-recode(test.score2, "60:69='C'; 70:79='B'; 80:89='A'; else='A+'") table(test.cat) test.cat2<-recode(test.cat, "c('A', 'A+')='A'") test.cat2 table(test.cat2) ##Creating an 'ordered' variable test.ord<-ordered(test.cat, levels=c('C', 'B', 'A', 'A+')) test.ord table(test.ord) ##Another look at 'My.data' summary(My.data) ###Adding new variables to the 'My.data' data frame: My.data<-data.frame(My.data, test.cat, test.ord) My.data ##cleaning up workspace again ls() remove(My.data2, test.cat, test.cat2, test.factor, test.ord, test.score2) ls() ##Taking a subset of the data (those with Bs or higher) My.data2<-subset(My.data,test.cat!='C') #'!=' is 'not equal' My.data2 #removing the observation missing on 'test.score' My.data3<-na.omit(My.data2) My.data3 table(My.data3$test.cat) table(My.data2$test.cat) nrow(My.data3); nrow(My.data2) #Checking number of observations ###################################### ## 4. Some other important functions ###################################### # creating a variable from a sequence of numbers var1<-seq(0,100, by=10) #variable with range 0-100, #observations spaced at every 10 var1 # alternatively, but more limited: var2<-0:100 #0 to 100, every whole number var2 # 'ifelse' statement to manipulate the variable: var3<-ifelse(var1<=50, 0, 1) # if var1 is less than or # equal to 50, recode so it equals 0; # everything else is equal to 1 var3 var4<-ifelse(var1<=50, 0, var1) # if var1 is less than or # equal to 50, recode so it equals 0; # everything else remains same var4 # standardizing a variable: var1.scale<-as.numeric(scale(var1)) var1.scale mean(var1.scale) # taking the log of a variable var1.log<-log(var1) var1.log #'-inf' for first observation because var1=0 var1.log<-log(var1+.1) #add a small 'positive start' #to remove 0 before transformation # power transformations: var1.squared<-var1^2 #Quadratic transformation var1.cubed<-var1^3 #Cubic transformation var1.sqrt<-sqrt(var1) #square root var1.squared var1.cubed var1.sqrt ########################################### # 5. Saving a data frame ########################################### # as a rectangular text file New.Data<-data.frame(var1, var1.squared, var1.cubed, var1.sqrt) write.table(New.Data, file="c:/Temp/DataName.txt") # Notice the 'forward' leaning slashes! # alternatively, as a CSV file: write.csv(New.Data, file="c:/Temp/DataName.csv", row.names=FALSE) # 'row.names=FALSE' removes numbered row names ############################################## # 6. Importing existing data ############################################## # Text files: Text.data<-read.table('c:/Temp/DataName.txt', header=TRUE) #'header=TRUE' makes first row the variable names # CSV files CSV.data<-read.csv("C:/Temp/Redist_context.csv", header=TRUE) ### 'foreign' library has functions for importing datasets ### in the format of other statistical packages library(foreign) ##SPSS example: SPSS.data<-read.spss('c:/Temp/Redist.sav', use.value.labels=TRUE, to.data.frame=TRUE) # 'use.value.labels=TRUE': value labels as categories # 'to.data.frame=TRUE' imports to a data frame object # often gives 'warning' message but file will be fine ##Stata example STATA.data<-read.dta("c:/temp/ClassIdentity_WVS.dta", convert.factors=TRUE) # 'convert.factors=TRUE': value labels as categories ## merging data frames with 'merge' function Merged.data<-merge(SPSS.data,CSV.data, by="s021") # variable 's021' is country # 'SPSS.data' contains individual-level data # for 48 countries # 'CVS.data' contains country-level variables summary(SPSS.data) summary(CSV.data) summary(Merged.data) ############################### ## EXPLORING DISTRIBUTIONS ############################### ################################## # 1.Categorical variables ################################## library(car) ##some important functions and datasets data(Prestige)#Canadian prestige data in "car" library attach(Prestige) summary(Prestige) ##looking at variables in Prestige dataset ########################### ##Tables of Proportions ########################### #counts: table(type) #proportions: table(type)/length(type) #Aternativley library(gmodels) CrossTable(type) ##gives both counts and proportions #################### ##bar plots #################### ##works only on a 'table' of the variable barplot(table(type)) #customizing the graph: windows() grays<-palette(gray(1:100/100)) #gray scale barplot(table(type), xlab="", ylab="Percentage", ylim=c(0, 60), xaxt="n", legend=c("Blue collar", "Professional", "White collar"), args.legend=list(x="topright", bty="n"), col=c("gray1", "gray50", "white"), main="Distribution of 'Occupation Type'") ##Displaying bars horizontally: barplot(table(type), horiz=TRUE) ######################################### ## 2.Displaying quantitative distributions ######################################### summary(income) ##from 'Prestige' dataset (still attached) ################## # histogram ################## #changing bin size of the histogram so that the distribution #becomes more clear #let n*=number of nonoutling observations #for n*< or = 100, 2*srt(n*) works well; for n*>100, 10*log10n* works well 2*sqrt(length(income)) #length refers to the number of cases-returns answer of 20 hist(income, nclass=20)#nclassassigns the number of bins-reproduces Figure 3.2 n.bins(income)#returns answer of 15 hist(income, nclass=15) #nclass assigns the number of bins-reproduces Figure 3.2 box()#puts a box around the graph ########################## # stem and leaf display ########################## stem(income) stem(income, scale = 2)#Expands the scale of the stemplot ##################################### # historgram with density estimate ##################################### #histogram with naive density estimation #density estimation using rectangular weight funtion for hist(income, nclass=n.bins(income), probability=T, main='Histogram with Density Estimation', ylab='Density') lines(density(income), col='red', lwd=2) lines(density(income, bw = 400, kernel = c("rectangular")), lty=1, lwd=1) legend(locator(1), lty=1:1, lwd=2:1, col=2:1, legend=c('bw=1087', 'bw=400')) #density estimation using the normal density function hist(income, nclass=n.bins(income), probability=T, main='Histogram with Density Estimation', ylab='Density') lines(density(income), col=1, lty=1,lwd=2) lines(density(income, bw=400), col=2, lty=1, lwd=1) legend(locator(1), lty=1, lwd=2:1, col=1:2, legend=c('bw=1087', 'bw=400')) #could also adjust the bandwidth with "adjust=.5" which #would make it 1/2 the size of the default #example: #lines(density(income, adjust=.4), col=2, lwd=2) #the "sm" package provides some other functions for density estimation library(sm) windows() sm.density(income, display="se")#95 percent confidence band sm.density(income, model="normal")#normal reference band sm.density(income, display="se",model="normal") ############################### ##Quantile comparison plots ############################### library(car) windows() #opens a windows device par(mfrow=c(2,2)) #4 plots on one graph (2 rows, 2 columns) par(cex.axis=1.3, cex.lab=1.4) ##changes size of font for 'axes' and 'axes labels' rdata<-rnorm(300) qq.plot(rdata,ylab='Data Quantile',xlab='Normal Quantile', main="Normal distribution") ##qqpositive skew rdata<-rbeta(1000,.4,1.3) rdata<-rnorm(300) rdata<-2^(rdata-min(rdata)+1) par(cex.axis=1.4, cex.lab=1.5) qq.plot(rdata,ylab='Data Quantile',xlab='Normal Quantile', main="Positive Skew") #Quantile-Comparison Plot for income qq.plot(income, labels=row.names(Prestige)) ########################### #boxplot ########################### windows() boxplot(income, main='Boxplot of Income', ylab='Income') identify(rep(1, length(income)), income, row.names(Prestige)) #use mouse to identify outliers ############################### # 3. Exploring relationships ############################## data(anscombe)#Anscombes' contrived data are in "base" package attach(anscombe) summary(anscombe) #Displaying side-by-side scatterplots (Figure 3.1 of Fox, 1997) split.screen(figs=c(2,2)) screen(1) plot(x1, y1, xlim=range(0,20), ylim=range(0,15), main='(a) Accurate summary', xlab='X', ylab='Y', col='red', pch=19) abline(lm(y1~x1), lwd=2) screen(2) plot(x2, y2, xlim=range(0,20), ylim=range(0,15), main='(b) Distorts curvilinear rel.', xlab='X', ylab='Y',col='red', pch=19) abline(lm(y2~x2), lwd=2) screen(3) plot(x3, y3, xlim=range(0,20), ylim=range(0,15), main='(c) Drawn to outlier', xlab='X', ylab='Y', col='red', pch=19) abline(lm(y3~x3), lwd=2) screen(4) plot(x4, y4, xlim=range(0,20), ylim=range(0,15), main='(d) "Chases" outlier', xlab='X', ylab='Y', col='red', pch=19) abline(lm(y4~x4), lwd=2) close.screen(all = TRUE) # exit split-screen mode detach(anscombe) ###Two categorical variables attach(Ornstein) ##dataset in 'car' library ?Ornstein #learn about the data ##crosstabs table(sector, nation) library(gmodels) CrossTable(sector, nation) ##barplot barplot(table(sector, nation)) windows() Table<-table(nation,sector) barplot(Table, beside = TRUE, col = c("lightblue", "mistyrose", "lightcyan", "lavender"), legend = c("Canada", "Other", "UK", "USA"), ylim = c(0, 30), args.legend=list(x="topright", bty="n"), main = "Number of Interlocks by Sector and Nation of Countrol") #Side by side boxplots boxplot(income ~ type, data = Prestige, col="yellow", main='Boxplots of Income for different Occupation Types') detach(Prestige) ####Comparing density estimates #subset of only 'blue collar' jobs from Prestige dataset BC<-subset(Prestige, type=="bc") #subset of only 'white collar' jobs from Prestige dataset WC<-subset(Prestige, type=="wc") #subset of only 'professionals' jobs from Prestige dataset PROF<-subset(Prestige, type=="prof") windows() par(mfrow=c(1,3)) sm.density(BC$income, display="se", xlim=c(0,30000), xlab="Income") title("(a) Blue Collar") abline(v=median(Prestige$income), lty=2) text(12500,0.00025, "Median income") sm.density(WC$income,display="se", xlim=c(0,30000), xlab="Income") title("(b) White Collar") abline(v=median(Prestige$income), lty=2) text(12500,0.00025, "Median income") sm.density(PROF$income, display="se", xlim=c(0,30000), ylim=c(0, 0.0003), xlab="Income") title("(c) Professionals") abline(v=median(Prestige$income), lty=2) text(12500,0.00025, "Median income") #####Heatmap ##data set up in rectangular data.frame Heatmap.data<-read.table("c:/Oxford2011/Data/HeatMap_AJPS.txt", header=T) library(graphics) windows() heatmap(as.matrix(Heatmap.data), Rowv = NA, Colv = NA, scale="row", labCol =c("Never=1",2,3,4,5,6,7,8,9,"Always=10"), cexRow = 1.2, margins = c(8,10), xlab="Homosexuality is justifiable") ##In grayscale: # grays<-palette(gray(1:100/100)) # heatmap(as.matrix(1-Heatmap.data), col =grays, Rowv = NA, Colv = NA, scale="row", # labCol =c("Never=1",2,3,4,5,6,7,8,9,"Always=10"), cexRow = 1.2, margins = c(8,10), # xlab="Homosexuality is justifiable") detach("package:graphics") #example of how jittering a scatterplot makes the #relationship more clear CES <- read.table('C:/Oxford2011/Data/ces97.txt', header=T) attach(CES) summary(CES) plot(age, lascale, main='No jitter', xlab='Age',ylab='Conservative attitudes') abline(lm(lascale~age), lwd=4, col='red' ) #jittered scatterplot plot(jitter(age, factor=2), jitter(lascale, factor=5), main='jittered', xlab='Age',ylab='Conservative attitudes') abline(lm(lascale~age), lwd=4, col='red' ) #enhanced scatterplots display boxplots library(car) scatterplot(jitter(age, factor=2), jitter(lascale, factor=5), main='Enhanced scatterplot with boxplots', xlab='Age',ylab='Conservative attitudes') #Bivariate density estimates #Three variations library(sm) data<-cbind(age,lascale) sm.density(data) sm.density(data, display="image") sm.density(data, display="slice") detach(CES) #a 3D example of Density estimates using Prestige data data(Prestige) attach(Prestige) y <- cbind(income, prestige, education) windows() sm.density(y) detach(Prestige) #Categorical variables and interactions in scatterplots Weakliem <- read.table('C:/Oxford2011/data/Weakliem.txt', header=T) summary(Weakliem) attach(Weakliem) democratic<-factor(democrat) democratic<-recode(democratic, "c(1)='Democratic'; c(0)='Non-Democratic'") summary(democratic) #scatterplot with case names windows() plot(gini, secpay, pch=16, xlim=c(20,65), xlab="Gini Coefficient", ylab="Attitudes towards inequality") #adding space to 'width' from data point chw <- par()$cxy[1] text(gini+chw, secpay, labels=row.names(Weakliem), adj=0) ## scatterplot identifying a categorical variable scatterplot(secpay~gini|democratic, boxplot=F,data=Weakliem, span=.85, lwd=2, main='World Values Survey', xlab='Gini coefficient', ylab='Attitudes towards inequality(mean)', legend.plot=T) #scatterplot matrix scatterplot.matrix(cbind(gini, secpay, gdp)) ##Three dimensional plot y <- cbind(gini, secpay, gdp) sm.density(y) #Dynamic 3D scatterplot library(Rcmdr) #Rcmdr works best using R in SDI format attach(Weakliem) scatter3d(gdp, secpay, gini, fit="linear", bg="white", grid=TRUE) detach(Weakliem) #Conditioning Plots CES <- read.table('C:/Oxford2011/data/ces97.txt', header=T) attach(CES) summary(CES) coplot(jitter(lascale)~age|men+education, panel = panel.car, lwd=3, cex=0.4) detach(CES) coplot(gini~secpay|gdp, panel = panel.smooth) detach(Weakliem) ################################### # 4.Example of transformations ################################### data(Prestige) attach(Prestige) #log transformation of income to get rid of positive skew log.inc<-log(income, 10) sm.density(income, model="normal") sm.density(log.inc, model="normal") detach(Prestige) #transforming nonlinearity library(locfit) # local regression functions data(Leinhardt) # from 'car' library attach(Leinhardt) scatterplot(income, infant) ###Logit transformation for proportion library(car) library(sm) data(Prestige) attach(Prestige) par(mfrow=c(1,2)) sm.density(women, model="normal") sm.density(logit(women+1), model="normal")