Data Mining with R: Census Income

Objective: Predict whether income of a person exceeds $50K/yr based on census data.
Data: downloaded from http://archive.ics.uci.edu/ml/datasets/Adult
Method: Logistic Regression; Decision Tree

Initialization

rm(list=ls(all=TRUE)) ## Clear memory
library(RCurl)     ## package for downloading URLs 
library(XML)       ## package for reading HTML files
library(xtable)    ## package for printing HTML tables
library(corrplot)  ## package for plotting correlations
library(prettyR)   ## package for cross-tabulation
library(ggplot2)   ## package for producing graphical plots
library(gridExtra) ## package for drawing multiple graphs on a grid

Download data using RCurl

This function looks up the Index file in a given URL and downloads the content of URL using RCurl package.

download.files <- function(url) {
  ## Get the list of files in URL
  html <- getURLContent(url)           ## Get the Index file
  html.content <- htmlParse(html)      ## Get the html content
  html.table <- readHTMLTable(content) ## Read the html table
  files <- html.table[[1]]$Name        ## Get file names
  files <- files[!is.na(files)]        ## Remove NA
  files <- as.character(files[3:length(files)]) ## Ignore the first two entries: "Parent Directory" and "Index"
  print(files)
  ## Download files and save into local directory
  for (f in files){
    fname <- file.path(url,f)
    content <- getURLContent(fname)  ## Get the file content
    write(content,f)
  }
}

Get the training and test datasets

If files are already downloaded then import the saved data, otherwise read data from files:

## Check if files are already downloaded
if (file.exists("train.test.RData")) {
  load("train.test.RData")             
} else {
  url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/"
  download.files(url)
  ## Get training and test data
  training.file <- list.files(pattern=".data")
  training.data <- as.data.frame(read.csv(training.file))
  test.file <- list.files(pattern=".test")
  test.data <- as.data.frame(read.csv(test.file, skip=1))
  ## Get the names of variables
  description.file <- list.files(pattern=".names")[-1]
  content <- readLines(description.file)
  i <- grep("Attribute Information",content) + 2
  var.names <- NULL
  while(content[i]!="") {
    j <- gregexpr(":", content[i])[[1]][1]
    var.names <- c(var.names, substr(content[i],1,j-1))
    i <- i + 1
  }
  print(var.names)
  names(training.data) <- gsub("-","",var.names)
  names(test.data) <- gsub("-","",var.names)
  save(training.data, test.data, var.names, file="train.test.RData")
}
N.obs <- dim(training.data)[1]  ## Number of observations
N.var <- dim(training.data)[2]  ## Number of variables
cat("Number of observations:",N.obs,"

     Number of variables:",N.var,"\n")

Number of observations: 32560
Number of variables: 15

Look at the first few rows of the data

print(xtable(head(training.data), caption='Training Data'), caption.placement = 'top', type="html", 
      html.table.attributes='border=1 align="center" bgcolor="#FFCC00"')

Training Data
	age	workclass	fnlwgt	education	educationnum	maritalstatus	occupation	relationship	race	sex	hoursperweek	nativecountry	class
1	50	Self-emp-not-inc	83311	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	13	United-States	<=50K
2	38	Private	215646	HS-grad	9	Divorced	Handlers-cleaners	Not-in-family	White	Male	40	United-States	<=50K
3	53	Private	234721	11th	7	Married-civ-spouse	Handlers-cleaners	Husband	Black	Male	40	United-States	<=50K
4	28	Private	338409	Bachelors	13	Married-civ-spouse	Prof-specialty	Wife	Black	Female	40	Cuba	<=50K
5	37	Private	284582	Masters	14	Married-civ-spouse	Exec-managerial	Wife	White	Female	40	United-States	<=50K
6	49	Private	160187	9th	5	Married-spouse-absent	Other-service	Not-in-family	Black	Female	16	Jamaica	<=50K

Exploratory Data Analysis and Feature Selection

Check for missing data

cat("Number of missing data in training set:", sum(is.na(training.data)), "\n")

Number of missing data in training set: 0

Statistics summary

print(xtable(summary(training.data), caption='Summary'), caption.placement = 'top', type="html", html.table.attributes='border=1 align="center" bgcolor="green"')

Summary
	age	workclass	fnlwgt	education	educationnum	maritalstatus	occupation	relationship	race	sex	capitalgain	capitalloss	hoursperweek	nativecountry	class
1	Min. :17.0	Private :22696	Min. : 12285	HS-grad :10501	Min. : 1.0	Married-civ-spouse :14976	Prof-specialty :4140	Husband :13193	White :27815	Male :21789	Min. : 0	Min. : 0	Min. : 1.0	United-States:29169	<=50K:24719
2	1st Qu.:28.0	Self-emp-not-inc: 2541	1st Qu.: 117832	Some-college: 7291	1st Qu.: 9.0	Never-married :10682	Craft-repair :4099	Not-in-family : 8304	Black : 3124	Female:10771	1st Qu.: 0	1st Qu.: 0	1st Qu.:40.0	Mexico : 643	>50K : 7841
3	Median :37.0	Local-gov : 2093	Median : 178363	Bachelors : 5354	Median :10.0	Divorced : 4443	Exec-managerial:4066	Own-child : 5068	Asian-Pac-Islander: 1039		Median : 0	Median : 0	Median :40.0	? : 583
4	Mean :38.6	? : 1836	Mean : 189782	Masters : 1723	Mean :10.1	Separated : 1025	Adm-clerical :3769	Unmarried : 3446	Amer-Indian-Eskimo: 311		Mean : 1078	Mean : 87	Mean :40.4	Philippines : 198
5	3rd Qu.:48.0	State-gov : 1297	3rd Qu.: 237054	Assoc-voc : 1382	3rd Qu.:12.0	Widowed : 993	Sales :3650	Wife : 1568	Other : 271		3rd Qu.: 0	3rd Qu.: 0	3rd Qu.:45.0	Germany : 137
6	Max. :90.0	Self-emp-inc : 1116	Max. :1484705	11th : 1175	Max. :16.0	Married-spouse-absent: 418	Other-service :3295	Other-relative: 981			Max. :99999	Max. :4356	Max. :99.0	Canada : 121
7		(Other) : 981		(Other) : 5134		Married-AF-spouse : 23	(Other) :9541							(Other) : 1709

Sort categorical variables in descending order

categ.sort <- function(x){reorder(x,x,function(y){-length(y)})} ## Sorting function for categorical variables
categ.var <- which(sapply(training.data, is.factor)) ## Find the categorical variables
for (c in categ.var){  ## Apply the sort function on each categorical variable
  training.data[,c] <- categ.sort(training.data[,c])   
}
attach(training.data)

Histograms of numerical variables

## Function for displaying histograms using ggplot2
p1 <- ggplot(training.data, aes(x=age)) + ggtitle("Age") +
  geom_histogram(aes(y = 100*(..count..)/sum(..count..)), binwidth=5, colour="black", fill="white") + ylab("Percentage")
p2 <- ggplot(training.data, aes(x=log10(fnlwgt))) + ggtitle("log( Weight )") +
  geom_histogram(aes(y = 100*(..count..)/sum(..count..)), colour="black", fill="white") + ylab("Percentage")
p3 <- ggplot(training.data, aes(x=educationnum)) + ggtitle("Years of Education") + 
  geom_histogram(aes(y = 100*(..count..)/sum(..count..)), binwidth=1, colour="black", fill="white") + ylab("Percentage")
p4 <- ggplot(training.data, aes(x=hoursperweek)) + ggtitle("Hours per Week") +
  geom_histogram(aes(y = 100*(..count..)/sum(..count..)), colour="black", fill="white") + ylab("Percentage")
p5 <- ggplot(training.data, aes(x=log10(capitalgain+1))) + ggtitle("log( Capital Gain )") +
  geom_histogram(aes(y = 100*(..count..)/sum(..count..)), colour="black", fill="white") + ylab("Percentage") + 
  annotate("text", x = 3, y = 50, label = "X", colour="red", size=30, fontface="bold")
p6 <- ggplot(training.data, aes(x=log10(capitalloss+1))) + ggtitle("log( Capital Loss )") +
  geom_histogram(aes(y = 100*(..count..)/sum(..count..)), colour="black", fill="white") + ylab("Percentage") + 
  annotate("text", x = 2, y = 50, label = "X", colour="red", size=30, fontface="bold")
grid.arrange(p1, p2, p3, p4, p5, p6, ncol=3)
cat("% of data with zero Capital Gain:", sum(capitalgain==0)/N.obs*100, 
    "<br>% of data with zero Capital Loss:", sum(capitalloss==0)/N.obs*100)

plot of chunk histograms

% of data with zero Capital Gain: 91.67
% of data with zero Capital Loss: 95.33

Feature Analysis: Age, log(Weight), Years of Education, and Hours per Week have broad distributions, therefore will be considered for regression analysis. Capital Gain and Capital Loss, however, have very narrow distributions (more than 90% of data are clustered at zero) therefore they will be excluded from the model.

Bar plots of categorical variables

p1 <- ggplot(training.data, aes(x=workclass)) + ggtitle("Work Class") + xlab("Work Class") +
  geom_bar(aes(y = 100*(..count..)/sum(..count..))) + ylab("Percentage") + coord_flip() + 
  scale_x_discrete(limits = rev(levels(workclass)))
p2 <- ggplot(training.data, aes(x=education)) + ggtitle("Education") + xlab("Education") + 
  geom_bar(aes(y = 100*(..count..)/sum(..count..))) + ylab("Percentage") + coord_flip() +
  scale_x_discrete(limits = rev(levels(education)))
p3 <- ggplot(training.data, aes(x=occupation)) + ggtitle("Occupation") + xlab("Occupation") + 
  geom_bar(aes(y = 100*(..count..)/sum(..count..))) + ylab("Percentage") + coord_flip() +
  scale_x_discrete(limits = rev(levels(occupation)))
p4 <- ggplot(training.data, aes(x=nativecountry)) + ggtitle("Native Country") + xlab("Native Country") +
  geom_bar(aes(y = 100*(..count..)/sum(..count..))) + ylab("Percentage") + coord_flip() + 
  scale_x_discrete(limits = rev(levels(nativecountry))) +
  annotate("text", x = 21, y = 50, label = "X", colour="red", size=30, fontface="bold")
grid.arrange(p1, p2, p3, p4, ncol=2)

plot of chunk barplots Native Country has a very narrow distribution (90% of population coming from the United States), therefore will be excluded from the model.

Pie charts of categorical variables

p1 <- ggplot(training.data, aes(x=factor(1), fill=maritalstatus)) + 
  geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 1) + coord_polar(theta="y") + 
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), legend.title=element_blank()) + 
  xlab("") + ylab("") + ggtitle("Marital Status") 
p2 <- ggplot(training.data, aes(x=factor(1), fill=relationship)) + 
  geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 1) + coord_polar(theta="y") + 
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), legend.title=element_blank()) + 
  xlab("") + ylab("") + ggtitle("Relationship") 
p3 <- ggplot(training.data, aes(x=factor(1), fill=race)) + 
  geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 1) + coord_polar(theta="y") + 
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), legend.title=element_blank()) + 
  xlab("") + ylab("") + ggtitle("Race")
p4 <- ggplot(training.data, aes(x=factor(1), fill=sex)) + 
  geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 1) + coord_polar(theta="y") + 
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), legend.title=element_blank()) + 
  xlab("") + ylab("") + ggtitle("Sex")
grid.arrange(p1, p2, p3, p4, ncol=2)

plot of chunk piecharts All these variables have reasonable spread of distribution, therefore will be considered.

Correlation between numerical variables

numeric.var <- sapply(training.data, is.numeric) ## Find numerical variables
corr.matrix <- cor(training.data[,numeric.var])  ## Calculate the correlation matrix
corrplot(corr.matrix, main="\n\nCorrelation Plot for Numerical Variables")  ## Correlation plot
par(mar=c(5.1,4.1,4.1,2.1))  ## Restore plot margins

plot of chunk corr1

The numerical variables are nearly uncorrelated.

Correlation between numerical variables and income class

par(mfrow=c(2,2))  ## Arrange plots in a 4x4 grid
boxplot(age~class, main="Age vs. Income Class", 
        xlab="Income Class", ylab="Age")
boxplot(educationnum~class, main="Years of Eduction vs. Income Class", 
        xlab="Income Class", ylab="Years of Eduction")
boxplot(log(fnlwgt)~class, main="log(Weight) vs. Income Class", 
        xlab="Income Class", ylab="log(Weight)")
text(0.4, 12, "X", cex=8, font=2, col="red", xpd=TRUE)
boxplot(hoursperweek~class, main="Hours per Week vs. Income Class", 
        xlab="Income Class", ylab="Hours per Week")
par(mfrow=c(1,1))

plot of chunk corr2

Weight shows little variation with Income Class therefore it will be excluded from the model. Other variables have significant correlation with Income hence will be selected.

Correlation between numerical and categorical variables

Education and Years of Education exhibit perfect correlation (zero standard deviation within each category), therefore the categorical variable Education will be excluded from the model.

boxplot(educationnum~education, main="Years of Eduction\nvs. Education", 
        xlab="Education", ylab="Years of Eduction")
text(8.5, 0, "X", cex=8, font=2, col="red", xpd=TRUE)

plot of chunk corr3

AOV <- aov(educationnum~education)
print(xtable(summary(AOV), caption='Analysis of Variance'), caption.placement='top', type="html", 
  html.table.attributes='border=1 align="center" bgcolor="lightgray"')

Analysis of Variance
	Df	Sum Sq	Mean Sq	F value	Pr(>F)
education	15	215502.53	14366.84	54063530208304329392208800202220400.00	0.0000
Residuals	32544	0.00	0.00

Correlation between categorical variables and income class

par(mfrow=c(3,2))
par(las=1)  ## horizontal axis labels
plot(table(class, workclass), main="Work Class vs. Income Class", cex=1.5)
plot(table(class, maritalstatus), main="Marital Status vs. Income Class", cex=1.5)
plot(table(class, occupation), main="Occupation vs. Income Class", cex=1.5)
plot(table(class, relationship), main="Relationship vs. Income Class", cex=1.5)
plot(table(class, race), main="Race vs. Income Class", cex=1.5)
plot(table(class, sex), main="Sex vs. Income Class", cex=1.5)
par(las=0)  ## parallel axis labels

plot of chunk corr4

Work Class and Race show weak correlation with Income Class, nevertheless all these variables are selected.

Logistic Regression

Generalized linear model (glm)

Use the selected features in the glm model:

## Create a binary response variable
y <- rep(0, N.obs)
y[class==levels(class)[2]] <- 1
## GLM fit
fit <- glm(y ~ age + educationnum + hoursperweek + workclass + maritalstatus + occupation + relationship + race + sex, family=binomial("logit"))
save(fit, file="fit.RData")

Model summary

## Get the coefficients
tab <- summary(fit)$coefficients
## Sort by p-value
sorter <- order(tab[,4])
tab <- tab[sorter,]
## Display the result
print(xtable(tab, digits=3, caption='GLM coefficients<br>(sorted in ascending order of p-value)'), caption.placement='top', type="html", html.table.attributes='border=1 align="center" bgcolor="lightgray"')

GLM coefficients
(sorted in ascending order of p-value)
	Estimate	Std. Error	z value	Pr(>\|z\|)
educationnum	0.296	0.009	34.138	0.000
(Intercept)	-5.470	0.160	-34.100	0.000
hoursperweek	0.031	0.002	20.073	0.000
age	0.030	0.002	19.281	0.000
relationship Wife	1.360	0.095	14.288	0.000
occupation Other-service	-1.510	0.107	-14.085	0.000
occupation Farming-fishing	-1.546	0.124	-12.426	0.000
sex Female	-0.856	0.072	-11.931	0.000
workclass ?	-1.237	0.105	-11.819	0.000
maritalstatus Never-married	-2.511	0.248	-10.105	0.000
occupation Handlers-cleaners	-1.291	0.132	-9.765	0.000
occupation Machine-op-inspct	-0.896	0.092	-9.741	0.000
maritalstatus Divorced	-2.081	0.254	-8.194	0.000
occupation Transport-moving	-0.708	0.088	-8.068	0.000
occupation Adm-clerical	-0.587	0.075	-7.846	0.000
maritalstatus Separated	-2.206	0.284	-7.772	0.000
occupation Craft-repair	-0.514	0.067	-7.683	0.000
workclass Self-emp-not-inc	-0.454	0.060	-7.615	0.000
maritalstatus Widowed	-1.968	0.281	-7.006	0.000
maritalstatus Married-spouse-absent	-2.168	0.316	-6.857	0.000
workclass Federal-gov	0.446	0.088	5.054	0.000
occupation Sales	-0.294	0.066	-4.432	0.000
workclass State-gov	-0.372	0.086	-4.318	0.000
occupation Exec-managerial	0.223	0.059	3.800	0.000
workclass Self-emp-inc	0.241	0.079	3.058	0.002
occupation Priv-house-serv	-3.279	1.126	-2.912	0.004
workclass Local-gov	-0.195	0.069	-2.843	0.004
relationship Own-child	-0.671	0.252	-2.664	0.008
race Other	-0.669	0.252	-2.657	0.008
race Amer-Indian-Eskimo	-0.504	0.208	-2.425	0.015
relationship Not-in-family	0.565	0.251	2.248	0.025
race Asian-Pac-Islander	-0.165	0.096	-1.722	0.085
race Black	-0.119	0.070	-1.705	0.088
relationship Other-relative	-0.351	0.231	-1.521	0.128
relationship Unmarried	0.357	0.265	1.344	0.179
occupation Armed-Forces	-1.418	1.260	-1.125	0.260
maritalstatus Married-AF-spouse	0.370	0.481	0.769	0.442
occupation Protective-serv	-0.055	0.112	-0.495	0.621
occupation Tech-support	0.019	0.098	0.198	0.843
workclass Without-pay	-11.968	118.027	-0.101	0.919
workclass Never-worked	-10.953	162.917	-0.067	0.946

Feature analysis

The top three most-relevant features include Years of Education, Hours per Week, and Age, all of which are numeric variables.
At the bottom of the table, instances of Work Class, Occupation, Relationship, and Marital Status are observed. However these variables are seen all across the table, therefore cannot be eliminated from the model.
From the remaining two categorical variables, Sex appears near the top of the table and is highly ranked, however Race categories are ranked near the bottom of the table and therefore can be removed to avoid model overfitting.

Cross-Validation

Apply model to the test set

## Prediction of test data
pred <- predict(fit, test.data, type="response")
N.test <- length(pred)
## Use a threshold of 0.5 to assign the predicted classes
y.hat <- rep(0, N.test)
y.hat[pred>=0.5] <- 1

Confusion Table

Calculate true positive ($ TP $), false positive ($ FP $), false negative ($ FN $), and true negative ($ TN $), and evaluate accuracy:

\[ accuracy = \frac{TP+TN}{TP+FP+FN+TN} \]

## Get the true outcome of the test data
outcome <- levels(test.data$class)
y.test <- rep(0, N.test)
y.test[test.data$class==outcome[2]] <- 1
## Tabulate the true outcome vs. prediction
confusion.table <- table(y.hat, y.test)
colnames(confusion.table) <- c(paste("Actual",outcome[1]), outcome[2])
rownames(confusion.table) <- c(paste("Predicted",outcome[1]), outcome[2])
print(xtable(confusion.table, caption='Confusion Table for Logistic Regression'), caption.placement = 'top', type="html", html.table.attributes='border=1 align="center" bgcolor="pink"')

Confusion Table for Logistic Regression
	Actual <=50K	>50K
Predicted <=50K	11452	1705
>50K	982	2141

accuracy <- sum(diag(confusion.table)) / N.test
cat("accuracy =", accuracy*100, "%<br>missclassification =", 100-accuracy*100, "%\n")

accuracy = 83.5 %
missclassification = 16.5 %

Decision Tree

library(party)
tree1 <- ctree(class ~ educationnum + hoursperweek + age + sex + workclass + maritalstatus + occupation + relationship, data=training.data)
print.tree(tree1)

Conditional inference tree with 87 terminal nodes

Response: class
Inputs: educationnum, hoursperweek, age, sex, workclass, maritalstatus, occupation, relationship
Number of observations: 32560

1) relationship == { Husband, Wife}; criterion = 1, statistic = 6698.472
  2) educationnum <= 12; criterion = 1, statistic = 2346.306
    3) occupation == { Prof-specialty, Exec-managerial, Adm-clerical, Sales, Tech-support, Protective-serv}; criterion = 1, statistic = 755.73
      4) educationnum <= 9; criterion = 1, statistic = 95.657
        5) hoursperweek <= 34; criterion = 1, statistic = 41.951
          6) workclass == { Self-emp-inc, Federal-gov}; criterion = 0.972, statistic = 19.414
            7)* weights = 23
          6) workclass == { Private, Self-emp-not-inc, Local-gov, State-gov, Without-pay}
            8)* weights = 184
        5) hoursperweek > 34
          9) educationnum <= 8; criterion = 1, statistic = 31.898
            10)* weights = 195
          9) educationnum > 8
            11) age <= 34; criterion = 1, statistic = 37.574
              12) age <= 27; criterion = 1, statistic = 20.729
                13) hoursperweek <= 55; criterion = 0.956, statistic = 7.689
                  14)* weights = 100
                13) hoursperweek > 55
                  15)* weights = 12
              12) age > 27
                16)* weights = 280
            11) age > 34
              17) workclass == { Self-emp-not-inc, State-gov}; criterion = 1, statistic = 27.78
                18)* weights = 191
              17) workclass == { Private, Local-gov, Self-emp-inc, Federal-gov}
                19) occupation == { Exec-managerial, Tech-support}; criterion = 0.984, statistic = 18.824
                  20)* weights = 308
                19) occupation == { Prof-specialty, Adm-clerical, Sales, Protective-serv}
                  21)* weights = 557
      4) educationnum > 9
        22) age <= 33; criterion = 1, statistic = 51.745
          23) age <= 27; criterion = 1, statistic = 17.139
            24)* weights = 185
          23) age > 27
            25)* weights = 338
        22) age > 33
          26) workclass == { Self-emp-not-inc, State-gov, Without-pay}; criterion = 1, statistic = 45.321
            27)* weights = 218
          26) workclass == { Private, Local-gov, Self-emp-inc, Federal-gov}
            28) hoursperweek <= 34; criterion = 1, statistic = 28.93
              29) sex == { Female}; criterion = 0.986, statistic = 12.072
                30)* weights = 36
              29) sex == { Male}
                31)* weights = 64
            28) hoursperweek > 34
              32) occupation == { Adm-clerical, Sales}; criterion = 0.995, statistic = 21.56
                33)* weights = 524
              32) occupation == { Prof-specialty, Exec-managerial, Tech-support, Protective-serv}
                34)* weights = 821
    3) occupation == { Craft-repair, Other-service, Machine-op-inspct, ?, Transport-moving, Handlers-cleaners, Farming-fishing, Priv-house-serv, Armed-Forces}
      35) educationnum <= 7; criterion = 1, statistic = 222.588
        36) hoursperweek <= 45; criterion = 1, statistic = 30.09
          37) hoursperweek <= 38; criterion = 0.964, statistic = 17.103
            38)* weights = 216
          37) hoursperweek > 38
            39)* weights = 860
        36) hoursperweek > 45
          40)* weights = 228
      35) educationnum > 7
        41) occupation == { Other-service, ?, Handlers-cleaners, Farming-fishing, Priv-house-serv, Armed-Forces}; criterion = 1, statistic = 116.098
          42) age <= 40; criterion = 0.998, statistic = 13.415
            43) age <= 26; criterion = 0.973, statistic = 10.186
              44)* weights = 132
            43) age > 26
              45) educationnum <= 10; criterion = 0.97, statistic = 8.859
                46)* weights = 512
              45) educationnum > 10
                47)* weights = 74
          42) age > 40
            48) hoursperweek <= 34; criterion = 0.98, statistic = 10.071
              49) educationnum <= 9; criterion = 0.999, statistic = 15.251
                50) age <= 55; criterion = 0.993, statistic = 10.993
                  51)* weights = 34
                50) age > 55
                  52)* weights = 107
              49) educationnum > 9
                53)* weights = 71
            48) hoursperweek > 34
              54) occupation == { Other-service, Farming-fishing, Priv-house-serv}; criterion = 0.953, statistic = 14.465
                55)* weights = 370
              54) occupation == { ?, Handlers-cleaners}
                56)* weights = 243
        41) occupation == { Craft-repair, Machine-op-inspct, Transport-moving}
          57) age <= 35; criterion = 1, statistic = 93.145
            58) hoursperweek <= 47; criterion = 1, statistic = 21.257
              59) age <= 24; criterion = 0.997, statistic = 12.568
                60)* weights = 99
              59) age > 24
                61)* weights = 790
            58) hoursperweek > 47
              62)* weights = 296
          57) age > 35
            63) hoursperweek <= 35; criterion = 1, statistic = 24.539
              64)* weights = 140
            63) hoursperweek > 35
              65) educationnum <= 9; criterion = 1, statistic = 17.47
                66)* weights = 1380
              65) educationnum > 9
                67)* weights = 741
  2) educationnum > 12
    68) occupation == { Craft-repair, Adm-clerical, Other-service, Machine-op-inspct, ?, Transport-moving, Handlers-cleaners, Farming-fishing}; criterion = 1, statistic = 289.065
      69) hoursperweek <= 41; criterion = 0.997, statistic = 23.655
        70)* weights = 502
      69) hoursperweek > 41
        71) workclass == { Private, Local-gov, ?, State-gov, Self-emp-inc, Federal-gov}; criterion = 0.999, statistic = 28.308
          72)* weights = 201
        71) workclass == { Self-emp-not-inc}
          73)* weights = 53
    68) occupation == { Prof-specialty, Exec-managerial, Sales, Tech-support, Protective-serv, Armed-Forces}
      74) educationnum <= 13; criterion = 1, statistic = 49.21
        75) workclass == { Private, Self-emp-inc, Federal-gov}; criterion = 1, statistic = 36.172
          76) age <= 28; criterion = 1, statistic = 19.916
            77) age <= 25; criterion = 0.976, statistic = 8.765
              78)* weights = 46
            77) age > 25
              79) sex == { Female}; criterion = 0.986, statistic = 9.79
                80)* weights = 22
              79) sex == { Male}
                81)* weights = 76
          76) age > 28
            82) occupation == { Prof-specialty, Sales, Tech-support, Protective-serv}; criterion = 0.965, statistic = 15.113
              83)* weights = 852
            82) occupation == { Exec-managerial}
              84)* weights = 651
        75) workclass == { Self-emp-not-inc, Local-gov, State-gov}
          85)* weights = 490
      74) educationnum > 13
        86) hoursperweek <= 25; criterion = 1, statistic = 24.662
          87) sex == { Female}; criterion = 0.967, statistic = 8.195
            88)* weights = 14
          87) sex == { Male}
            89)* weights = 52
        86) hoursperweek > 25
          90)* weights = 1473
1) relationship == { Not-in-family, Own-child, Unmarried, Other-relative}
  91) occupation == { Prof-specialty, Exec-managerial, Protective-serv}; criterion = 1, statistic = 1146.951
    92) educationnum <= 13; criterion = 1, statistic = 201.34
      93) hoursperweek <= 44; criterion = 1, statistic = 99.093
        94) age <= 34; criterion = 1, statistic = 56.847
          95)* weights = 1152
        94) age > 34
          96) sex == { Female}; criterion = 1, statistic = 30.016
            97)* weights = 687
          96) sex == { Male}
            98)* weights = 353
      93) hoursperweek > 44
        99) educationnum <= 12; criterion = 1, statistic = 20.214
          100) sex == { Male}; criterion = 0.998, statistic = 16.955
            101) age <= 42; criterion = 0.972, statistic = 12.414
              102)* weights = 182
            101) age > 42
              103)* weights = 74
          100) sex == { Female}
            104) occupation == { Prof-specialty, Protective-serv}; criterion = 0.989, statistic = 13.259
              105)* weights = 41
            104) occupation == { Exec-managerial}
              106)* weights = 136
        99) educationnum > 12
          107) age <= 27; criterion = 1, statistic = 23.707
            108)* weights = 96
          107) age > 27
            109) workclass == { Self-emp-not-inc, Local-gov, State-gov, Self-emp-inc}; criterion = 0.998, statistic = 23.402
              110)* weights = 124
            109) workclass == { Private, Federal-gov}
              111) occupation == { Exec-managerial, Protective-serv}; criterion = 0.991, statistic = 17.141
                112)* weights = 146
              111) occupation == { Prof-specialty}
                113)* weights = 114
    92) educationnum > 13
      114) educationnum <= 14; criterion = 1, statistic = 46.391
        115) occupation == { Prof-specialty, Protective-serv}; criterion = 1, statistic = 48.1
          116) age <= 46; criterion = 0.954, statistic = 11.049
            117) workclass == { Private, Self-emp-not-inc, Self-emp-inc}; criterion = 0.983, statistic = 18.722
              118)* weights = 125
            117) workclass == { Local-gov, State-gov, Federal-gov}
              119)* weights = 138
          116) age > 46
            120)* weights = 145
        115) occupation == { Exec-managerial}
          121) hoursperweek <= 43; criterion = 0.999, statistic = 14.055
            122)* weights = 79
          121) hoursperweek > 43
            123) age <= 29; criterion = 0.963, statistic = 8.005
              124)* weights = 7
            123) age > 29
              125)* weights = 80
      114) educationnum > 14
        126) age <= 32; criterion = 0.988, statistic = 17.017
          127) maritalstatus == { Never-married, Married-spouse-absent}; criterion = 0.962, statistic = 10.668
            128)* weights = 61
          127) maritalstatus == { Divorced}
            129)* weights = 9
        126) age > 32
          130) sex == { Female}; criterion = 0.994, statistic = 17.268
            131) maritalstatus == { Divorced, Separated, Widowed, Married-spouse-absent}; criterion = 0.968, statistic = 15.369
              132)* weights = 46
            131) maritalstatus == { Never-married}
              133)* weights = 44
          130) sex == { Male}
            134)* weights = 99
  91) occupation == { Craft-repair, Adm-clerical, Sales, Other-service, Machine-op-inspct, ?, Transport-moving, Handlers-cleaners, Farming-fishing, Tech-support, Priv-house-serv, Armed-Forces}
    135) hoursperweek <= 41; criterion = 1, statistic = 268.244
      136) age <= 28; criterion = 1, statistic = 141.138
        137) maritalstatus == { Married-civ-spouse, Separated}; criterion = 1, statistic = 44.013
          138)* weights = 209
        137) maritalstatus == { Never-married, Divorced, Widowed, Married-spouse-absent}
          139) age <= 21; criterion = 0.993, statistic = 11.182
            140)* weights = 2627
          139) age > 21
            141) sex == { Female}; criterion = 0.951, statistic = 7.467
              142)* weights = 1225
            141) sex == { Male}
              143)* weights = 1477
      136) age > 28
        144) educationnum <= 12; criterion = 1, statistic = 95.835
          145) occupation == { Craft-repair, Farming-fishing, Tech-support}; criterion = 1, statistic = 68.331
            146)* weights = 986
          145) occupation == { Adm-clerical, Sales, Other-service, Machine-op-inspct, ?, Transport-moving, Handlers-cleaners, Priv-house-serv}
            147) maritalstatus == { Married-civ-spouse}; criterion = 1, statistic = 35.81
              148)* weights = 51
            147) maritalstatus == { Never-married, Divorced, Separated, Widowed, Married-spouse-absent, Married-AF-spouse}
              149) age <= 50; criterion = 0.994, statistic = 19.088
                150)* weights = 2958
              149) age > 50
                151)* weights = 1269
        144) educationnum > 12
          152) maritalstatus == { Married-civ-spouse}; criterion = 0.969, statistic = 20.743
            153)* weights = 11
          152) maritalstatus == { Never-married, Divorced, Separated, Widowed, Married-spouse-absent}
            154)* weights = 565
    135) hoursperweek > 41
      155) educationnum <= 12; criterion = 1, statistic = 103.455
        156) age <= 38; criterion = 1, statistic = 65.904
          157) maritalstatus == { Married-civ-spouse}; criterion = 1, statistic = 41.235
            158)* weights = 22
          157) maritalstatus == { Never-married, Divorced, Separated, Widowed, Married-spouse-absent}
            159) age <= 29; criterion = 1, statistic = 19.805
              160) workclass == { Self-emp-not-inc}; criterion = 0.999, statistic = 27.501
                161)* weights = 50
              160) workclass == { Private, Local-gov, ?, State-gov, Self-emp-inc, Federal-gov}
                162)* weights = 729
            159) age > 29
              163)* weights = 569
        156) age > 38
          164) sex == { Female}; criterion = 1, statistic = 35.112
            165)* weights = 315
          164) sex == { Male}
            166) maritalstatus == { Never-married, Divorced, Married-spouse-absent}; criterion = 0.996, statistic = 22.004
              167)* weights = 340
            166) maritalstatus == { Married-civ-spouse, Separated, Widowed}
              168)* weights = 48
      155) educationnum > 12
        169) educationnum <= 13; criterion = 1, statistic = 28.34
          170) age <= 32; criterion = 1, statistic = 18.878
            171)* weights = 167
          170) age > 32
            172)* weights = 180
        169) educationnum > 13
          173)* weights = 63

Cross-Validation

Apply the tree to test set

## Making sure the levels of categorical variables match between training and test datasets
outcome <- levels(class)
levels(test.data$class) <- outcome
test.data$sex <- factor(test.data$sex, levels(sex))
test.data$workclass <- factor(test.data$workclass, levels(workclass))
test.data$maritalstatus <- factor(test.data$maritalstatus, levels(maritalstatus))
test.data$occupation <- factor(test.data$occupation, levels(occupation))
test.data$relationship <- factor(test.data$relationship, levels(relationship))
## Prediction of test data
pred <- predict(tree1, test.data)
N.test <- length(pred)

Confusion Table

Calculate true positive ($ TP $), false positive ($ FP $), false negative ($ FN $), and true negative ($ TN $), and evaluate accuracy:

\[ accuracy = \frac{TP+TN}{TP+FP+FN+TN} \]

confusion.table <- table(pred, test.data$class)
colnames(confusion.table) <- c(paste("Actual",outcome[1]), outcome[2])
rownames(confusion.table) <- c(paste("Predicted",outcome[1]), outcome[2])
print(xtable(confusion.table, caption='Confusion Table for Decision Tree'), caption.placement = 'top', type="html", html.table.attributes='border=1 align="center" bgcolor="pink"')

Confusion Table for Decision Tree
	Actual <=50K	>50K
Predicted <=50K	11571	1814
>50K	863	2032

accuracy <- sum(diag(confusion.table)) / N.test
cat("accuracy =", accuracy*100, "%<br>missclassification =", 100-accuracy*100, "%\n")

accuracy = 83.56 %
missclassification = 16.44 %