Data Mining with R: Census Income

Initialization

rm(list=ls(all=TRUE)) ## Clear memory
library(RCurl)     ## package for downloading URLs 
library(XML)       ## package for reading HTML files
library(xtable)    ## package for printing HTML tables
library(corrplot)  ## package for plotting correlations
library(prettyR)   ## package for cross-tabulation
library(ggplot2)   ## package for producing graphical plots
library(gridExtra) ## package for drawing multiple graphs on a grid

Download data using RCurl

This function looks up the Index file in a given URL and downloads the content of URL using RCurl package.

download.files <- function(url) {
  ## Get the list of files in URL
  html <- getURLContent(url)           ## Get the Index file
  html.content <- htmlParse(html)      ## Get the html content
  html.table <- readHTMLTable(content) ## Read the html table
  files <- html.table[[1]]$Name        ## Get file names
  files <- files[!is.na(files)]        ## Remove NA
  files <- as.character(files[3:length(files)]) ## Ignore the first two entries: "Parent Directory" and "Index"
  print(files)
  ## Download files and save into local directory
  for (f in files){
    fname <- file.path(url,f)
    content <- getURLContent(fname)  ## Get the file content
    write(content,f)
  }
}

Get the training and test datasets

If files are already downloaded then import the saved data, otherwise read data from files:

## Check if files are already downloaded
if (file.exists("train.test.RData")) {
  load("train.test.RData")             
} else {
  url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/"
  download.files(url)
  ## Get training and test data
  training.file <- list.files(pattern=".data")
  training.data <- as.data.frame(read.csv(training.file))
  test.file <- list.files(pattern=".test")
  test.data <- as.data.frame(read.csv(test.file, skip=1))
  ## Get the names of variables
  description.file <- list.files(pattern=".names")[-1]
  content <- readLines(description.file)
  i <- grep("Attribute Information",content) + 2
  var.names <- NULL
  while(content[i]!="") {
    j <- gregexpr(":", content[i])[[1]][1]
    var.names <- c(var.names, substr(content[i],1,j-1))
    i <- i + 1
  }
  print(var.names)
  names(training.data) <- gsub("-","",var.names)
  names(test.data) <- gsub("-","",var.names)
  save(training.data, test.data, var.names, file="train.test.RData")
}
N.obs <- dim(training.data)[1]  ## Number of observations
N.var <- dim(training.data)[2]  ## Number of variables
cat("Number of observations:",N.obs,"
Number of variables:",N.var,"\n")

Number of observations: 32560
Number of variables: 15

Look at the first few rows of the data

print(xtable(head(training.data), caption='Training Data'), caption.placement = 'top', type="html", 
      html.table.attributes='border=1 align="center" bgcolor="#FFCC00"')
Training Data
age workclass fnlwgt education educationnum maritalstatus occupation relationship race sex capitalgain capitalloss hoursperweek nativecountry class
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
5 37 Private 284582 Masters 14 Married-civ-spouse Exec-managerial Wife White Female 0 0 40 United-States <=50K
6 49 Private 160187 9th 5 Married-spouse-absent Other-service Not-in-family Black Female 0 0 16 Jamaica <=50K

Exploratory Data Analysis and Feature Selection

Check for missing data

cat("Number of missing data in training set:", sum(is.na(training.data)), "\n")

Number of missing data in training set: 0

Statistics summary

print(xtable(summary(training.data), caption='Summary'), caption.placement = 'top', type="html", html.table.attributes='border=1 align="center" bgcolor="green"')
Summary
age workclass fnlwgt education educationnum maritalstatus occupation relationship race sex capitalgain capitalloss hoursperweek nativecountry class
1 Min. :17.0 Private :22696 Min. : 12285 HS-grad :10501 Min. : 1.0 Married-civ-spouse :14976 Prof-specialty :4140 Husband :13193 White :27815 Male :21789 Min. : 0 Min. : 0 Min. : 1.0 United-States:29169 <=50K:24719
2 1st Qu.:28.0 Self-emp-not-inc: 2541 1st Qu.: 117832 Some-college: 7291 1st Qu.: 9.0 Never-married :10682 Craft-repair :4099 Not-in-family : 8304 Black : 3124 Female:10771 1st Qu.: 0 1st Qu.: 0 1st Qu.:40.0 Mexico : 643 >50K : 7841
3 Median :37.0 Local-gov : 2093 Median : 178363 Bachelors : 5354 Median :10.0 Divorced : 4443 Exec-managerial:4066 Own-child : 5068 Asian-Pac-Islander: 1039 Median : 0 Median : 0 Median :40.0 ? : 583
4 Mean :38.6 ? : 1836 Mean : 189782 Masters : 1723 Mean :10.1 Separated : 1025 Adm-clerical :3769 Unmarried : 3446 Amer-Indian-Eskimo: 311 Mean : 1078 Mean : 87 Mean :40.4 Philippines : 198
5 3rd Qu.:48.0 State-gov : 1297 3rd Qu.: 237054 Assoc-voc : 1382 3rd Qu.:12.0 Widowed : 993 Sales :3650 Wife : 1568 Other : 271 3rd Qu.: 0 3rd Qu.: 0 3rd Qu.:45.0 Germany : 137
6 Max. :90.0 Self-emp-inc : 1116 Max. :1484705 11th : 1175 Max. :16.0 Married-spouse-absent: 418 Other-service :3295 Other-relative: 981 Max. :99999 Max. :4356 Max. :99.0 Canada : 121
7 (Other) : 981 (Other) : 5134 Married-AF-spouse : 23 (Other) :9541 (Other) : 1709

Sort categorical variables in descending order

categ.sort <- function(x){reorder(x,x,function(y){-length(y)})} ## Sorting function for categorical variables
categ.var <- which(sapply(training.data, is.factor)) ## Find the categorical variables
for (c in categ.var){  ## Apply the sort function on each categorical variable
  training.data[,c] <- categ.sort(training.data[,c])   
}
attach(training.data)

Histograms of numerical variables

## Function for displaying histograms using ggplot2
p1 <- ggplot(training.data, aes(x=age)) + ggtitle("Age") +
  geom_histogram(aes(y = 100*(..count..)/sum(..count..)), binwidth=5, colour="black", fill="white") + ylab("Percentage")
p2 <- ggplot(training.data, aes(x=log10(fnlwgt))) + ggtitle("log( Weight )") +
  geom_histogram(aes(y = 100*(..count..)/sum(..count..)), colour="black", fill="white") + ylab("Percentage")
p3 <- ggplot(training.data, aes(x=educationnum)) + ggtitle("Years of Education") + 
  geom_histogram(aes(y = 100*(..count..)/sum(..count..)), binwidth=1, colour="black", fill="white") + ylab("Percentage")
p4 <- ggplot(training.data, aes(x=hoursperweek)) + ggtitle("Hours per Week") +
  geom_histogram(aes(y = 100*(..count..)/sum(..count..)), colour="black", fill="white") + ylab("Percentage")
p5 <- ggplot(training.data, aes(x=log10(capitalgain+1))) + ggtitle("log( Capital Gain )") +
  geom_histogram(aes(y = 100*(..count..)/sum(..count..)), colour="black", fill="white") + ylab("Percentage") + 
  annotate("text", x = 3, y = 50, label = "X", colour="red", size=30, fontface="bold")
p6 <- ggplot(training.data, aes(x=log10(capitalloss+1))) + ggtitle("log( Capital Loss )") +
  geom_histogram(aes(y = 100*(..count..)/sum(..count..)), colour="black", fill="white") + ylab("Percentage") + 
  annotate("text", x = 2, y = 50, label = "X", colour="red", size=30, fontface="bold")
grid.arrange(p1, p2, p3, p4, p5, p6, ncol=3)
cat("% of data with zero Capital Gain:", sum(capitalgain==0)/N.obs*100, 
    "<br>% of data with zero Capital Loss:", sum(capitalloss==0)/N.obs*100)

plot of chunk histograms

% of data with zero Capital Gain: 91.67
% of data with zero Capital Loss: 95.33

Feature Analysis: Age, log(Weight), Years of Education, and Hours per Week have broad distributions, therefore will be considered for regression analysis. Capital Gain and Capital Loss, however, have very narrow distributions (more than 90% of data are clustered at zero) therefore they will be excluded from the model.

Bar plots of categorical variables

p1 <- ggplot(training.data, aes(x=workclass)) + ggtitle("Work Class") + xlab("Work Class") +
  geom_bar(aes(y = 100*(..count..)/sum(..count..))) + ylab("Percentage") + coord_flip() + 
  scale_x_discrete(limits = rev(levels(workclass)))
p2 <- ggplot(training.data, aes(x=education)) + ggtitle("Education") + xlab("Education") + 
  geom_bar(aes(y = 100*(..count..)/sum(..count..))) + ylab("Percentage") + coord_flip() +
  scale_x_discrete(limits = rev(levels(education)))
p3 <- ggplot(training.data, aes(x=occupation)) + ggtitle("Occupation") + xlab("Occupation") + 
  geom_bar(aes(y = 100*(..count..)/sum(..count..))) + ylab("Percentage") + coord_flip() +
  scale_x_discrete(limits = rev(levels(occupation)))
p4 <- ggplot(training.data, aes(x=nativecountry)) + ggtitle("Native Country") + xlab("Native Country") +
  geom_bar(aes(y = 100*(..count..)/sum(..count..))) + ylab("Percentage") + coord_flip() + 
  scale_x_discrete(limits = rev(levels(nativecountry))) +
  annotate("text", x = 21, y = 50, label = "X", colour="red", size=30, fontface="bold")
grid.arrange(p1, p2, p3, p4, ncol=2)

plot of chunk barplots Native Country has a very narrow distribution (90% of population coming from the United States), therefore will be excluded from the model.

Pie charts of categorical variables

p1 <- ggplot(training.data, aes(x=factor(1), fill=maritalstatus)) + 
  geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 1) + coord_polar(theta="y") + 
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), legend.title=element_blank()) + 
  xlab("") + ylab("") + ggtitle("Marital Status") 
p2 <- ggplot(training.data, aes(x=factor(1), fill=relationship)) + 
  geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 1) + coord_polar(theta="y") + 
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), legend.title=element_blank()) + 
  xlab("") + ylab("") + ggtitle("Relationship") 
p3 <- ggplot(training.data, aes(x=factor(1), fill=race)) + 
  geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 1) + coord_polar(theta="y") + 
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), legend.title=element_blank()) + 
  xlab("") + ylab("") + ggtitle("Race")
p4 <- ggplot(training.data, aes(x=factor(1), fill=sex)) + 
  geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 1) + coord_polar(theta="y") + 
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), legend.title=element_blank()) + 
  xlab("") + ylab("") + ggtitle("Sex")
grid.arrange(p1, p2, p3, p4, ncol=2)

plot of chunk piecharts All these variables have reasonable spread of distribution, therefore will be considered.

Correlation between numerical variables

numeric.var <- sapply(training.data, is.numeric) ## Find numerical variables
corr.matrix <- cor(training.data[,numeric.var])  ## Calculate the correlation matrix
corrplot(corr.matrix, main="\n\nCorrelation Plot for Numerical Variables")  ## Correlation plot
par(mar=c(5.1,4.1,4.1,2.1))  ## Restore plot margins

plot of chunk corr1

The numerical variables are nearly uncorrelated.

Correlation between numerical variables and income class

par(mfrow=c(2,2))  ## Arrange plots in a 4x4 grid
boxplot(age~class, main="Age vs. Income Class", 
        xlab="Income Class", ylab="Age")
boxplot(educationnum~class, main="Years of Eduction vs. Income Class", 
        xlab="Income Class", ylab="Years of Eduction")
boxplot(log(fnlwgt)~class, main="log(Weight) vs. Income Class", 
        xlab="Income Class", ylab="log(Weight)")
text(0.4, 12, "X", cex=8, font=2, col="red", xpd=TRUE)
boxplot(hoursperweek~class, main="Hours per Week vs. Income Class", 
        xlab="Income Class", ylab="Hours per Week")
par(mfrow=c(1,1))

plot of chunk corr2

Weight shows little variation with Income Class therefore it will be excluded from the model. Other variables have significant correlation with Income hence will be selected.

Correlation between numerical and categorical variables

Education and Years of Education exhibit perfect correlation (zero standard deviation within each category), therefore the categorical variable Education will be excluded from the model.

boxplot(educationnum~education, main="Years of Eduction\nvs. Education", 
        xlab="Education", ylab="Years of Eduction")
text(8.5, 0, "X", cex=8, font=2, col="red", xpd=TRUE)

plot of chunk corr3

AOV <- aov(educationnum~education)
print(xtable(summary(AOV), caption='Analysis of Variance'), caption.placement='top', type="html", 
  html.table.attributes='border=1 align="center" bgcolor="lightgray"')
Analysis of Variance
Df Sum Sq Mean Sq F value Pr(>F)
education 15 215502.53 14366.84 54063530208304329392208800202220400.00 0.0000
Residuals 32544 0.00 0.00

Correlation between categorical variables and income class

par(mfrow=c(3,2))
par(las=1)  ## horizontal axis labels
plot(table(class, workclass), main="Work Class vs. Income Class", cex=1.5)
plot(table(class, maritalstatus), main="Marital Status vs. Income Class", cex=1.5)
plot(table(class, occupation), main="Occupation vs. Income Class", cex=1.5)
plot(table(class, relationship), main="Relationship vs. Income Class", cex=1.5)
plot(table(class, race), main="Race vs. Income Class", cex=1.5)
plot(table(class, sex), main="Sex vs. Income Class", cex=1.5)
par(las=0)  ## parallel axis labels

plot of chunk corr4

Work Class and Race show weak correlation with Income Class, nevertheless all these variables are selected.

Logistic Regression

Generalized linear model (glm)

Use the selected features in the glm model:

## Create a binary response variable
y <- rep(0, N.obs)
y[class==levels(class)[2]] <- 1
## GLM fit
fit <- glm(y ~ age + educationnum + hoursperweek + workclass + maritalstatus + occupation + relationship + race + sex, family=binomial("logit"))
save(fit, file="fit.RData")

Model summary

## Get the coefficients
tab <- summary(fit)$coefficients
## Sort by p-value
sorter <- order(tab[,4])
tab <- tab[sorter,]
## Display the result
print(xtable(tab, digits=3, caption='GLM coefficients<br>(sorted in ascending order of p-value)'), caption.placement='top', type="html", html.table.attributes='border=1 align="center" bgcolor="lightgray"')
GLM coefficients
(sorted in ascending order of p-value)
Estimate Std. Error z value Pr(>|z|)
educationnum 0.296 0.009 34.138 0.000
(Intercept) -5.470 0.160 -34.100 0.000
hoursperweek 0.031 0.002 20.073 0.000
age 0.030 0.002 19.281 0.000
relationship Wife 1.360 0.095 14.288 0.000
occupation Other-service -1.510 0.107 -14.085 0.000
occupation Farming-fishing -1.546 0.124 -12.426 0.000
sex Female -0.856 0.072 -11.931 0.000
workclass ? -1.237 0.105 -11.819 0.000
maritalstatus Never-married -2.511 0.248 -10.105 0.000
occupation Handlers-cleaners -1.291 0.132 -9.765 0.000
occupation Machine-op-inspct -0.896 0.092 -9.741 0.000
maritalstatus Divorced -2.081 0.254 -8.194 0.000
occupation Transport-moving -0.708 0.088 -8.068 0.000
occupation Adm-clerical -0.587 0.075 -7.846 0.000
maritalstatus Separated -2.206 0.284 -7.772 0.000
occupation Craft-repair -0.514 0.067 -7.683 0.000
workclass Self-emp-not-inc -0.454 0.060 -7.615 0.000
maritalstatus Widowed -1.968 0.281 -7.006 0.000
maritalstatus Married-spouse-absent -2.168 0.316 -6.857 0.000
workclass Federal-gov 0.446 0.088 5.054 0.000
occupation Sales -0.294 0.066 -4.432 0.000
workclass State-gov -0.372 0.086 -4.318 0.000
occupation Exec-managerial 0.223 0.059 3.800 0.000
workclass Self-emp-inc 0.241 0.079 3.058 0.002
occupation Priv-house-serv -3.279 1.126 -2.912 0.004
workclass Local-gov -0.195 0.069 -2.843 0.004
relationship Own-child -0.671 0.252 -2.664 0.008
race Other -0.669 0.252 -2.657 0.008
race Amer-Indian-Eskimo -0.504 0.208 -2.425 0.015
relationship Not-in-family 0.565 0.251 2.248 0.025
race Asian-Pac-Islander -0.165 0.096 -1.722 0.085
race Black -0.119 0.070 -1.705 0.088
relationship Other-relative -0.351 0.231 -1.521 0.128
relationship Unmarried 0.357 0.265 1.344 0.179
occupation Armed-Forces -1.418 1.260 -1.125 0.260
maritalstatus Married-AF-spouse 0.370 0.481 0.769 0.442
occupation Protective-serv -0.055 0.112 -0.495 0.621
occupation Tech-support 0.019 0.098 0.198 0.843
workclass Without-pay -11.968 118.027 -0.101 0.919
workclass Never-worked -10.953 162.917 -0.067 0.946

Feature analysis

Cross-Validation

Apply model to the test set

## Prediction of test data
pred <- predict(fit, test.data, type="response")
N.test <- length(pred)
## Use a threshold of 0.5 to assign the predicted classes
y.hat <- rep(0, N.test)
y.hat[pred>=0.5] <- 1

Confusion Table

Calculate true positive (\( TP \)), false positive (\( FP \)), false negative (\( FN \)), and true negative (\( TN \)), and evaluate accuracy:

\[ accuracy = \frac{TP+TN}{TP+FP+FN+TN} \]

## Get the true outcome of the test data
outcome <- levels(test.data$class)
y.test <- rep(0, N.test)
y.test[test.data$class==outcome[2]] <- 1
## Tabulate the true outcome vs. prediction
confusion.table <- table(y.hat, y.test)
colnames(confusion.table) <- c(paste("Actual",outcome[1]), outcome[2])
rownames(confusion.table) <- c(paste("Predicted",outcome[1]), outcome[2])
print(xtable(confusion.table, caption='Confusion Table for Logistic Regression'), caption.placement = 'top', type="html", html.table.attributes='border=1 align="center" bgcolor="pink"')
Confusion Table for Logistic Regression
Actual
<=50K
>50K
Predicted <=50K 11452 1705
>50K 982 2141
accuracy <- sum(diag(confusion.table)) / N.test
cat("accuracy =", accuracy*100, "%<br>missclassification =", 100-accuracy*100, "%\n")

accuracy = 83.5 %
missclassification = 16.5 %

Decision Tree

library(party)
tree1 <- ctree(class ~ educationnum + hoursperweek + age + sex + workclass + maritalstatus + occupation + relationship, data=training.data)
print.tree(tree1)


Conditional inference tree with 87 terminal nodes

Response: class
Inputs: educationnum, hoursperweek, age, sex, workclass, maritalstatus, occupation, relationship
Number of observations: 32560

1) relationship == { Husband, Wife}; criterion = 1, statistic = 6698.472
  2) educationnum <= 12; criterion = 1, statistic = 2346.306
    3) occupation == { Prof-specialty, Exec-managerial, Adm-clerical, Sales, Tech-support, Protective-serv}; criterion = 1, statistic = 755.73
      4) educationnum <= 9; criterion = 1, statistic = 95.657
        5) hoursperweek <= 34; criterion = 1, statistic = 41.951
          6) workclass == { Self-emp-inc, Federal-gov}; criterion = 0.972, statistic = 19.414
            7)* weights = 23
          6) workclass == { Private, Self-emp-not-inc, Local-gov, State-gov, Without-pay}
            8)* weights = 184
        5) hoursperweek > 34
          9) educationnum <= 8; criterion = 1, statistic = 31.898
            10)* weights = 195
          9) educationnum > 8
            11) age <= 34; criterion = 1, statistic = 37.574
              12) age <= 27; criterion = 1, statistic = 20.729
                13) hoursperweek <= 55; criterion = 0.956, statistic = 7.689
                  14)* weights = 100
                13) hoursperweek > 55
                  15)* weights = 12
              12) age > 27
                16)* weights = 280
            11) age > 34
              17) workclass == { Self-emp-not-inc, State-gov}; criterion = 1, statistic = 27.78
                18)* weights = 191
              17) workclass == { Private, Local-gov, Self-emp-inc, Federal-gov}
                19) occupation == { Exec-managerial, Tech-support}; criterion = 0.984, statistic = 18.824
                  20)* weights = 308
                19) occupation == { Prof-specialty, Adm-clerical, Sales, Protective-serv}
                  21)* weights = 557
      4) educationnum > 9
        22) age <= 33; criterion = 1, statistic = 51.745
          23) age <= 27; criterion = 1, statistic = 17.139
            24)* weights = 185
          23) age > 27
            25)* weights = 338
        22) age > 33
          26) workclass == { Self-emp-not-inc, State-gov, Without-pay}; criterion = 1, statistic = 45.321
            27)* weights = 218
          26) workclass == { Private, Local-gov, Self-emp-inc, Federal-gov}
            28) hoursperweek <= 34; criterion = 1, statistic = 28.93
              29) sex == { Female}; criterion = 0.986, statistic = 12.072
                30)* weights = 36
              29) sex == { Male}
                31)* weights = 64
            28) hoursperweek > 34
              32) occupation == { Adm-clerical, Sales}; criterion = 0.995, statistic = 21.56
                33)* weights = 524
              32) occupation == { Prof-specialty, Exec-managerial, Tech-support, Protective-serv}
                34)* weights = 821
    3) occupation == { Craft-repair, Other-service, Machine-op-inspct, ?, Transport-moving, Handlers-cleaners, Farming-fishing, Priv-house-serv, Armed-Forces}
      35) educationnum <= 7; criterion = 1, statistic = 222.588
        36) hoursperweek <= 45; criterion = 1, statistic = 30.09
          37) hoursperweek <= 38; criterion = 0.964, statistic = 17.103
            38)* weights = 216
          37) hoursperweek > 38
            39)* weights = 860
        36) hoursperweek > 45
          40)* weights = 228
      35) educationnum > 7
        41) occupation == { Other-service, ?, Handlers-cleaners, Farming-fishing, Priv-house-serv, Armed-Forces}; criterion = 1, statistic = 116.098
          42) age <= 40; criterion = 0.998, statistic = 13.415
            43) age <= 26; criterion = 0.973, statistic = 10.186
              44)* weights = 132
            43) age > 26
              45) educationnum <= 10; criterion = 0.97, statistic = 8.859
                46)* weights = 512
              45) educationnum > 10
                47)* weights = 74
          42) age > 40
            48) hoursperweek <= 34; criterion = 0.98, statistic = 10.071
              49) educationnum <= 9; criterion = 0.999, statistic = 15.251
                50) age <= 55; criterion = 0.993, statistic = 10.993
                  51)* weights = 34
                50) age > 55
                  52)* weights = 107
              49) educationnum > 9
                53)* weights = 71
            48) hoursperweek > 34
              54) occupation == { Other-service, Farming-fishing, Priv-house-serv}; criterion = 0.953, statistic = 14.465
                55)* weights = 370
              54) occupation == { ?, Handlers-cleaners}
                56)* weights = 243
        41) occupation == { Craft-repair, Machine-op-inspct, Transport-moving}
          57) age <= 35; criterion = 1, statistic = 93.145
            58) hoursperweek <= 47; criterion = 1, statistic = 21.257
              59) age <= 24; criterion = 0.997, statistic = 12.568
                60)* weights = 99
              59) age > 24
                61)* weights = 790
            58) hoursperweek > 47
              62)* weights = 296
          57) age > 35
            63) hoursperweek <= 35; criterion = 1, statistic = 24.539
              64)* weights = 140
            63) hoursperweek > 35
              65) educationnum <= 9; criterion = 1, statistic = 17.47
                66)* weights = 1380
              65) educationnum > 9
                67)* weights = 741
  2) educationnum > 12
    68) occupation == { Craft-repair, Adm-clerical, Other-service, Machine-op-inspct, ?, Transport-moving, Handlers-cleaners, Farming-fishing}; criterion = 1, statistic = 289.065
      69) hoursperweek <= 41; criterion = 0.997, statistic = 23.655
        70)* weights = 502
      69) hoursperweek > 41
        71) workclass == { Private, Local-gov, ?, State-gov, Self-emp-inc, Federal-gov}; criterion = 0.999, statistic = 28.308
          72)* weights = 201
        71) workclass == { Self-emp-not-inc}
          73)* weights = 53
    68) occupation == { Prof-specialty, Exec-managerial, Sales, Tech-support, Protective-serv, Armed-Forces}
      74) educationnum <= 13; criterion = 1, statistic = 49.21
        75) workclass == { Private, Self-emp-inc, Federal-gov}; criterion = 1, statistic = 36.172
          76) age <= 28; criterion = 1, statistic = 19.916
            77) age <= 25; criterion = 0.976, statistic = 8.765
              78)* weights = 46
            77) age > 25
              79) sex == { Female}; criterion = 0.986, statistic = 9.79
                80)* weights = 22
              79) sex == { Male}
                81)* weights = 76
          76) age > 28
            82) occupation == { Prof-specialty, Sales, Tech-support, Protective-serv}; criterion = 0.965, statistic = 15.113
              83)* weights = 852
            82) occupation == { Exec-managerial}
              84)* weights = 651
        75) workclass == { Self-emp-not-inc, Local-gov, State-gov}
          85)* weights = 490
      74) educationnum > 13
        86) hoursperweek <= 25; criterion = 1, statistic = 24.662
          87) sex == { Female}; criterion = 0.967, statistic = 8.195
            88)* weights = 14
          87) sex == { Male}
            89)* weights = 52
        86) hoursperweek > 25
          90)* weights = 1473
1) relationship == { Not-in-family, Own-child, Unmarried, Other-relative}
  91) occupation == { Prof-specialty, Exec-managerial, Protective-serv}; criterion = 1, statistic = 1146.951
    92) educationnum <= 13; criterion = 1, statistic = 201.34
      93) hoursperweek <= 44; criterion = 1, statistic = 99.093
        94) age <= 34; criterion = 1, statistic = 56.847
          95)* weights = 1152
        94) age > 34
          96) sex == { Female}; criterion = 1, statistic = 30.016
            97)* weights = 687
          96) sex == { Male}
            98)* weights = 353
      93) hoursperweek > 44
        99) educationnum <= 12; criterion = 1, statistic = 20.214
          100) sex == { Male}; criterion = 0.998, statistic = 16.955
            101) age <= 42; criterion = 0.972, statistic = 12.414
              102)* weights = 182
            101) age > 42
              103)* weights = 74
          100) sex == { Female}
            104) occupation == { Prof-specialty, Protective-serv}; criterion = 0.989, statistic = 13.259
              105)* weights = 41
            104) occupation == { Exec-managerial}
              106)* weights = 136
        99) educationnum > 12
          107) age <= 27; criterion = 1, statistic = 23.707
            108)* weights = 96
          107) age > 27
            109) workclass == { Self-emp-not-inc, Local-gov, State-gov, Self-emp-inc}; criterion = 0.998, statistic = 23.402
              110)* weights = 124
            109) workclass == { Private, Federal-gov}
              111) occupation == { Exec-managerial, Protective-serv}; criterion = 0.991, statistic = 17.141
                112)* weights = 146
              111) occupation == { Prof-specialty}
                113)* weights = 114
    92) educationnum > 13
      114) educationnum <= 14; criterion = 1, statistic = 46.391
        115) occupation == { Prof-specialty, Protective-serv}; criterion = 1, statistic = 48.1
          116) age <= 46; criterion = 0.954, statistic = 11.049
            117) workclass == { Private, Self-emp-not-inc, Self-emp-inc}; criterion = 0.983, statistic = 18.722
              118)* weights = 125
            117) workclass == { Local-gov, State-gov, Federal-gov}
              119)* weights = 138
          116) age > 46
            120)* weights = 145
        115) occupation == { Exec-managerial}
          121) hoursperweek <= 43; criterion = 0.999, statistic = 14.055
            122)* weights = 79
          121) hoursperweek > 43
            123) age <= 29; criterion = 0.963, statistic = 8.005
              124)* weights = 7
            123) age > 29
              125)* weights = 80
      114) educationnum > 14
        126) age <= 32; criterion = 0.988, statistic = 17.017
          127) maritalstatus == { Never-married, Married-spouse-absent}; criterion = 0.962, statistic = 10.668
            128)* weights = 61
          127) maritalstatus == { Divorced}
            129)* weights = 9
        126) age > 32
          130) sex == { Female}; criterion = 0.994, statistic = 17.268
            131) maritalstatus == { Divorced, Separated, Widowed, Married-spouse-absent}; criterion = 0.968, statistic = 15.369
              132)* weights = 46
            131) maritalstatus == { Never-married}
              133)* weights = 44
          130) sex == { Male}
            134)* weights = 99
  91) occupation == { Craft-repair, Adm-clerical, Sales, Other-service, Machine-op-inspct, ?, Transport-moving, Handlers-cleaners, Farming-fishing, Tech-support, Priv-house-serv, Armed-Forces}
    135) hoursperweek <= 41; criterion = 1, statistic = 268.244
      136) age <= 28; criterion = 1, statistic = 141.138
        137) maritalstatus == { Married-civ-spouse, Separated}; criterion = 1, statistic = 44.013
          138)* weights = 209
        137) maritalstatus == { Never-married, Divorced, Widowed, Married-spouse-absent}
          139) age <= 21; criterion = 0.993, statistic = 11.182
            140)* weights = 2627
          139) age > 21
            141) sex == { Female}; criterion = 0.951, statistic = 7.467
              142)* weights = 1225
            141) sex == { Male}
              143)* weights = 1477
      136) age > 28
        144) educationnum <= 12; criterion = 1, statistic = 95.835
          145) occupation == { Craft-repair, Farming-fishing, Tech-support}; criterion = 1, statistic = 68.331
            146)* weights = 986
          145) occupation == { Adm-clerical, Sales, Other-service, Machine-op-inspct, ?, Transport-moving, Handlers-cleaners, Priv-house-serv}
            147) maritalstatus == { Married-civ-spouse}; criterion = 1, statistic = 35.81
              148)* weights = 51
            147) maritalstatus == { Never-married, Divorced, Separated, Widowed, Married-spouse-absent, Married-AF-spouse}
              149) age <= 50; criterion = 0.994, statistic = 19.088
                150)* weights = 2958
              149) age > 50
                151)* weights = 1269
        144) educationnum > 12
          152) maritalstatus == { Married-civ-spouse}; criterion = 0.969, statistic = 20.743
            153)* weights = 11
          152) maritalstatus == { Never-married, Divorced, Separated, Widowed, Married-spouse-absent}
            154)* weights = 565
    135) hoursperweek > 41
      155) educationnum <= 12; criterion = 1, statistic = 103.455
        156) age <= 38; criterion = 1, statistic = 65.904
          157) maritalstatus == { Married-civ-spouse}; criterion = 1, statistic = 41.235
            158)* weights = 22
          157) maritalstatus == { Never-married, Divorced, Separated, Widowed, Married-spouse-absent}
            159) age <= 29; criterion = 1, statistic = 19.805
              160) workclass == { Self-emp-not-inc}; criterion = 0.999, statistic = 27.501
                161)* weights = 50
              160) workclass == { Private, Local-gov, ?, State-gov, Self-emp-inc, Federal-gov}
                162)* weights = 729
            159) age > 29
              163)* weights = 569
        156) age > 38
          164) sex == { Female}; criterion = 1, statistic = 35.112
            165)* weights = 315
          164) sex == { Male}
            166) maritalstatus == { Never-married, Divorced, Married-spouse-absent}; criterion = 0.996, statistic = 22.004
              167)* weights = 340
            166) maritalstatus == { Married-civ-spouse, Separated, Widowed}
              168)* weights = 48
      155) educationnum > 12
        169) educationnum <= 13; criterion = 1, statistic = 28.34
          170) age <= 32; criterion = 1, statistic = 18.878
            171)* weights = 167
          170) age > 32
            172)* weights = 180
        169) educationnum > 13
          173)* weights = 63

Cross-Validation

Apply the tree to test set

## Making sure the levels of categorical variables match between training and test datasets
outcome <- levels(class)
levels(test.data$class) <- outcome
test.data$sex <- factor(test.data$sex, levels(sex))
test.data$workclass <- factor(test.data$workclass, levels(workclass))
test.data$maritalstatus <- factor(test.data$maritalstatus, levels(maritalstatus))
test.data$occupation <- factor(test.data$occupation, levels(occupation))
test.data$relationship <- factor(test.data$relationship, levels(relationship))
## Prediction of test data
pred <- predict(tree1, test.data)
N.test <- length(pred)

Confusion Table

Calculate true positive (\( TP \)), false positive (\( FP \)), false negative (\( FN \)), and true negative (\( TN \)), and evaluate accuracy:

\[ accuracy = \frac{TP+TN}{TP+FP+FN+TN} \]

confusion.table <- table(pred, test.data$class)
colnames(confusion.table) <- c(paste("Actual",outcome[1]), outcome[2])
rownames(confusion.table) <- c(paste("Predicted",outcome[1]), outcome[2])
print(xtable(confusion.table, caption='Confusion Table for Decision Tree'), caption.placement = 'top', type="html", html.table.attributes='border=1 align="center" bgcolor="pink"')
Confusion Table for Decision Tree
Actual <=50K >50K
Predicted <=50K 11571 1814
>50K 863 2032
accuracy <- sum(diag(confusion.table)) / N.test
cat("accuracy =", accuracy*100, "%<br>missclassification =", 100-accuracy*100, "%\n")

accuracy = 83.56 %
missclassification = 16.44 %