rm(list=ls(all=TRUE)) ## Clear memory
library(RCurl) ## package for downloading URLs
library(XML) ## package for reading HTML files
library(xtable) ## package for printing HTML tables
library(corrplot) ## package for plotting correlations
library(prettyR) ## package for cross-tabulation
library(ggplot2) ## package for producing graphical plots
library(gridExtra) ## package for drawing multiple graphs on a grid
This function looks up the Index file in a given URL and downloads the content of URL using RCurl package.
download.files <- function(url) {
## Get the list of files in URL
html <- getURLContent(url) ## Get the Index file
html.content <- htmlParse(html) ## Get the html content
html.table <- readHTMLTable(content) ## Read the html table
files <- html.table[[1]]$Name ## Get file names
files <- files[!is.na(files)] ## Remove NA
files <- as.character(files[3:length(files)]) ## Ignore the first two entries: "Parent Directory" and "Index"
print(files)
## Download files and save into local directory
for (f in files){
fname <- file.path(url,f)
content <- getURLContent(fname) ## Get the file content
write(content,f)
}
}
If files are already downloaded then import the saved data, otherwise read data from files:
## Check if files are already downloaded
if (file.exists("train.test.RData")) {
load("train.test.RData")
} else {
url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/"
download.files(url)
## Get training and test data
training.file <- list.files(pattern=".data")
training.data <- as.data.frame(read.csv(training.file))
test.file <- list.files(pattern=".test")
test.data <- as.data.frame(read.csv(test.file, skip=1))
## Get the names of variables
description.file <- list.files(pattern=".names")[-1]
content <- readLines(description.file)
i <- grep("Attribute Information",content) + 2
var.names <- NULL
while(content[i]!="") {
j <- gregexpr(":", content[i])[[1]][1]
var.names <- c(var.names, substr(content[i],1,j-1))
i <- i + 1
}
print(var.names)
names(training.data) <- gsub("-","",var.names)
names(test.data) <- gsub("-","",var.names)
save(training.data, test.data, var.names, file="train.test.RData")
}
N.obs <- dim(training.data)[1] ## Number of observations
N.var <- dim(training.data)[2] ## Number of variables
cat("Number of observations:",N.obs,"
Number of variables:",N.var,"\n")
Number of observations: 32560
Number of variables: 15
print(xtable(head(training.data), caption='Training Data'), caption.placement = 'top', type="html",
html.table.attributes='border=1 align="center" bgcolor="#FFCC00"')
age | workclass | fnlwgt | education | educationnum | maritalstatus | occupation | relationship | race | sex | capitalgain | capitalloss | hoursperweek | nativecountry | class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
5 | 37 | Private | 284582 | Masters | 14 | Married-civ-spouse | Exec-managerial | Wife | White | Female | 0 | 0 | 40 | United-States | <=50K |
6 | 49 | Private | 160187 | 9th | 5 | Married-spouse-absent | Other-service | Not-in-family | Black | Female | 0 | 0 | 16 | Jamaica | <=50K |
cat("Number of missing data in training set:", sum(is.na(training.data)), "\n")
Number of missing data in training set: 0
print(xtable(summary(training.data), caption='Summary'), caption.placement = 'top', type="html", html.table.attributes='border=1 align="center" bgcolor="green"')
age | workclass | fnlwgt | education | educationnum | maritalstatus | occupation | relationship | race | sex | capitalgain | capitalloss | hoursperweek | nativecountry | class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | Min. :17.0 | Private :22696 | Min. : 12285 | HS-grad :10501 | Min. : 1.0 | Married-civ-spouse :14976 | Prof-specialty :4140 | Husband :13193 | White :27815 | Male :21789 | Min. : 0 | Min. : 0 | Min. : 1.0 | United-States:29169 | <=50K:24719 |
2 | 1st Qu.:28.0 | Self-emp-not-inc: 2541 | 1st Qu.: 117832 | Some-college: 7291 | 1st Qu.: 9.0 | Never-married :10682 | Craft-repair :4099 | Not-in-family : 8304 | Black : 3124 | Female:10771 | 1st Qu.: 0 | 1st Qu.: 0 | 1st Qu.:40.0 | Mexico : 643 | >50K : 7841 |
3 | Median :37.0 | Local-gov : 2093 | Median : 178363 | Bachelors : 5354 | Median :10.0 | Divorced : 4443 | Exec-managerial:4066 | Own-child : 5068 | Asian-Pac-Islander: 1039 | Median : 0 | Median : 0 | Median :40.0 | ? : 583 | ||
4 | Mean :38.6 | ? : 1836 | Mean : 189782 | Masters : 1723 | Mean :10.1 | Separated : 1025 | Adm-clerical :3769 | Unmarried : 3446 | Amer-Indian-Eskimo: 311 | Mean : 1078 | Mean : 87 | Mean :40.4 | Philippines : 198 | ||
5 | 3rd Qu.:48.0 | State-gov : 1297 | 3rd Qu.: 237054 | Assoc-voc : 1382 | 3rd Qu.:12.0 | Widowed : 993 | Sales :3650 | Wife : 1568 | Other : 271 | 3rd Qu.: 0 | 3rd Qu.: 0 | 3rd Qu.:45.0 | Germany : 137 | ||
6 | Max. :90.0 | Self-emp-inc : 1116 | Max. :1484705 | 11th : 1175 | Max. :16.0 | Married-spouse-absent: 418 | Other-service :3295 | Other-relative: 981 | Max. :99999 | Max. :4356 | Max. :99.0 | Canada : 121 | |||
7 | (Other) : 981 | (Other) : 5134 | Married-AF-spouse : 23 | (Other) :9541 | (Other) : 1709 |
categ.sort <- function(x){reorder(x,x,function(y){-length(y)})} ## Sorting function for categorical variables
categ.var <- which(sapply(training.data, is.factor)) ## Find the categorical variables
for (c in categ.var){ ## Apply the sort function on each categorical variable
training.data[,c] <- categ.sort(training.data[,c])
}
attach(training.data)
## Function for displaying histograms using ggplot2
p1 <- ggplot(training.data, aes(x=age)) + ggtitle("Age") +
geom_histogram(aes(y = 100*(..count..)/sum(..count..)), binwidth=5, colour="black", fill="white") + ylab("Percentage")
p2 <- ggplot(training.data, aes(x=log10(fnlwgt))) + ggtitle("log( Weight )") +
geom_histogram(aes(y = 100*(..count..)/sum(..count..)), colour="black", fill="white") + ylab("Percentage")
p3 <- ggplot(training.data, aes(x=educationnum)) + ggtitle("Years of Education") +
geom_histogram(aes(y = 100*(..count..)/sum(..count..)), binwidth=1, colour="black", fill="white") + ylab("Percentage")
p4 <- ggplot(training.data, aes(x=hoursperweek)) + ggtitle("Hours per Week") +
geom_histogram(aes(y = 100*(..count..)/sum(..count..)), colour="black", fill="white") + ylab("Percentage")
p5 <- ggplot(training.data, aes(x=log10(capitalgain+1))) + ggtitle("log( Capital Gain )") +
geom_histogram(aes(y = 100*(..count..)/sum(..count..)), colour="black", fill="white") + ylab("Percentage") +
annotate("text", x = 3, y = 50, label = "X", colour="red", size=30, fontface="bold")
p6 <- ggplot(training.data, aes(x=log10(capitalloss+1))) + ggtitle("log( Capital Loss )") +
geom_histogram(aes(y = 100*(..count..)/sum(..count..)), colour="black", fill="white") + ylab("Percentage") +
annotate("text", x = 2, y = 50, label = "X", colour="red", size=30, fontface="bold")
grid.arrange(p1, p2, p3, p4, p5, p6, ncol=3)
cat("% of data with zero Capital Gain:", sum(capitalgain==0)/N.obs*100,
"<br>% of data with zero Capital Loss:", sum(capitalloss==0)/N.obs*100)
% of data with zero Capital Gain: 91.67
% of data with zero Capital Loss: 95.33
Feature Analysis: Age
, log(Weight)
, Years of Education
, and Hours per Week
have broad distributions, therefore will be considered for regression analysis.
Capital Gain
and Capital Loss
, however, have very narrow distributions (more than 90% of data are clustered at zero) therefore they will be excluded from the model.
p1 <- ggplot(training.data, aes(x=workclass)) + ggtitle("Work Class") + xlab("Work Class") +
geom_bar(aes(y = 100*(..count..)/sum(..count..))) + ylab("Percentage") + coord_flip() +
scale_x_discrete(limits = rev(levels(workclass)))
p2 <- ggplot(training.data, aes(x=education)) + ggtitle("Education") + xlab("Education") +
geom_bar(aes(y = 100*(..count..)/sum(..count..))) + ylab("Percentage") + coord_flip() +
scale_x_discrete(limits = rev(levels(education)))
p3 <- ggplot(training.data, aes(x=occupation)) + ggtitle("Occupation") + xlab("Occupation") +
geom_bar(aes(y = 100*(..count..)/sum(..count..))) + ylab("Percentage") + coord_flip() +
scale_x_discrete(limits = rev(levels(occupation)))
p4 <- ggplot(training.data, aes(x=nativecountry)) + ggtitle("Native Country") + xlab("Native Country") +
geom_bar(aes(y = 100*(..count..)/sum(..count..))) + ylab("Percentage") + coord_flip() +
scale_x_discrete(limits = rev(levels(nativecountry))) +
annotate("text", x = 21, y = 50, label = "X", colour="red", size=30, fontface="bold")
grid.arrange(p1, p2, p3, p4, ncol=2)
Native Country
has a very narrow distribution (90% of population coming from the United States), therefore will be excluded from the model.
p1 <- ggplot(training.data, aes(x=factor(1), fill=maritalstatus)) +
geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 1) + coord_polar(theta="y") +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), legend.title=element_blank()) +
xlab("") + ylab("") + ggtitle("Marital Status")
p2 <- ggplot(training.data, aes(x=factor(1), fill=relationship)) +
geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 1) + coord_polar(theta="y") +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), legend.title=element_blank()) +
xlab("") + ylab("") + ggtitle("Relationship")
p3 <- ggplot(training.data, aes(x=factor(1), fill=race)) +
geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 1) + coord_polar(theta="y") +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), legend.title=element_blank()) +
xlab("") + ylab("") + ggtitle("Race")
p4 <- ggplot(training.data, aes(x=factor(1), fill=sex)) +
geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 1) + coord_polar(theta="y") +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), legend.title=element_blank()) +
xlab("") + ylab("") + ggtitle("Sex")
grid.arrange(p1, p2, p3, p4, ncol=2)
All these variables have reasonable spread of distribution, therefore will be considered.
numeric.var <- sapply(training.data, is.numeric) ## Find numerical variables
corr.matrix <- cor(training.data[,numeric.var]) ## Calculate the correlation matrix
corrplot(corr.matrix, main="\n\nCorrelation Plot for Numerical Variables") ## Correlation plot
par(mar=c(5.1,4.1,4.1,2.1)) ## Restore plot margins
The numerical variables are nearly uncorrelated.
par(mfrow=c(2,2)) ## Arrange plots in a 4x4 grid
boxplot(age~class, main="Age vs. Income Class",
xlab="Income Class", ylab="Age")
boxplot(educationnum~class, main="Years of Eduction vs. Income Class",
xlab="Income Class", ylab="Years of Eduction")
boxplot(log(fnlwgt)~class, main="log(Weight) vs. Income Class",
xlab="Income Class", ylab="log(Weight)")
text(0.4, 12, "X", cex=8, font=2, col="red", xpd=TRUE)
boxplot(hoursperweek~class, main="Hours per Week vs. Income Class",
xlab="Income Class", ylab="Hours per Week")
par(mfrow=c(1,1))
Weight
shows little variation with Income Class
therefore it will be excluded from the model. Other variables have significant correlation with Income
hence will be selected.
Education
and Years of Education
exhibit perfect correlation (zero standard deviation within each category), therefore the categorical variable Education
will be excluded from the model.
boxplot(educationnum~education, main="Years of Eduction\nvs. Education",
xlab="Education", ylab="Years of Eduction")
text(8.5, 0, "X", cex=8, font=2, col="red", xpd=TRUE)
AOV <- aov(educationnum~education)
print(xtable(summary(AOV), caption='Analysis of Variance'), caption.placement='top', type="html",
html.table.attributes='border=1 align="center" bgcolor="lightgray"')
Df | Sum Sq | Mean Sq | F value | Pr(>F) | |
---|---|---|---|---|---|
education | 15 | 215502.53 | 14366.84 | 54063530208304329392208800202220400.00 | 0.0000 |
Residuals | 32544 | 0.00 | 0.00 |
par(mfrow=c(3,2))
par(las=1) ## horizontal axis labels
plot(table(class, workclass), main="Work Class vs. Income Class", cex=1.5)
plot(table(class, maritalstatus), main="Marital Status vs. Income Class", cex=1.5)
plot(table(class, occupation), main="Occupation vs. Income Class", cex=1.5)
plot(table(class, relationship), main="Relationship vs. Income Class", cex=1.5)
plot(table(class, race), main="Race vs. Income Class", cex=1.5)
plot(table(class, sex), main="Sex vs. Income Class", cex=1.5)
par(las=0) ## parallel axis labels
Work Class
and Race
show weak correlation with Income Class
, nevertheless all these variables are selected.
Use the selected features in the glm model:
## Create a binary response variable
y <- rep(0, N.obs)
y[class==levels(class)[2]] <- 1
## GLM fit
fit <- glm(y ~ age + educationnum + hoursperweek + workclass + maritalstatus + occupation + relationship + race + sex, family=binomial("logit"))
save(fit, file="fit.RData")
## Get the coefficients
tab <- summary(fit)$coefficients
## Sort by p-value
sorter <- order(tab[,4])
tab <- tab[sorter,]
## Display the result
print(xtable(tab, digits=3, caption='GLM coefficients<br>(sorted in ascending order of p-value)'), caption.placement='top', type="html", html.table.attributes='border=1 align="center" bgcolor="lightgray"')
Estimate | Std. Error | z value | Pr(>|z|) | |
---|---|---|---|---|
educationnum | 0.296 | 0.009 | 34.138 | 0.000 |
(Intercept) | -5.470 | 0.160 | -34.100 | 0.000 |
hoursperweek | 0.031 | 0.002 | 20.073 | 0.000 |
age | 0.030 | 0.002 | 19.281 | 0.000 |
relationship Wife | 1.360 | 0.095 | 14.288 | 0.000 |
occupation Other-service | -1.510 | 0.107 | -14.085 | 0.000 |
occupation Farming-fishing | -1.546 | 0.124 | -12.426 | 0.000 |
sex Female | -0.856 | 0.072 | -11.931 | 0.000 |
workclass ? | -1.237 | 0.105 | -11.819 | 0.000 |
maritalstatus Never-married | -2.511 | 0.248 | -10.105 | 0.000 |
occupation Handlers-cleaners | -1.291 | 0.132 | -9.765 | 0.000 |
occupation Machine-op-inspct | -0.896 | 0.092 | -9.741 | 0.000 |
maritalstatus Divorced | -2.081 | 0.254 | -8.194 | 0.000 |
occupation Transport-moving | -0.708 | 0.088 | -8.068 | 0.000 |
occupation Adm-clerical | -0.587 | 0.075 | -7.846 | 0.000 |
maritalstatus Separated | -2.206 | 0.284 | -7.772 | 0.000 |
occupation Craft-repair | -0.514 | 0.067 | -7.683 | 0.000 |
workclass Self-emp-not-inc | -0.454 | 0.060 | -7.615 | 0.000 |
maritalstatus Widowed | -1.968 | 0.281 | -7.006 | 0.000 |
maritalstatus Married-spouse-absent | -2.168 | 0.316 | -6.857 | 0.000 |
workclass Federal-gov | 0.446 | 0.088 | 5.054 | 0.000 |
occupation Sales | -0.294 | 0.066 | -4.432 | 0.000 |
workclass State-gov | -0.372 | 0.086 | -4.318 | 0.000 |
occupation Exec-managerial | 0.223 | 0.059 | 3.800 | 0.000 |
workclass Self-emp-inc | 0.241 | 0.079 | 3.058 | 0.002 |
occupation Priv-house-serv | -3.279 | 1.126 | -2.912 | 0.004 |
workclass Local-gov | -0.195 | 0.069 | -2.843 | 0.004 |
relationship Own-child | -0.671 | 0.252 | -2.664 | 0.008 |
race Other | -0.669 | 0.252 | -2.657 | 0.008 |
race Amer-Indian-Eskimo | -0.504 | 0.208 | -2.425 | 0.015 |
relationship Not-in-family | 0.565 | 0.251 | 2.248 | 0.025 |
race Asian-Pac-Islander | -0.165 | 0.096 | -1.722 | 0.085 |
race Black | -0.119 | 0.070 | -1.705 | 0.088 |
relationship Other-relative | -0.351 | 0.231 | -1.521 | 0.128 |
relationship Unmarried | 0.357 | 0.265 | 1.344 | 0.179 |
occupation Armed-Forces | -1.418 | 1.260 | -1.125 | 0.260 |
maritalstatus Married-AF-spouse | 0.370 | 0.481 | 0.769 | 0.442 |
occupation Protective-serv | -0.055 | 0.112 | -0.495 | 0.621 |
occupation Tech-support | 0.019 | 0.098 | 0.198 | 0.843 |
workclass Without-pay | -11.968 | 118.027 | -0.101 | 0.919 |
workclass Never-worked | -10.953 | 162.917 | -0.067 | 0.946 |
The top three most-relevant features include Years of Education
, Hours per Week
, and Age
, all of which are numeric variables.
At the bottom of the table, instances of Work Class
, Occupation
, Relationship
, and Marital Status
are observed. However these variables are seen all across the table, therefore cannot be eliminated from the model.
From the remaining two categorical variables, Sex
appears near the top of the table and is highly ranked, however Race
categories are ranked near the bottom of the table and therefore can be removed to avoid model overfitting.
## Prediction of test data
pred <- predict(fit, test.data, type="response")
N.test <- length(pred)
## Use a threshold of 0.5 to assign the predicted classes
y.hat <- rep(0, N.test)
y.hat[pred>=0.5] <- 1
Calculate true positive (\( TP \)), false positive (\( FP \)), false negative (\( FN \)), and true negative (\( TN \)), and evaluate accuracy:
## Get the true outcome of the test data
outcome <- levels(test.data$class)
y.test <- rep(0, N.test)
y.test[test.data$class==outcome[2]] <- 1
## Tabulate the true outcome vs. prediction
confusion.table <- table(y.hat, y.test)
colnames(confusion.table) <- c(paste("Actual",outcome[1]), outcome[2])
rownames(confusion.table) <- c(paste("Predicted",outcome[1]), outcome[2])
print(xtable(confusion.table, caption='Confusion Table for Logistic Regression'), caption.placement = 'top', type="html", html.table.attributes='border=1 align="center" bgcolor="pink"')
Actual <=50K | >50K | |
---|---|---|
Predicted <=50K | 11452 | 1705 |
>50K | 982 | 2141 |
accuracy <- sum(diag(confusion.table)) / N.test
cat("accuracy =", accuracy*100, "%<br>missclassification =", 100-accuracy*100, "%\n")
accuracy = 83.5 %
missclassification = 16.5 %
library(party)
tree1 <- ctree(class ~ educationnum + hoursperweek + age + sex + workclass + maritalstatus + occupation + relationship, data=training.data)
print.tree(tree1)
Conditional inference tree with 87 terminal nodes
Response: class
Inputs: educationnum, hoursperweek, age, sex, workclass, maritalstatus, occupation, relationship
Number of observations: 32560
1) relationship == { Husband, Wife}; criterion = 1, statistic = 6698.472
2) educationnum <= 12; criterion = 1, statistic = 2346.306
3) occupation == { Prof-specialty, Exec-managerial, Adm-clerical, Sales, Tech-support, Protective-serv}; criterion = 1, statistic = 755.73
4) educationnum <= 9; criterion = 1, statistic = 95.657
5) hoursperweek <= 34; criterion = 1, statistic = 41.951
6) workclass == { Self-emp-inc, Federal-gov}; criterion = 0.972, statistic = 19.414
7)* weights = 23
6) workclass == { Private, Self-emp-not-inc, Local-gov, State-gov, Without-pay}
8)* weights = 184
5) hoursperweek > 34
9) educationnum <= 8; criterion = 1, statistic = 31.898
10)* weights = 195
9) educationnum > 8
11) age <= 34; criterion = 1, statistic = 37.574
12) age <= 27; criterion = 1, statistic = 20.729
13) hoursperweek <= 55; criterion = 0.956, statistic = 7.689
14)* weights = 100
13) hoursperweek > 55
15)* weights = 12
12) age > 27
16)* weights = 280
11) age > 34
17) workclass == { Self-emp-not-inc, State-gov}; criterion = 1, statistic = 27.78
18)* weights = 191
17) workclass == { Private, Local-gov, Self-emp-inc, Federal-gov}
19) occupation == { Exec-managerial, Tech-support}; criterion = 0.984, statistic = 18.824
20)* weights = 308
19) occupation == { Prof-specialty, Adm-clerical, Sales, Protective-serv}
21)* weights = 557
4) educationnum > 9
22) age <= 33; criterion = 1, statistic = 51.745
23) age <= 27; criterion = 1, statistic = 17.139
24)* weights = 185
23) age > 27
25)* weights = 338
22) age > 33
26) workclass == { Self-emp-not-inc, State-gov, Without-pay}; criterion = 1, statistic = 45.321
27)* weights = 218
26) workclass == { Private, Local-gov, Self-emp-inc, Federal-gov}
28) hoursperweek <= 34; criterion = 1, statistic = 28.93
29) sex == { Female}; criterion = 0.986, statistic = 12.072
30)* weights = 36
29) sex == { Male}
31)* weights = 64
28) hoursperweek > 34
32) occupation == { Adm-clerical, Sales}; criterion = 0.995, statistic = 21.56
33)* weights = 524
32) occupation == { Prof-specialty, Exec-managerial, Tech-support, Protective-serv}
34)* weights = 821
3) occupation == { Craft-repair, Other-service, Machine-op-inspct, ?, Transport-moving, Handlers-cleaners, Farming-fishing, Priv-house-serv, Armed-Forces}
35) educationnum <= 7; criterion = 1, statistic = 222.588
36) hoursperweek <= 45; criterion = 1, statistic = 30.09
37) hoursperweek <= 38; criterion = 0.964, statistic = 17.103
38)* weights = 216
37) hoursperweek > 38
39)* weights = 860
36) hoursperweek > 45
40)* weights = 228
35) educationnum > 7
41) occupation == { Other-service, ?, Handlers-cleaners, Farming-fishing, Priv-house-serv, Armed-Forces}; criterion = 1, statistic = 116.098
42) age <= 40; criterion = 0.998, statistic = 13.415
43) age <= 26; criterion = 0.973, statistic = 10.186
44)* weights = 132
43) age > 26
45) educationnum <= 10; criterion = 0.97, statistic = 8.859
46)* weights = 512
45) educationnum > 10
47)* weights = 74
42) age > 40
48) hoursperweek <= 34; criterion = 0.98, statistic = 10.071
49) educationnum <= 9; criterion = 0.999, statistic = 15.251
50) age <= 55; criterion = 0.993, statistic = 10.993
51)* weights = 34
50) age > 55
52)* weights = 107
49) educationnum > 9
53)* weights = 71
48) hoursperweek > 34
54) occupation == { Other-service, Farming-fishing, Priv-house-serv}; criterion = 0.953, statistic = 14.465
55)* weights = 370
54) occupation == { ?, Handlers-cleaners}
56)* weights = 243
41) occupation == { Craft-repair, Machine-op-inspct, Transport-moving}
57) age <= 35; criterion = 1, statistic = 93.145
58) hoursperweek <= 47; criterion = 1, statistic = 21.257
59) age <= 24; criterion = 0.997, statistic = 12.568
60)* weights = 99
59) age > 24
61)* weights = 790
58) hoursperweek > 47
62)* weights = 296
57) age > 35
63) hoursperweek <= 35; criterion = 1, statistic = 24.539
64)* weights = 140
63) hoursperweek > 35
65) educationnum <= 9; criterion = 1, statistic = 17.47
66)* weights = 1380
65) educationnum > 9
67)* weights = 741
2) educationnum > 12
68) occupation == { Craft-repair, Adm-clerical, Other-service, Machine-op-inspct, ?, Transport-moving, Handlers-cleaners, Farming-fishing}; criterion = 1, statistic = 289.065
69) hoursperweek <= 41; criterion = 0.997, statistic = 23.655
70)* weights = 502
69) hoursperweek > 41
71) workclass == { Private, Local-gov, ?, State-gov, Self-emp-inc, Federal-gov}; criterion = 0.999, statistic = 28.308
72)* weights = 201
71) workclass == { Self-emp-not-inc}
73)* weights = 53
68) occupation == { Prof-specialty, Exec-managerial, Sales, Tech-support, Protective-serv, Armed-Forces}
74) educationnum <= 13; criterion = 1, statistic = 49.21
75) workclass == { Private, Self-emp-inc, Federal-gov}; criterion = 1, statistic = 36.172
76) age <= 28; criterion = 1, statistic = 19.916
77) age <= 25; criterion = 0.976, statistic = 8.765
78)* weights = 46
77) age > 25
79) sex == { Female}; criterion = 0.986, statistic = 9.79
80)* weights = 22
79) sex == { Male}
81)* weights = 76
76) age > 28
82) occupation == { Prof-specialty, Sales, Tech-support, Protective-serv}; criterion = 0.965, statistic = 15.113
83)* weights = 852
82) occupation == { Exec-managerial}
84)* weights = 651
75) workclass == { Self-emp-not-inc, Local-gov, State-gov}
85)* weights = 490
74) educationnum > 13
86) hoursperweek <= 25; criterion = 1, statistic = 24.662
87) sex == { Female}; criterion = 0.967, statistic = 8.195
88)* weights = 14
87) sex == { Male}
89)* weights = 52
86) hoursperweek > 25
90)* weights = 1473
1) relationship == { Not-in-family, Own-child, Unmarried, Other-relative}
91) occupation == { Prof-specialty, Exec-managerial, Protective-serv}; criterion = 1, statistic = 1146.951
92) educationnum <= 13; criterion = 1, statistic = 201.34
93) hoursperweek <= 44; criterion = 1, statistic = 99.093
94) age <= 34; criterion = 1, statistic = 56.847
95)* weights = 1152
94) age > 34
96) sex == { Female}; criterion = 1, statistic = 30.016
97)* weights = 687
96) sex == { Male}
98)* weights = 353
93) hoursperweek > 44
99) educationnum <= 12; criterion = 1, statistic = 20.214
100) sex == { Male}; criterion = 0.998, statistic = 16.955
101) age <= 42; criterion = 0.972, statistic = 12.414
102)* weights = 182
101) age > 42
103)* weights = 74
100) sex == { Female}
104) occupation == { Prof-specialty, Protective-serv}; criterion = 0.989, statistic = 13.259
105)* weights = 41
104) occupation == { Exec-managerial}
106)* weights = 136
99) educationnum > 12
107) age <= 27; criterion = 1, statistic = 23.707
108)* weights = 96
107) age > 27
109) workclass == { Self-emp-not-inc, Local-gov, State-gov, Self-emp-inc}; criterion = 0.998, statistic = 23.402
110)* weights = 124
109) workclass == { Private, Federal-gov}
111) occupation == { Exec-managerial, Protective-serv}; criterion = 0.991, statistic = 17.141
112)* weights = 146
111) occupation == { Prof-specialty}
113)* weights = 114
92) educationnum > 13
114) educationnum <= 14; criterion = 1, statistic = 46.391
115) occupation == { Prof-specialty, Protective-serv}; criterion = 1, statistic = 48.1
116) age <= 46; criterion = 0.954, statistic = 11.049
117) workclass == { Private, Self-emp-not-inc, Self-emp-inc}; criterion = 0.983, statistic = 18.722
118)* weights = 125
117) workclass == { Local-gov, State-gov, Federal-gov}
119)* weights = 138
116) age > 46
120)* weights = 145
115) occupation == { Exec-managerial}
121) hoursperweek <= 43; criterion = 0.999, statistic = 14.055
122)* weights = 79
121) hoursperweek > 43
123) age <= 29; criterion = 0.963, statistic = 8.005
124)* weights = 7
123) age > 29
125)* weights = 80
114) educationnum > 14
126) age <= 32; criterion = 0.988, statistic = 17.017
127) maritalstatus == { Never-married, Married-spouse-absent}; criterion = 0.962, statistic = 10.668
128)* weights = 61
127) maritalstatus == { Divorced}
129)* weights = 9
126) age > 32
130) sex == { Female}; criterion = 0.994, statistic = 17.268
131) maritalstatus == { Divorced, Separated, Widowed, Married-spouse-absent}; criterion = 0.968, statistic = 15.369
132)* weights = 46
131) maritalstatus == { Never-married}
133)* weights = 44
130) sex == { Male}
134)* weights = 99
91) occupation == { Craft-repair, Adm-clerical, Sales, Other-service, Machine-op-inspct, ?, Transport-moving, Handlers-cleaners, Farming-fishing, Tech-support, Priv-house-serv, Armed-Forces}
135) hoursperweek <= 41; criterion = 1, statistic = 268.244
136) age <= 28; criterion = 1, statistic = 141.138
137) maritalstatus == { Married-civ-spouse, Separated}; criterion = 1, statistic = 44.013
138)* weights = 209
137) maritalstatus == { Never-married, Divorced, Widowed, Married-spouse-absent}
139) age <= 21; criterion = 0.993, statistic = 11.182
140)* weights = 2627
139) age > 21
141) sex == { Female}; criterion = 0.951, statistic = 7.467
142)* weights = 1225
141) sex == { Male}
143)* weights = 1477
136) age > 28
144) educationnum <= 12; criterion = 1, statistic = 95.835
145) occupation == { Craft-repair, Farming-fishing, Tech-support}; criterion = 1, statistic = 68.331
146)* weights = 986
145) occupation == { Adm-clerical, Sales, Other-service, Machine-op-inspct, ?, Transport-moving, Handlers-cleaners, Priv-house-serv}
147) maritalstatus == { Married-civ-spouse}; criterion = 1, statistic = 35.81
148)* weights = 51
147) maritalstatus == { Never-married, Divorced, Separated, Widowed, Married-spouse-absent, Married-AF-spouse}
149) age <= 50; criterion = 0.994, statistic = 19.088
150)* weights = 2958
149) age > 50
151)* weights = 1269
144) educationnum > 12
152) maritalstatus == { Married-civ-spouse}; criterion = 0.969, statistic = 20.743
153)* weights = 11
152) maritalstatus == { Never-married, Divorced, Separated, Widowed, Married-spouse-absent}
154)* weights = 565
135) hoursperweek > 41
155) educationnum <= 12; criterion = 1, statistic = 103.455
156) age <= 38; criterion = 1, statistic = 65.904
157) maritalstatus == { Married-civ-spouse}; criterion = 1, statistic = 41.235
158)* weights = 22
157) maritalstatus == { Never-married, Divorced, Separated, Widowed, Married-spouse-absent}
159) age <= 29; criterion = 1, statistic = 19.805
160) workclass == { Self-emp-not-inc}; criterion = 0.999, statistic = 27.501
161)* weights = 50
160) workclass == { Private, Local-gov, ?, State-gov, Self-emp-inc, Federal-gov}
162)* weights = 729
159) age > 29
163)* weights = 569
156) age > 38
164) sex == { Female}; criterion = 1, statistic = 35.112
165)* weights = 315
164) sex == { Male}
166) maritalstatus == { Never-married, Divorced, Married-spouse-absent}; criterion = 0.996, statistic = 22.004
167)* weights = 340
166) maritalstatus == { Married-civ-spouse, Separated, Widowed}
168)* weights = 48
155) educationnum > 12
169) educationnum <= 13; criterion = 1, statistic = 28.34
170) age <= 32; criterion = 1, statistic = 18.878
171)* weights = 167
170) age > 32
172)* weights = 180
169) educationnum > 13
173)* weights = 63
## Making sure the levels of categorical variables match between training and test datasets
outcome <- levels(class)
levels(test.data$class) <- outcome
test.data$sex <- factor(test.data$sex, levels(sex))
test.data$workclass <- factor(test.data$workclass, levels(workclass))
test.data$maritalstatus <- factor(test.data$maritalstatus, levels(maritalstatus))
test.data$occupation <- factor(test.data$occupation, levels(occupation))
test.data$relationship <- factor(test.data$relationship, levels(relationship))
## Prediction of test data
pred <- predict(tree1, test.data)
N.test <- length(pred)
Calculate true positive (\( TP \)), false positive (\( FP \)), false negative (\( FN \)), and true negative (\( TN \)), and evaluate accuracy:
confusion.table <- table(pred, test.data$class)
colnames(confusion.table) <- c(paste("Actual",outcome[1]), outcome[2])
rownames(confusion.table) <- c(paste("Predicted",outcome[1]), outcome[2])
print(xtable(confusion.table, caption='Confusion Table for Decision Tree'), caption.placement = 'top', type="html", html.table.attributes='border=1 align="center" bgcolor="pink"')
Actual <=50K | >50K | |
---|---|---|
Predicted <=50K | 11571 | 1814 |
>50K | 863 | 2032 |
accuracy <- sum(diag(confusion.table)) / N.test
cat("accuracy =", accuracy*100, "%<br>missclassification =", 100-accuracy*100, "%\n")
accuracy = 83.56 %
missclassification = 16.44 %