Machine Learning: Spam Classification

A supervised learning project for spam classification utilizing Naive Bayes (reference: "Doing Data Science").
Data downloaded from Enron's email database. Emails are labeled as 'spam' or 'ham'.
Data mining is performed using the tm package.

Initialization

rm(list=ls(all=TRUE))  ## Clear memory
library(tm)            ## load tm package

Read data and clean up

This function creates a corpus containing the spam/ham emails using the tm package.
Punctuations, stopwords, and numbers are removed, and the result is saved in an .RData file:

read.files <- function(label) {
  if (!label %in% c("spam","ham")) stop("label must be either 'spam' or 'ham'")
    path <- paste0("./",label,"/")
    ## Create corpus
    myCorpus <- Corpus(DirSource(path))
    ## Convert to lower case
    myCorpus <- tm_map(myCorpus, tolower)
    ## Remove punctuation
    myCorpus <- tm_map(myCorpus, removePunctuation)
    ## Remove stopwords
    myCorpus2 <- tm_map(myCorpus, removeWords, stopwords('english'))
    ## Remove numbers
    myCorpus <- tm_map(myCorpus, removeNumbers)
    ## Save the result
    save(myCorpus, file=paste0(label,".corpus.RData"))
}

Split data into training and test sets

This function randomly splits the spam and ham corpora into training and test sets.
The words in each corpus are stored in a TextDocumentMatrix (TDM) object.
Result is saved as train.test.RData:

train.test <- function(training=0.9){
  if (training>=1 | training<=0) stop("training must be between 0 and 1")
    ## Randomly split the spam emails
    load(paste0("spam.corpus.RData"))
    N <- length(myCorpus)
    Ntest <- N * (1-training)
    test.set <- sample(1:N, Ntest, replace=F)
    training.set <- setdiff(1:N, test.set)
    spam.test <- myCorpus[test.set]
    spam.training <- myCorpus[training.set]
    ## Create TermDocumentMatrix for spam training
    spam.TDM <- TermDocumentMatrix(spam.training, control=(list(wordLengths=c(1,Inf))))
    ## Randomly split the ham emails
    load(paste0("ham.corpus.RData"))
    N <- length(myCorpus)
    Ntest <- N * (1-training)
    test.set <- sample(1:N, Ntest, replace=F)
    training.set <- setdiff(1:N, test.set)
    ham.test <- myCorpus[test.set]
    ham.training <- myCorpus[training.set]
    ## Create TermDocumentMatrix for ham training
    ham.TDM <- TermDocumentMatrix(ham.training, control=(list(wordLengths=c(1,Inf))))
    ## Create TermDocumentMatrix for test, combining spam and ham
    test.TDM <- TermDocumentMatrix(c(spam.test,ham.test), control=(list(wordLengths=c(1,Inf))))
    ## Store the true labels (1 = spam; 0 = ham)
    test.true.labels <- c(rep(1,length(spam.test)), rep(0,length(ham.test)))
    ## Save the result
    save(spam.training, spam.test, spam.TDM,
        ham.training, ham.test, ham.TDM,
        test.TDM, test.true.labels, file="train.test.RData")
}

Calculate word frequencies

This function calculates word frequencies in a TextDocumentMatrix.
The result is sorted either alphabetically or by the decreasing order of frequencies (default):

word.freq <- function(TDM, sortBy='freq'){
  ## TDM is a "simple_triplet_matrix" object from package "slam",
    ## So use "slam::row_sums" to sum frequencies across the rows
    freq <- slam::row_sums(TDM)
    if (sortBy=='freq') freq <- sort(freq,decreasing=T)
    return(freq)
}

Analyzing Training Set

## Read the data
if (!file.exists("spam.corpus.RData")) read.files("spam")
if (!file.exists("ham.corpus.RData")) read.files("ham")

## Create training/test sets
if (!file.exists("train.test.RData")) train.test(0.9)
load("train.test.RData")

## Analyze spam words
spam.TDM <- weightBin(spam.TDM)  ## binary weight; ignoring word repetition in single email
spam.N <- dim(spam.TDM)[2]
spam.word.freq <- word.freq(spam.TDM)
spam.words <- names(spam.word.freq)

## Analyze ham words
ham.TDM <- weightBin(ham.TDM)
ham.N <- dim(ham.TDM)[2]
ham.word.freq <- word.freq(ham.TDM)
ham.words <- names(ham.word.freq)

## Print the number of words in each set
cat(spam.N,"spam emails in the training set\n",
    ham.N,"ham emails in the training set\n")

1351 spam emails in the training set
 3305 ham emails in the training set

Generate a spam filter using the training dataset

Find words that appear only in spam emails. Store these words as spam.filter1:

common <- spam.words %in% ham.words
spam.filter1 <- spam.words[!common]
cat(length(spam.filter1),"words appear only in spam -> added to spam filter\n",
    sum(common),"words appear in both spam and ham\n")

27801 words appear only in spam -> added to spam filter
 7083 words appear in both spam and ham

Naive Bayes

Calculate the spam probability for the common words using the Bayes rule:

\[ P(spam|word) = \frac{P(word|spam) \times P(spam)}{P(word)} \]

\[ P(word) = P(word|spam) \times P(spam) + P(word|ham) \times P(ham) \]

## Calculate probabilities
p.spam <- spam.N/(spam.N+ham.N)  ## P(spam)
p.ham  <- 1 - p.spam             ## P(ham)
common.words <- spam.words[common]
p.word.spam <- spam.word.freq[common]/spam.N     ## P(word|spam)
p.word.ham <- ham.word.freq[common.words]/ham.N  ## P(word|ham)
p.word <- p.word.spam*p.spam + p.word.ham*p.ham  ## P(word)
p.spam.word <- p.word.spam*p.spam / p.word       ## P(spam|word)

## Create a dataframe
spam.prob <- data.frame(p.word,p.spam.word)
colnames(spam.prob) <- c("p(word)","p(spam|word)")

Display the training result, sorted by \( P(word) \):

library(xtable)
sorter <- order(spam.prob[,1],decreasing=T)
spam.prob <- spam.prob[sorter,]
print(xtable(head(spam.prob), caption='Spam Probability<br>sorted by $P(word)$'), caption.placement = 'top', type="html", html.table.attributes='border=1 align="center" bgcolor="#FFCC00"')

Spam Probability
sorted by \( P(word) \)
	p(word)	p(spam\|word)
subject	1.00	0.29
please	0.36	0.17
will	0.32	0.20
s	0.28	0.35
thanks	0.28	0.06
cc	0.25	0.01

Display the training result, sorted by \( P(spam|word) \):

sorter <- order(spam.prob[,2],decreasing=T)
spam.prob <- spam.prob[sorter,]
print(xtable(head(spam.prob), caption='Spam Probability<br>sorted by $P(spam|word)$'), caption.placement = 'top', type="html", html.table.attributes='border=1 align="center" bgcolor="#FFCC00"')

Spam Probability
sorted by \( P(spam|word) \)
	p(word)	p(spam\|word)
pain	0.02	0.99
spam	0.01	0.98
differ	0.01	0.98
weight	0.01	0.98
adobe	0.01	0.98
creative	0.01	0.98

Spam probability threshold

Set a probability threshold for assigning spam words. Store these words as spam.filter2:

thld <- 0.90
spam.filter2 <- names(p.spam.word[p.spam.word>=thld])
spam.filter <- c(spam.filter1,spam.filter2)
cat("Spam probability threshold:",thld,"\n",
    length(spam.filter2),"new words added to spam filter\n")

Spam probability threshold: 0.9 
 208 new words added to spam filter

Analyzing Test Set

Classify emails using the spam filter:

test.N      <- test.TDM$ncol               ## number of test emails
test.spam   <- which(test.true.labels==1)  ## spam emails in test
test.spam.N <- length(test.spam)           ## number of spam emails in test
test.ham    <- which(test.true.labels==0)  ## ham emails in test
test.ham.N  <- length(test.ham)            ## number of ham emails in test
test.words  <- test.TDM$dimnames$Terms     ## number of words in test
prediction  <- array()
for (j in 1:test.N){
  label <- 0  ## default label (0 = ham)
  word.list <- test.words[test.TDM$i[test.TDM$j==j]] ## list of words in current test email
    if (any(word.list %in% spam.filter))  label <- 1     ## label = spam
    prediction[j] <- label
}

Confusion Table

Calculate true positive (\( TP \)), false positive (\( FP \)), false negative (\( FN \)), and true negative (\( TN \)), and evaluate performance metrics:

\[ accuracy = \frac{TP+TN}{TP+FP+FN+TN} \]

\[ precision = \frac{TP}{TP+FP} \]

\[ sensitivity = \frac{TP}{TP+FN} \]

\[ specificity = \frac{TN}{TN+FP} \]

\[ F_{score} = 2\frac{precision \times sensitivity}{precision + sensitivity} \]

TP <- sum(prediction[test.spam]==1)
FN <- sum(prediction[test.spam]==0)
FP <- sum(prediction[test.ham]==1)
TN <- sum(prediction[test.ham]==0)
confusion.table <- rbind(c(TP,FP),c(FN,TN))
colnames(confusion.table) <- c("Actual<br>spam","ham")
rownames(confusion.table) <- c("Predicted spam","ham")
accuracy    <- (TP + TN)/(TP + FP + FN + TN)
precision   <- TP/(TP + FP)
sensitivity <- TP/(TP + FN)
specificity <- TN/(TN + FP)
Fscore      <- 2 * precision * sensitivity / (precision + sensitivity)
print(xtable(confusion.table, caption='Confusion Table'), caption.placement = 'top', type="html", html.table.attributes='border=1 align="center" bgcolor="#FFCC00"'); cat("accuracy:",accuracy,"\n"); cat("precision:",precision,"\n"); cat("sensitivity:",sensitivity,"\n"); cat("specificity:",specificity,"\n"); cat("Fscore:",Fscore,"\n");

Confusion Table
	Actual spam	ham
Predicted spam	140	78
ham	9	289

accuracy: 0.8314 precision: 0.6422 sensitivity: 0.9396 specificity: 0.7875 Fscore: 0.7629

Hyperparameter Optimization

Try improving performance by increasing the number of matched words:

prediction <- array()
for (k in 2:10) {
  for (j in 1:test.N){
    label <- 0
        word.list <- test.words[test.TDM$i[test.TDM$j==j]]
        if (sum(word.list %in% spam.filter)>=k)  label <- 1
        prediction[j] <- label
    }
    TP[k] <- sum(prediction[test.spam]==1)
  FN[k] <- sum(prediction[test.spam]==0)
  FP[k] <- sum(prediction[test.ham]==1)
    TN[k] <- sum(prediction[test.ham]==0)
}
accuracy    <- (TP + TN)/(TP + FP + FN + TN)
precision   <- TP/(TP + FP)
sensitivity <- TP/(TP + FN)
specificity <- TN/(TN + FP)
Fscore      <- 2 * precision * sensitivity / (precision + sensitivity)

Plot the result:

par(mar=c(5.1,5.1,4.1,2.1))
plot(0, 0, xlim=c(1,10), ylim=c(min(sensitivity),1), type="n", 
     xlab='Minimum no. of matched words', ylab='Probability', cex.lab=2, cex.axis=1.5)
points(accuracy, type='o', pch=16, col='red', cex=2)
points(precision, type='o', pch=1, col='blue', cex=2)
points(sensitivity, type='o', pch=2, col='green4', cex=2)
points(specificity, type='o', pch=6, col='coral', cex=2)
points(Fscore, type='o', pch=15, col='black', cex=2)
text(9,accuracy[10],"accuracy", col='red', cex=1.5, adj=c(0.5,1))
text(9,precision[10]-0.01,"precision", col='blue', cex=1.5, adj=c(0.5,1))
text(9,sensitivity[10],"sensitivity", col='green4', cex=1.5, adj=0.5)
text(9,specificity[10]-0.01,"specificity", col='coral', cex=1.5, adj=c(0.5,1))
text(9,Fscore[10],"Fscore", col='black', cex=1.5, adj=0.5)
abline(v=which.max(Fscore), lty=2, lwd=2)

Conclusions

Increasing the number of matched words increases \( precision \) and \( specificity \) due to decline in \( FP \), but decreases sensitivity.
Overall, maximum \( accuracy \) and \( F_{score} \) are obtained when matching 3 spam words.
Other hyperparameters that can be optimized include probability threshold (currently set to 0.9) and Laplace smoothing factors as Naive Bayesian Classifiers.