rm(list=ls(all=TRUE)) ## Clear memory
library(tm) ## load tm package
This function creates a corpus containing the spam/ham emails using the tm package.
Punctuations, stopwords, and numbers are removed, and the result is saved in an .RData file:
read.files <- function(label) {
if (!label %in% c("spam","ham")) stop("label must be either 'spam' or 'ham'")
path <- paste0("./",label,"/")
## Create corpus
myCorpus <- Corpus(DirSource(path))
## Convert to lower case
myCorpus <- tm_map(myCorpus, tolower)
## Remove punctuation
myCorpus <- tm_map(myCorpus, removePunctuation)
## Remove stopwords
myCorpus2 <- tm_map(myCorpus, removeWords, stopwords('english'))
## Remove numbers
myCorpus <- tm_map(myCorpus, removeNumbers)
## Save the result
save(myCorpus, file=paste0(label,".corpus.RData"))
}
This function randomly splits the spam and ham corpora into training and test sets.
The words in each corpus are stored in a TextDocumentMatrix (TDM) object.
Result is saved as train.test.RData:
train.test <- function(training=0.9){
if (training>=1 | training<=0) stop("training must be between 0 and 1")
## Randomly split the spam emails
load(paste0("spam.corpus.RData"))
N <- length(myCorpus)
Ntest <- N * (1-training)
test.set <- sample(1:N, Ntest, replace=F)
training.set <- setdiff(1:N, test.set)
spam.test <- myCorpus[test.set]
spam.training <- myCorpus[training.set]
## Create TermDocumentMatrix for spam training
spam.TDM <- TermDocumentMatrix(spam.training, control=(list(wordLengths=c(1,Inf))))
## Randomly split the ham emails
load(paste0("ham.corpus.RData"))
N <- length(myCorpus)
Ntest <- N * (1-training)
test.set <- sample(1:N, Ntest, replace=F)
training.set <- setdiff(1:N, test.set)
ham.test <- myCorpus[test.set]
ham.training <- myCorpus[training.set]
## Create TermDocumentMatrix for ham training
ham.TDM <- TermDocumentMatrix(ham.training, control=(list(wordLengths=c(1,Inf))))
## Create TermDocumentMatrix for test, combining spam and ham
test.TDM <- TermDocumentMatrix(c(spam.test,ham.test), control=(list(wordLengths=c(1,Inf))))
## Store the true labels (1 = spam; 0 = ham)
test.true.labels <- c(rep(1,length(spam.test)), rep(0,length(ham.test)))
## Save the result
save(spam.training, spam.test, spam.TDM,
ham.training, ham.test, ham.TDM,
test.TDM, test.true.labels, file="train.test.RData")
}
This function calculates word frequencies in a TextDocumentMatrix.
The result is sorted either alphabetically or by the decreasing order of frequencies (default):
word.freq <- function(TDM, sortBy='freq'){
## TDM is a "simple_triplet_matrix" object from package "slam",
## So use "slam::row_sums" to sum frequencies across the rows
freq <- slam::row_sums(TDM)
if (sortBy=='freq') freq <- sort(freq,decreasing=T)
return(freq)
}
## Read the data
if (!file.exists("spam.corpus.RData")) read.files("spam")
if (!file.exists("ham.corpus.RData")) read.files("ham")
## Create training/test sets
if (!file.exists("train.test.RData")) train.test(0.9)
load("train.test.RData")
## Analyze spam words
spam.TDM <- weightBin(spam.TDM) ## binary weight; ignoring word repetition in single email
spam.N <- dim(spam.TDM)[2]
spam.word.freq <- word.freq(spam.TDM)
spam.words <- names(spam.word.freq)
## Analyze ham words
ham.TDM <- weightBin(ham.TDM)
ham.N <- dim(ham.TDM)[2]
ham.word.freq <- word.freq(ham.TDM)
ham.words <- names(ham.word.freq)
## Print the number of words in each set
cat(spam.N,"spam emails in the training set\n",
ham.N,"ham emails in the training set\n")
1351 spam emails in the training set
3305 ham emails in the training set
Find words that appear only in spam emails. Store these words as spam.filter1:
common <- spam.words %in% ham.words
spam.filter1 <- spam.words[!common]
cat(length(spam.filter1),"words appear only in spam -> added to spam filter\n",
sum(common),"words appear in both spam and ham\n")
27801 words appear only in spam -> added to spam filter
7083 words appear in both spam and ham
Calculate the spam probability for the common words using the Bayes rule:
\[ P(word) = P(word|spam) \times P(spam) + P(word|ham) \times P(ham) \]
## Calculate probabilities
p.spam <- spam.N/(spam.N+ham.N) ## P(spam)
p.ham <- 1 - p.spam ## P(ham)
common.words <- spam.words[common]
p.word.spam <- spam.word.freq[common]/spam.N ## P(word|spam)
p.word.ham <- ham.word.freq[common.words]/ham.N ## P(word|ham)
p.word <- p.word.spam*p.spam + p.word.ham*p.ham ## P(word)
p.spam.word <- p.word.spam*p.spam / p.word ## P(spam|word)
## Create a dataframe
spam.prob <- data.frame(p.word,p.spam.word)
colnames(spam.prob) <- c("p(word)","p(spam|word)")
Display the training result, sorted by \( P(word) \):
library(xtable)
sorter <- order(spam.prob[,1],decreasing=T)
spam.prob <- spam.prob[sorter,]
print(xtable(head(spam.prob), caption='Spam Probability<br>sorted by $P(word)$'), caption.placement = 'top', type="html", html.table.attributes='border=1 align="center" bgcolor="#FFCC00"')
p(word) | p(spam|word) | |
---|---|---|
subject | 1.00 | 0.29 |
please | 0.36 | 0.17 |
will | 0.32 | 0.20 |
s | 0.28 | 0.35 |
thanks | 0.28 | 0.06 |
cc | 0.25 | 0.01 |
Display the training result, sorted by \( P(spam|word) \):
sorter <- order(spam.prob[,2],decreasing=T)
spam.prob <- spam.prob[sorter,]
print(xtable(head(spam.prob), caption='Spam Probability<br>sorted by $P(spam|word)$'), caption.placement = 'top', type="html", html.table.attributes='border=1 align="center" bgcolor="#FFCC00"')
p(word) | p(spam|word) | |
---|---|---|
pain | 0.02 | 0.99 |
spam | 0.01 | 0.98 |
differ | 0.01 | 0.98 |
weight | 0.01 | 0.98 |
adobe | 0.01 | 0.98 |
creative | 0.01 | 0.98 |
Set a probability threshold for assigning spam words. Store these words as spam.filter2:
thld <- 0.90
spam.filter2 <- names(p.spam.word[p.spam.word>=thld])
spam.filter <- c(spam.filter1,spam.filter2)
cat("Spam probability threshold:",thld,"\n",
length(spam.filter2),"new words added to spam filter\n")
Spam probability threshold: 0.9
208 new words added to spam filter
Classify emails using the spam filter:
test.N <- test.TDM$ncol ## number of test emails
test.spam <- which(test.true.labels==1) ## spam emails in test
test.spam.N <- length(test.spam) ## number of spam emails in test
test.ham <- which(test.true.labels==0) ## ham emails in test
test.ham.N <- length(test.ham) ## number of ham emails in test
test.words <- test.TDM$dimnames$Terms ## number of words in test
prediction <- array()
for (j in 1:test.N){
label <- 0 ## default label (0 = ham)
word.list <- test.words[test.TDM$i[test.TDM$j==j]] ## list of words in current test email
if (any(word.list %in% spam.filter)) label <- 1 ## label = spam
prediction[j] <- label
}
Calculate true positive (\( TP \)), false positive (\( FP \)), false negative (\( FN \)), and true negative (\( TN \)), and evaluate performance metrics:
\[ precision = \frac{TP}{TP+FP} \]
\[ sensitivity = \frac{TP}{TP+FN} \]
\[ specificity = \frac{TN}{TN+FP} \]
\[ F_{score} = 2\frac{precision \times sensitivity}{precision + sensitivity} \]
TP <- sum(prediction[test.spam]==1)
FN <- sum(prediction[test.spam]==0)
FP <- sum(prediction[test.ham]==1)
TN <- sum(prediction[test.ham]==0)
confusion.table <- rbind(c(TP,FP),c(FN,TN))
colnames(confusion.table) <- c("Actual<br>spam","ham")
rownames(confusion.table) <- c("Predicted spam","ham")
accuracy <- (TP + TN)/(TP + FP + FN + TN)
precision <- TP/(TP + FP)
sensitivity <- TP/(TP + FN)
specificity <- TN/(TN + FP)
Fscore <- 2 * precision * sensitivity / (precision + sensitivity)
print(xtable(confusion.table, caption='Confusion Table'), caption.placement = 'top', type="html", html.table.attributes='border=1 align="center" bgcolor="#FFCC00"'); cat("accuracy:",accuracy,"\n"); cat("precision:",precision,"\n"); cat("sensitivity:",sensitivity,"\n"); cat("specificity:",specificity,"\n"); cat("Fscore:",Fscore,"\n");
Actual spam | ham | |
---|---|---|
Predicted spam | 140 | 78 |
ham | 9 | 289 |
accuracy: 0.8314 precision: 0.6422 sensitivity: 0.9396 specificity: 0.7875 Fscore: 0.7629
Try improving performance by increasing the number of matched words:
prediction <- array()
for (k in 2:10) {
for (j in 1:test.N){
label <- 0
word.list <- test.words[test.TDM$i[test.TDM$j==j]]
if (sum(word.list %in% spam.filter)>=k) label <- 1
prediction[j] <- label
}
TP[k] <- sum(prediction[test.spam]==1)
FN[k] <- sum(prediction[test.spam]==0)
FP[k] <- sum(prediction[test.ham]==1)
TN[k] <- sum(prediction[test.ham]==0)
}
accuracy <- (TP + TN)/(TP + FP + FN + TN)
precision <- TP/(TP + FP)
sensitivity <- TP/(TP + FN)
specificity <- TN/(TN + FP)
Fscore <- 2 * precision * sensitivity / (precision + sensitivity)
Plot the result:
par(mar=c(5.1,5.1,4.1,2.1))
plot(0, 0, xlim=c(1,10), ylim=c(min(sensitivity),1), type="n",
xlab='Minimum no. of matched words', ylab='Probability', cex.lab=2, cex.axis=1.5)
points(accuracy, type='o', pch=16, col='red', cex=2)
points(precision, type='o', pch=1, col='blue', cex=2)
points(sensitivity, type='o', pch=2, col='green4', cex=2)
points(specificity, type='o', pch=6, col='coral', cex=2)
points(Fscore, type='o', pch=15, col='black', cex=2)
text(9,accuracy[10],"accuracy", col='red', cex=1.5, adj=c(0.5,1))
text(9,precision[10]-0.01,"precision", col='blue', cex=1.5, adj=c(0.5,1))
text(9,sensitivity[10],"sensitivity", col='green4', cex=1.5, adj=0.5)
text(9,specificity[10]-0.01,"specificity", col='coral', cex=1.5, adj=c(0.5,1))
text(9,Fscore[10],"Fscore", col='black', cex=1.5, adj=0.5)
abline(v=which.max(Fscore), lty=2, lwd=2)