学习内容来自《机器学习:实用案例解析》(Machine Learning for Hackers)第 3 章。
数据来源
邮件数据来自 SpamAssassin 的公开语料库,可以在 http://spamassassin.apache.org/publiccorpus/ 免费下载。
数据组成:
垃圾邮件 spam
易识别的正常邮件 easy
不易识别的正常邮件 hard
每类各两份
环境以及包要求
Windows 10 + RStudio(R 3.4)
ggplot2 用于可视化
tm 用于文本处理(分割)
数据处理
邮件分类是一个二分类问题,在这里使用简单的朴素贝叶斯分类方法。
- 首先,将文件夹的文本导入到文本语料库中,形成文本矩阵,有多少封邮件就有多少个字符串。
- 对文本语料库进行分词,使用 tm 包处理,变成词项-文档矩阵(TDM)。假设邮件有 N 封,特征词有 M 个,就形成 M×N 的特征词矩阵。
构建模型
- 利用处理的数据构建贝叶斯分类模型
- 考虑先验概率和拉普拉斯平滑
- 检查模型准确度、错判率
- 推广到其他数据,是否有适用性
- 可视化
library(tm)
library(ggplot2)
# Corpus folders. The two spam paths and the easy-ham/hard-ham paths each
# come in a first and a second ("_2") set.
spam.path     <- "ML_for_Hackers/03-Classification/data/spam/"
spam2.path    <- "ML_for_Hackers/03-Classification/data/spam_2/"
easyham.path  <- "ML_for_Hackers/03-Classification/data/easy_ham/"
easyham2.path <- "ML_for_Hackers/03-Classification/data/easy_ham_2/"
hardham.path  <- "ML_for_Hackers/03-Classification/data/hard_ham/"
hardham2.path <- "ML_for_Hackers/03-Classification/data/hard_ham_2/"
# Read one e-mail file and return its body as a single string.
#
# The body is every line after the first empty line in the file, joined
# with "\n".
#
# Args:
#   path: path of the message file; it is read as latin1 text.
#
# Returns:
#   A length-one character vector. It is "" when the file has no empty
#   line, or when the first empty line is the last line of the file, so
#   callers always receive a string.
get.msg <- function(path) {
  con <- file(path, open = "rt", encoding = "latin1")
  # Release the connection on every exit path, including a readLines() error.
  on.exit(close(con), add = TRUE)
  text <- readLines(con)

  first.blank <- which(text == "")[1]
  # Guard both "no separator" (NA) and "separator is the last line"; in the
  # latter case (first.blank + 1):length(text) would count downwards and
  # index past the end of the vector.
  if (is.na(first.blank) || first.blank >= length(text)) {
    return("")
  }

  paste(text[(first.blank + 1):length(text)], collapse = "\n")
}
# List the files in the spam folder, leave out the entry named "cmds",
# and read the body of every remaining message with get.msg().
spam.docs <- dir(spam.path)
spam.docs <- spam.docs[spam.docs != "cmds"]
all.spam <- sapply(spam.docs, function(doc) get.msg(paste0(spam.path, doc)))
# Build a term-document matrix from a vector of document strings.
#
# Args:
#   doc.vec: documents to index, one element per document.
#
# Returns:
#   The object produced by tm's TermDocumentMatrix() for a corpus built
#   from doc.vec.
get.tdm <- function(doc.vec) {
  # Options handed to TermDocumentMatrix(): stop-word, punctuation and
  # number removal are switched on.
  # NOTE(review): minDocFreq does not appear among the control options
  # documented for recent tm releases - confirm it has an effect with the
  # installed version.
  tdm.options <- list(
    stopwords = TRUE,
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    minDocFreq = 2
  )
  TermDocumentMatrix(Corpus(VectorSource(doc.vec)), tdm.options)
}
# Per-term statistics for the spam training messages.
spam.tdm <- get.tdm(all.spam)

# Data frame with one row per term:
#   frequency  - total count of the term over all spam messages
#   density    - frequency divided by the sum of all frequencies
#   occurrence - share of spam messages that contain the term
spam.matrix <- as.matrix(spam.tdm)
spam.counts <- rowSums(spam.matrix)
# Build the columns directly so the counts stay numeric instead of passing
# through a character matrix.
spam.df <- data.frame(term = as.character(names(spam.counts)),
                      frequency = as.numeric(spam.counts),
                      stringsAsFactors = FALSE)
# Vectorised document count per term; unlike looping over 1:nrow(), this is
# also well defined for a matrix with zero rows.
spam.occurrence <- unname(rowSums(spam.matrix > 0)) / ncol(spam.matrix)
spam.density <- spam.df$frequency / sum(spam.df$frequency)
spam.df <- transform(spam.df,
                     density = spam.density,
                     occurrence = spam.occurrence)
# Show the terms that occur in the largest share of messages.
head(spam.df[order(-spam.df$occurrence), ])
# Per-term statistics for the easy-ham training messages, built the same
# way as for spam.
easyham.docs <- dir(easyham.path)
easyham.docs <- easyham.docs[easyham.docs != "cmds"]
# Use at most as many ham messages as there are spam messages. head() never
# produces NA entries when fewer ham files than spam files are available,
# which indexing with 1:length(spam.docs) would.
all.easyham <- sapply(head(easyham.docs, length(spam.docs)),
                      function(p) get.msg(file.path(easyham.path, p)))

easyham.tdm <- get.tdm(all.easyham)

# Data frame with one row per term:
#   frequency  - total count of the term over the ham messages read above
#   density    - frequency divided by the sum of all frequencies
#   occurrence - share of those messages that contain the term
easyham.matrix <- as.matrix(easyham.tdm)
easyham.counts <- rowSums(easyham.matrix)
# Build the columns directly so the counts stay numeric instead of passing
# through a character matrix.
easyham.df <- data.frame(term = as.character(names(easyham.counts)),
                         frequency = as.numeric(easyham.counts),
                         stringsAsFactors = FALSE)
# Vectorised document count per term; also well defined for zero rows.
easyham.occurrence <- unname(rowSums(easyham.matrix > 0)) /
  ncol(easyham.matrix)
easyham.density <- easyham.df$frequency / sum(easyham.df$frequency)
easyham.df <- transform(easyham.df,
                        density = easyham.density,
                        occurrence = easyham.occurrence)
head(easyham.df)
# Score one message file against a training data frame.
#
# Args:
#   path:        path of the message file, read with get.msg().
#   training.df: data frame with columns `term` and `occurrence`.
#   prior:       factor multiplied into the score (default 0.2).
#   c:           value used for every message term that is not found in
#                training.df$term (default 1e-6).
#
# Returns:
#   prior * product of the occurrence values of the terms shared with
#   training.df * c ^ (number of message terms not shared).
#   NOTE(review): this is a plain product of values that are at most 1, so
#   long messages may underflow towards 0 - confirm that is acceptable.
classify.email <- function(path, training.df, prior = 0.2, c = 1e-6) {
  term.counts <- rowSums(as.matrix(get.tdm(get.msg(path))))
  n.terms <- length(term.counts)

  # Terms of the message that are also present in the training data.
  shared.terms <- intersect(names(term.counts), training.df$term)

  # Nothing in common: every term of the message contributes the factor c.
  if (length(shared.terms) < 1) {
    return(prior * c^n.terms)
  }

  row.idx <- match(shared.terms, training.df$term)
  shared.probs <- training.df$occurrence[row.idx]
  prior * prod(shared.probs) * c^(n.terms - length(shared.terms))
}
# Score every hard-ham message against both training sets.
hardham.docs <- dir(hardham.path)
# Drop the entry named "cmds" from the list that is actually classified
# below (the filtered list used to be stored under a misspelled name, so
# the filter had no effect).
hardham.docs <- hardham.docs[hardham.docs != "cmds"]

# vapply() guarantees a numeric vector even when the folder is empty.
hardham.spamtest <- vapply(hardham.docs,
                           function(p) classify.email(file.path(hardham.path, p),
                                                      training.df = spam.df),
                           numeric(1))
hardham.hamtest <- vapply(hardham.docs,
                          function(p) classify.email(file.path(hardham.path, p),
                                                     training.df = easyham.df),
                          numeric(1))

# TRUE where the spam score is higher than the ham score.
hardham.res <- hardham.spamtest > hardham.hamtest
summary(hardham.res)
# Classifier used to test all e-mail types.
#
# Args:
#   path: path of the message file to score.
#
# Returns:
#   Numeric vector of length three: the score against spam.df, the score
#   against easyham.df, and 1 if the spam score is larger, otherwise 0.
spam.classifier <- function(path) {
  scores <- c(classify.email(path, spam.df),
              classify.email(path, easyham.df))
  c(scores, as.numeric(scores[1] > scores[2]))
}
# File names in each "_2" folder, without the entry named "cmds".
easyham2.docs <- dir(easyham2.path)
easyham2.docs <- easyham2.docs[easyham2.docs != "cmds"]

hardham2.docs <- dir(hardham2.path)
hardham2.docs <- hardham2.docs[hardham2.docs != "cmds"]

spam2.docs <- dir(spam2.path)
spam2.docs <- spam2.docs[spam2.docs != "cmds"]

# Run spam.classifier() on every file; each list element is that
# function's three-number result. Warnings raised while classifying are
# suppressed.
easyham2.class <- suppressWarnings(
  lapply(file.path(easyham2.path, easyham2.docs), spam.classifier)
)
hardham2.class <- suppressWarnings(
  lapply(file.path(hardham2.path, hardham2.docs), spam.classifier)
)
spam2.class <- suppressWarnings(
  lapply(file.path(spam2.path, spam2.docs), spam.classifier)
)
# Combine all classification results into one data frame.
# Each result list is stacked into a matrix (one row per message) and a
# type label is appended as an extra column.
easyham2.matrix <- do.call(rbind, easyham2.class)
hardham2.matrix <- do.call(rbind, hardham2.class)
spam2.matrix <- do.call(rbind, spam2.class)

easyham2.final <- cbind(easyham2.matrix, "EASYHAM")
hardham2.final <- cbind(hardham2.matrix, "HARDHAM")
spam2.final <- cbind(spam2.matrix, "SPAM")

class.matrix <- rbind(easyham2.final, hardham2.final, spam2.final)
class.df <- data.frame(class.matrix, stringsAsFactors = FALSE)
names(class.df) <- c("Pr.SPAM", "Pr.HAM", "Class", "Type")

# Appending the string label turned the whole matrix into character, so
# every column is converted back to its intended type here.
class.df[["Pr.SPAM"]] <- as.numeric(class.df[["Pr.SPAM"]])
class.df[["Pr.HAM"]] <- as.numeric(class.df[["Pr.HAM"]])
class.df[["Class"]] <- as.logical(as.numeric(class.df[["Class"]]))
class.df[["Type"]] <- as.factor(class.df[["Type"]])
# Scatter plot of the two log scores, one point per message, with the
# point shape showing the message type. The line drawn by geom_abline()
# has intercept 0 and slope 1, i.e. it marks where both scores are equal.
class.plot <- ggplot(class.df, aes(x = log(Pr.HAM), y = log(Pr.SPAM))) +
  geom_point(aes(shape = Type, alpha = 0.5)) +
  geom_abline(intercept = 0, slope = 1) +
  scale_shape_manual(name = "Email Type",
                     values = c("EASYHAM" = 1,
                                "HARDHAM" = 2,
                                "SPAM" = 3)) +
  # alpha is mapped inside aes(), so its legend is switched off here.
  scale_alpha(guide = "none") +
  xlab("log[Pr(HAM)]") +
  ylab("log[Pr(SPAM)]") +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        axis.text.y = element_blank())
class.plot
# Share of FALSE and of TRUE entries in a logical vector.
#
# Args:
#   bool.vector: vector of classification flags.
#
# Returns:
#   Numeric vector of length two: (count of FALSE, count of TRUE), each
#   divided by the full length of bool.vector. NA entries are counted in
#   the length but in neither count.
get.results <- function(bool.vector) {
  n.total <- length(bool.vector)
  n.false <- sum(bool.vector == FALSE, na.rm = TRUE)
  n.true <- sum(bool.vector == TRUE, na.rm = TRUE)
  c(n.false / n.total, n.true / n.total)
}
# Summarise the classification flags per message type: one row per type,
# with the share classified as not spam and the share classified as spam.
easyham2.col <- get.results(with(class.df, Class[which(Type == "EASYHAM")]))
hardham2.col <- get.results(with(class.df, Class[which(Type == "HARDHAM")]))
spam2.col <- get.results(with(class.df, Class[which(Type == "SPAM")]))

class.res <- rbind(easyham2.col, hardham2.col, spam2.col)
colnames(class.res) <- c("NOT SPAM", "SPAM")
print(class.res)
网友评论