> library(pacman)
> p_load(dplyr, caret, chinese.misc, stringr, tidytext)
The naive Bayes classifier is a simple directed graphical model.
Task: predict the sentiment of movie reviews.
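As background: naive Bayes assumes the words in a review are conditionally independent given the class, so the posterior factorizes as

P(sentiment | w1, ..., wn) ∝ P(sentiment) · P(w1 | sentiment) · ... · P(wn | sentiment)

and the predicted class is whichever sentiment yields the larger posterior.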
1. Data preparation
The training set of the Large Movie Review Dataset (aclImdb) contains 25,000 reviews.
> path_pos <- "data_set/aclImdb/train/pos/"
> path_neg <- "data_set/aclImdb/train/neg/"
>
> # list all file names under the folder
> pos_files <- dir_or_file(path_pos)
> head(pos_files)
## [1] "/home/ez/R/data_set/aclImdb/train/pos/0_9.txt" "/home/ez/R/data_set/aclImdb/train/pos/1_7.txt"
## [3] "/home/ez/R/data_set/aclImdb/train/pos/10_9.txt" "/home/ez/R/data_set/aclImdb/train/pos/100_7.txt"
## [5] "/home/ez/R/data_set/aclImdb/train/pos/1000_8.txt" "/home/ez/R/data_set/aclImdb/train/pos/10000_8.txt"
The number at the end of each file name (the digits after the underscore) is the review's rating; ratings fall in the ranges 1~4 (negative) or 7~10 (positive).
> # first extract the underscore and the digits that follow it,
> # then strip the underscore and convert to numeric
> get_score <- function(x) {
+ x <- stringr::str_extract(x, "_\\d+")
+ x <- stringr::str_remove(x, "_")
+ x <- as.numeric(x)
+ return(x)
+ }
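A quick sanity check on one of the file names above:

> get_score("data_set/aclImdb/train/pos/0_9.txt")
## [1] 9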
Next, build a function that reads a file's content and returns a data frame: first column the review text, second column the rating.
> # read one review file and pair its text with its rating
> read_fun <- function(x) {
+ txt <- readLines(x)
+ score <- get_score(x)
+ df <- tibble(content = txt,
+ score = score)
+ return(df)
+ }
Read every review under the pos folder together with its rating:
> nb_pos <- lapply(pos_files, read_fun) %>%
+ do.call(bind_rows, .)
>
> head(nb_pos)
## # A tibble: 6 x 2
## content score
## <chr> <dbl>
## 1 "Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, … 9
## 2 "If you like adult comedy cartoons, like South Park, then this is nearly a similar format about the sm… 7
## 3 "I'm a male, not given to women's movies, but this is really a well done special story. I have no pers… 9
## 4 "Scott Bartlett's 'OffOn' is nine minutes of pure craziness. It is a full-frontal assault of psychedel… 7
## 5 "I liked the film. Some of the action scenes were very interesting, tense and well done. I especially … 8
## 6 "Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan … 8
Read all files under the neg folder in the same way.
> neg_files <- dir_or_file(path_neg)
> head(neg_files)
## [1] "/home/ez/R/data_set/aclImdb/train/neg/0_3.txt" "/home/ez/R/data_set/aclImdb/train/neg/1_1.txt"
## [3] "/home/ez/R/data_set/aclImdb/train/neg/10_2.txt" "/home/ez/R/data_set/aclImdb/train/neg/100_3.txt"
## [5] "/home/ez/R/data_set/aclImdb/train/neg/1000_4.txt" "/home/ez/R/data_set/aclImdb/train/neg/10000_4.txt"
> nb_neg <- lapply(neg_files, read_fun) %>%
+ do.call(bind_rows, .)
>
> head(nb_neg)
## # A tibble: 6 x 2
## content score
## <chr> <dbl>
## 1 "Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrif… 3
## 2 "Robert DeNiro plays the most unbelievably intelligent illiterate of all time. This movie is so wastef… 1
## 3 "This film had a lot of promise, and the plot was relatively interesting, however the actors, director… 2
## 4 "OK its not the best film I've ever seen but at the same time I've been able to sit and watch it TWICE… 3
## 5 "The plot for Descent, if it actually can be called a plot, has two noteworthy events. One near the be… 4
## 6 "Airport '77 starts as a brand new luxury 747 plane is loaded up with valuable paintings & such belong… 4
Combine the two data frames into one and derive the class from the rating: reviews scoring >= 5 are labeled positive, those below 5 negative. (The label could equally be derived from the folder, since every review under pos is positive.)
> nb_all <- bind_rows(nb_pos, nb_neg) %>%
+ mutate(sentiment = ifelse(score < 5, "negative", "positive"))
> str(nb_all)
## tibble [25,000 × 3] (S3: tbl_df/tbl/data.frame)
## $ content : chr [1:25000] "Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as \""| __truncated__ "If you like adult comedy cartoons, like South Park, then this is nearly a similar format about the small advent"| __truncated__ "I'm a male, not given to women's movies, but this is really a well done special story. I have no personal love "| __truncated__ "Scott Bartlett's 'OffOn' is nine minutes of pure craziness. It is a full-frontal assault of psychedelic, pulsat"| __truncated__ ...
## $ score : num [1:25000] 9 7 9 7 8 8 10 7 8 8 ...
## $ sentiment: chr [1:25000] "positive" "positive" "positive" "positive" ...
2. Remove digits, punctuation, and extra whitespace
Remove digits and punctuation, convert everything to lower case, then collapse runs of whitespace into single spaces.
> pre_fun <- function(string) {
+ string <- str_remove_all(string, "[[:digit:]]")
+ string <- str_remove_all(string, "[[:punct:]]")
+ string <- str_to_lower(string)
+ string <- str_replace_all(string, "\\s+", " ")
+ return(string)
+ }
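The stringr functions are vectorized, so pre_fun can also be applied to the whole column directly (the lapply/unlist detour below works but is not required). A spot check on a made-up review:

> pre_fun("Great movie!!! 10/10, would watch again.")
## [1] "great movie would watch again"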
>
> nb_all$content <- nb_all$content %>%
+ lapply(pre_fun) %>%
+ unlist()
>
> # show one randomly chosen processed review
> sample_n(nb_all, 1)[, 1]
## # A tibble: 1 x 1
## content
## <chr>
## 1 this movie was excellent i was not expecting it to live up to all the hype but it did like all the bourne mo…
> # add an id column and move it to the front
> nb_all <- mutate(nb_all, id = 1:nrow(nb_all)) %>%
+ select(id, everything())
3. Create word counts
Count how many times each word occurs in each document (keyed by id).
> nb_words <- nb_all %>%
+ unnest_tokens(words, content) %>%
+ count(words, id, sort = T)
>
> str(nb_words)
## tibble [3,480,049 × 3] (S3: tbl_df/tbl/data.frame)
## $ words: chr [1:3480049] "the" "the" "the" "the" ...
## $ id : int [1:3480049] 54 1948 4350 5921 3083 16037 16035 17204 1948 1014 ...
## $ n : int [1:3480049] 198 198 138 128 108 103 99 99 98 97 ...
4. Remove English stop words
> stop_words <- readr::read_csv("dict/english_stopword.txt", col_names = F) %>%
+ setNames("words")
> str(stop_words)
## tibble [1,435 × 1] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ words: chr [1:1435] "à" "able" "about" "above" ...
## - attr(*, "spec")=
## .. cols(
## .. X1 = col_character()
## .. )
> nb_words <- nb_words %>%
+ anti_join(stop_words, by = "words")
> str(nb_words)
## tibble [1,818,205 × 3] (S3: tbl_df/tbl/data.frame)
## $ words: chr [1:1818205] "br" "br" "br" "br" ...
## $ id : int [1:1818205] 18564 21701 24451 18360 12526 1095 14875 18438 5025 20903 ...
## $ n : int [1:1818205] 92 86 84 72 70 68 60 60 58 54 ...
The word table shrinks from 3,480,049 rows to 1,818,205. "br" is left over from HTML line-break tags, so remove it manually.
> usr_words <- tibble(words = c("br", "aa"))
> nb_words <- nb_words %>%
+ anti_join(usr_words, by = "words")
> str(nb_words)
## tibble [1,803,533 × 3] (S3: tbl_df/tbl/data.frame)
## $ words: chr [1:1803533] "rob" "titanic" "victor" "film" ...
## $ id : int [1:1803533] 6262 54 13037 20332 22218 1948 23221 1948 1948 5323 ...
## $ n : int [1:1803533] 34 34 31 27 27 27 27 26 25 25 ...
5. Create the document-term matrix (DTM)
> nb_dtm <- cast_dtm(data = nb_words,
+ document = id,
+ term = words,
+ value = n)
> dim(nb_dtm)
## [1] 25000 107753
The document-term matrix has 25,000 rows (documents) and 107,753 columns (terms).
> nb_dtm
## <<DocumentTermMatrix (documents: 25000, terms: 107753)>>
## Non-/sparse entries: 1803533/2692021467
## Sparsity : 100%
## Maximal term length: 64
## Weighting : term frequency (tf)
Non-/sparse entries: 1803533/2692021467 is the ratio of non-zero to zero entries: of the 25,000 × 107,753 = 2,693,825,000 cells, only 1,803,533 are non-zero.
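A quick arithmetic check that the two counts account for the whole matrix:

> 25000 * 107753 # total cells
## [1] 2693825000
> 2693825000 - 1803533 # zero (sparse) entries
## [1] 2692021467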
To reduce the number of columns (features), we can drop sparse terms (many terms occur only once or twice in the corpus and carry little information about sentiment) and apply stemming (movie and movies express the same meaning); stemming is sketched below. PCA is yet another option.
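This walkthrough does not actually stem, but as a rough sketch, stemming the word column with the SnowballC package and re-aggregating would look something like this (re-counting is needed because distinct words can collapse to the same stem):

> # hypothetical stemming pass, not applied below
> nb_words_stemmed <- nb_words %>%
+ mutate(words = SnowballC::wordStem(words, language = "english")) %>%
+ group_by(id, words) %>%
+ summarise(n = sum(n), .groups = "drop")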
Use the tm package to drop the terms that appear in fewer than 1% of the documents:
> nb_dtm <- tm::removeSparseTerms(nb_dtm, sparse = 0.99)
> dim(nb_dtm)
## [1] 25000 1227
The number of columns drops sharply to 1,227, a much more reasonable feature count relative to the number of rows.
Inspect a few rows and columns:
> tm::inspect(nb_dtm[10:15, 20:28])
## <<DocumentTermMatrix (documents: 6, terms: 9)>>
## Non-/sparse entries: 13/41
## Sparsity : 76%
## Maximal term length: 9
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs ben book character dr flick guy james la story
## 17466 0 0 0 0 0 1 0 0 6
## 21540 0 1 0 0 0 0 0 0 2
## 22439 0 1 2 0 0 0 0 0 2
## 22476 0 0 2 0 0 3 0 0 0
## 23202 0 0 0 0 0 0 0 0 2
## 9686 0 2 1 0 0 0 0 0 1
6. Convert to a tibble
The training function expects a data frame as input, so convert the DTM into one.
> nb_data <- as_tibble(as.matrix(nb_dtm))
> nb_data[10:15, 20:28]
## # A tibble: 6 x 9
## guy story ben book character dr flick james la
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 2 0 1 2 0 0 0 0
## 2 3 0 0 0 2 0 0 0 0
## 3 0 2 0 0 0 0 0 0 0
## 4 0 2 0 1 0 0 0 0 0
## 5 0 1 0 2 1 0 0 0 0
## 6 1 6 0 0 0 0 0 0 0
Bind the sentiment column back on, converting it to a factor to serve as the response; this yields the data set we need.
> nb_data <- nb_data %>%
+ bind_cols(sentiment = as.factor(nb_all$sentiment))
>
> table(nb_data$sentiment)
##
## negative positive
## 12500 12500
7. Split into training and test sets
> set.seed(123)
> ind <- createDataPartition(nb_data$sentiment, p = 0.8, list = F)
> dtrain <- nb_data[ind, ]
> dtest <- nb_data[-ind, ]
8. Train the naive Bayes model
> fit.nb <- train(sentiment ~ ., data = dtrain, method = "naive_bayes")
> fit.nb
## Naive Bayes
##
## 20000 samples
## 1227 predictor
## 2 classes: 'negative', 'positive'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 20000, 20000, 20000, 20000, 20000, 20000, ...
## Resampling results across tuning parameters:
##
## usekernel Accuracy Kappa
## FALSE 0.7312269 0.4623403
## TRUE 0.5003438 0.0000000
##
## Tuning parameter 'laplace' was held constant at a value of 0
## Tuning parameter 'adjust' was held constant at
## a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were laplace = 0, usekernel = FALSE and adjust = 1.
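Note that laplace was held constant at 0 above. As a sketch (the grid values are illustrative, not from the original run), cross-validation with an explicit tuning grid could search over Laplace smoothing as well:

> grid <- expand.grid(laplace = c(0, 0.5, 1), usekernel = FALSE, adjust = 1)
> fit.nb2 <- train(sentiment ~ ., data = dtrain, method = "naive_bayes",
+ trControl = trainControl(method = "cv", number = 5),
+ tuneGrid = grid)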
Accuracy:
> # training-set accuracy
> mean(predict(fit.nb, newdata = dtrain) == dtrain$sentiment)
## [1] 0.73725
> # test-set accuracy
> mean(predict(fit.nb, newdata = dtest) == dtest$sentiment)
## [1] 0.735
Accuracy is roughly 73.5%.
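For a fuller picture than overall accuracy, caret's confusionMatrix() reports the per-class confusion counts plus sensitivity and specificity:

> confusionMatrix(predict(fit.nb, newdata = dtest), dtest$sentiment)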
Since each review also carries a numeric score, a regression model fitted on score instead of sentiment could predict the rating itself.
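A minimal sketch of that idea, reusing the split index ind from above (glmnet is an illustrative choice of method, not from the original text):

> reg_data <- nb_data %>%
+ select(-sentiment) %>%
+ bind_cols(score = nb_all$score)
> fit.reg <- train(score ~ ., data = reg_data[ind, ], method = "glmnet")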