1. Reading and tidying the data
library(pacman)
p_load(tidytext,tidyverse,stringr)
# Read the file
txt <- readLines("./data_set/xslm/血色浪漫.txt",encoding = "unknown",n=-1L,ok=T,warn=T,skipNul=T)
[Figure: the raw text of 血色浪漫]
# Remove repeated metadata such as the publisher/author line and [上一篇] ("previous") markers
txt <- txt %>% gsub("出版社:长江文艺出版社作者:都梁|\\[上一篇\\]","",.) %>% str_trim()
# Split into sections on the [下一篇] ("next") markers
txt2 <- txt %>% paste(collapse = " ") %>% str_split("\\[下一篇\\]")
# Organize into a data frame by chapter; txt2 is a one-element list of sections
sections <- txt2[[1]]
temp <- str_split(sections, "引子|\\([0-9]+\\)")
chapter <- vector("character", length(sections))
content <- vector("character", length(sections))
for (i in seq_along(sections)) {
  chapter[i] <- temp[[i]][1] %>% gsub("[《血色浪漫上》]", "", .) %>% str_trim(side = "both")
  content[i] <- temp[[i]][2] %>% str_trim(side = "both")
}
df <- tibble(chapter = chapter, content = content)
# Give the untitled opening chapter the title "引子" (Prologue)
df$chapter[1] <- "引子"
# Merge each chapter's sections (content) into a single string per chapter
txt3 <- aggregate(content ~ chapter, df, paste, collapse = " ")
# Strip stray Latin letters, quotes, and other leftover punctuation
txt3$content <- txt3$content %>% gsub("[%a-z\\\\\\\"()]","",.)
# Define the ordered list of valid chapter names
chapter.levels <- c("引子","第一章","第二章","第三章","第四章","第五章","第六章",
"第七章","第八章","第九章","第十章","第十一章","第十二章",
"第十三章","第十四章","第十五章","第十六章","第十七章",
"第十八章","第十九章","第二十章","第二十一章","第二十二章",
"第二十三章","第二十四章","第二十五章","尾声")
txt3$chapter <- factor(txt3$chapter,levels = chapter.levels)
# Reorder by chapter
txt3 <- txt3 %>% arrange(chapter)
# Write to file
# write.csv(txt3,"./txt3.csv")
[Figure: the tidied data frame]
2. Chinese word segmentation
Computing word frequencies with dplyr::count() kept giving me trouble, so in the end I used the text2vec package instead.
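For reference, here is a minimal sketch of the tidy route (it assumes the jiebaR worker wk created in the next block). A likely culprit for the count() trouble: p_load(jiebaR,plyr,text2vec) below attaches plyr after dplyr, so plyr's own count() masks dplyr::count().

# Sketch only: unnest_tokens() accepts a custom tokenizer that returns a
# list of token vectors, so it can delegate to jiebaR's segment().
tidy.words <- txt3 %>%
  unnest_tokens(word, content, token = function(x) lapply(x, segment, wk)) %>%
  dplyr::count(chapter, word, sort = TRUE)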
p_load(jiebaR,plyr,text2vec)
# txt3 <- read.csv("./txt3.csv",header = T,stringsAsFactors = F)
# The novel's main character names are stored as a user dictionary, xslm
wk <- worker(user = "./dict/characters-master/xslm")
# Tokenizer: segment each string with the jiebaR worker
tok.fun <- function(strings) {llply(strings, segment, wk)}
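A jiebaR user dictionary is simply a plain-text file with one term per line (an optional part-of-speech tag may follow each word). Purely as an illustration, the xslm file could begin with the leads' names:

钟跃民
周晓白
郑桐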
# Set up the tokenization iterator
it <- itoken(txt3$content,
preprocessor = identity,
tokenizer = tok.fun,
ids = txt3$chapter,
progressbar = interactive())
# Build the vocabulary (term counts and document frequencies)
vocab <- create_vocabulary(it)
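create_vocabulary() returns a data frame with one row per term: the term itself, its corpus-wide term_count, and doc_count, the number of documents (here chapters, via the ids above) containing it. These are the columns the next section builds on.

head(vocab)  # columns: term, term_count, doc_count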
3. Computing TF and TF-IDF
3.1 Computing TF
book.words <- tibble(chapter = vocab$doc_count,
                     word = as.character(vocab$term),
                     n = as.integer(vocab$term_count))
# The highest word frequency in each chapter
max.word <- aggregate(n ~ chapter, book.words, max)
# tf = (occurrences of a word in a document) / (occurrences of that document's most frequent word)
book <- book.words %>% left_join(max.word, by = "chapter", copy = T) %>%
  dplyr::rename(n = n.x, max = n.y) %>% mutate(tf = n / max)
print(book)
# A tibble: 24,572 x 5
chapter word n max tf
<int> <chr> <int> <int> <dbl>
1 1 得太早 1 35 0.0286
2 1 招待 1 35 0.0286
3 1 全副武装 1 35 0.0286
4 1 演义 1 35 0.0286
5 1 低么 1 35 0.0286
6 1 黄皮书 1 35 0.0286
7 1 养人 1 35 0.0286
8 1 付些 1 35 0.0286
9 1 解 1 35 0.0286
10 1 破裂 1 35 0.0286
# ... with 24,562 more rows
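To tie the printout to the formula: in this group the most frequent word occurs 35 times, so each word that occurs once gets tf = 1/35 ≈ 0.0286, exactly the value shown above.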
3.2 TF distributions across the first six chapters
book %>% filter(chapter<7) %>%
ggplot(aes(tf, fill = chapter)) +
geom_histogram(show.legend = FALSE,bins = 40,na.rm = T,col="white") +
# xlim(NA, 1) +
labs(x=NULL,y=NULL) +
facet_wrap(~chapter, ncol=2,scales = "free_y")
[Figure: long-tail distribution of word frequencies]
3.3 Zipf's law
In a natural-language corpus, a word's frequency is inversely proportional to its rank when words are ordered by frequency. Zipf's law is one of the fundamental laws of bibliometrics, ranked alongside Lotka's law and Bradford's law as one of the field's three classic laws.
book %>% select(chapter, n) %>% mutate(rank = 10 * row_number()) %>%
  ggplot(aes(rank, n)) + geom_point(col = "red") + scale_x_log10()
[Figure: Zipf's law]
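If Zipf's law holds, a word's frequency is roughly proportional to 1/rank^a with a close to 1, so log frequency against log rank should be close to a straight line. A minimal sketch of estimating the exponent, assuming ranks are assigned by descending frequency:

# Rank terms by descending count and fit a log-log regression;
# under Zipf's law the slope should be near -1.
zipf <- book %>% arrange(desc(n)) %>% mutate(rank = row_number())
lm(log10(n) ~ log10(rank), data = zipf)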
3.4 The TF-IDF matrix
book.tfidf <- book %>% select(-max) %>% bind_tf_idf(word,chapter,n) %>%
arrange(-tf_idf)
# tf_idf measures how important a word is to a document; plot the 13 highest-scoring words in each of the first two chapters
book.tfidf %>% mutate(word=factor(word,levels = rev(unique(word)))) %>%
filter(chapter<3) %>% group_by(chapter) %>% top_n(13,tf_idf) %>%
ungroup() %>% ggplot(aes(word,tf_idf,fill=chapter)) +
geom_col(show.legend = F) +
labs(x=NULL,y="tf_idf") +
facet_wrap(~chapter,ncol = 2,scales = "free") +
coord_flip()
[Figure: TF-IDF]
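As a cross-check, bind_tf_idf() uses the standard definitions: tf is a term's share of its document's tokens (note that this overwrites the n/max version of tf computed in 3.1), and idf = ln(N/n_d), where N is the number of documents and n_d the number of documents containing the term. A minimal sketch recomputing the score by hand (tf2, idf2, tf_idf2 are hypothetical names):

n.docs <- n_distinct(book$chapter)
check <- book %>%
  group_by(chapter) %>% mutate(tf2 = n / sum(n)) %>% ungroup() %>%
  group_by(word) %>% mutate(idf2 = log(n.docs / n_distinct(chapter))) %>%
  ungroup() %>% mutate(tf_idf2 = tf2 * idf2)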