1. Load the data
require(pacman)
# Note: plyr masks dplyr::count() when loaded after the tidyverse,
# which is why dplyr::count() is spelled out explicitly below
p_load(tidyverse, tidytext, text2vec, jiebaR, plyr)
df <- read.csv("./signature.csv", stringsAsFactors = F, header = T) %>%
  select(id, signature = "Signature")
# Convert Traditional Chinese characters to Simplified Chinese
p_load(ropencc)
df$signature <- run_convert(converter(TW2S), df$signature)
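As a quick check, the converter can be applied to a literal string; the phrase below is the example used in the OpenCC project, which TW2S should map from Taiwan-standard Traditional to Simplified Chinese:
run_convert(converter(TW2S), "開放中文轉換")  # expected: "开放中文转换"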
2. Chinese word segmentation
wk <- worker(stop_word = "./dict/characters-master/stop_words", lines = T)
text <- df %>%
  mutate(words = map(signature, segment, jiebar = wk)) %>%
  select(id, words) %>%
  as_tibble()
# Collapse each signature's tokens into one long space-separated string
# (map_chr() keeps the column character, as unnest_tokens() expects)
text$words <- map_chr(text$words, paste, collapse = " ")
3. Tidy the text into one-token-per-row form with unnest_tokens()
# Tokenise the words column of text; the output column is again named words
text.df <- text %>% unnest_tokens(words,words)
# (For book-length text, the tokens could instead be whole chapters, e.g.:)
# austen_chapters <- austen_books() %>% unnest_tokens(chapter, text, token = "regex", pattern = "Chapter|CHAPTER [\\dIVXLC]")
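As an aside, before the paste() step above text$words is a list-column, so the same one-token-per-row shape could also be reached directly with tidyr::unnest(), skipping the paste()/unnest_tokens() round trip; a minimal sketch, assuming tidyr >= 1.0 (unnest_tokens() additionally lowercases tokens, which unnest() does not):
# text.df <- text %>% unnest(cols = words)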
4. Remove stop words
# A second way to remove stop words (the jiebaR worker above already applied
# one list): anti_join() against a stop-word dictionary
stop.words <- read.table("./dict/停用词表.txt", header = F, sep = "\n", quote = "",
                         fileEncoding = "UTF-8", col.names = "words")
# Trim the whitespace around each stop word
stop.words$words <- str_trim(stop.words$words)
# To add new words to the stop-word dictionary:
# stop.words <- bind_rows(tibble(words = c("于", "有")), stop.words)
text.df <- text.df %>% anti_join(stop.words,by="words")
5. Find the most frequent words with dplyr::count()
text.df %>% dplyr::count(words,sort=TRUE)
## # A tibble: 1,267 x 2
## words n
## <chr> <int>
## 1 人生 22
## 2 心 22
## 3 不 17
## 4 爱 16
## 5 中 15
## 6 生活 14
## 7 努力 13
## 8 有 12
## 9 无 11
## 10 做 11
## # ... with 1,257 more rows
6. Bar chart of word frequencies
text.df %>% dplyr::count(words, sort = TRUE) %>%
  filter(n > 11) %>%
  ggplot(aes(reorder(words, n), n)) +
  geom_col() +
  coord_flip() +
  labs(x = "", y = "")
(Figure: bar chart of word frequencies)
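The cutoff n > 11 keeps only words that appear at least 12 times; a sketch that avoids the hard-coded threshold, assuming dplyr >= 1.0 for slice_max():
# text.df %>% dplyr::count(words, sort = TRUE) %>%
#   slice_max(n, n = 10, with_ties = FALSE) %>%
#   ggplot(aes(reorder(words, n), n)) +
#   geom_col() + coord_flip() + labs(x = "", y = "")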
7. Compute term frequency (TF)
tf <- text.df %>% dplyr::count(id,words)
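tidytext can turn these raw counts straight into TF-IDF weights; a minimal sketch, treating each signature (id) as one document:
tf.idf <- tf %>% bind_tf_idf(words, id, n) %>% arrange(desc(tf_idf))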
8. Load the HowNet (知网) sentiment dictionary
positive <- read.table("./dict/情感及修饰词/正面情感词语(中文).txt", header = F,
                       stringsAsFactors = F, strip.white = T, skip = 1, col.names = "words")
# Trim surrounding whitespace
positive$words <- str_trim(positive$words)
negative <- read.table("./dict/情感及修饰词/负面情感词语(中文).txt", header = F,
                       stringsAsFactors = F, strip.white = T, skip = 1, col.names = "words")
negative$words <- str_trim(negative$words)
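The positive and negative lists may share a few entries; a defensive sketch (an optional extra step, not part of the original workflow) that drops any word appearing on both sides so it cannot count as positive and negative at once:
# overlap <- intersect(positive$words, negative$words)
# positive <- filter(positive, !words %in% overlap)
# negative <- filter(negative, !words %in% overlap)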
9. Match against the sentiment dictionary and visualise
# Count how many positive words each signature contains
df.positive <- text.df %>% inner_join(positive, by = "words") %>% dplyr::count(words, id)
df.positive <- aggregate(n ~ id, df.positive, sum)
# Count how many negative words each signature contains
df.negative <- text.df %>% inner_join(negative, by = "words") %>% dplyr::count(words, id)
df.negative <- aggregate(n ~ id, df.negative, sum)
df.sentiment <- df.positive %>% full_join(df.negative, by = "id")
df.sentiment[is.na(df.sentiment)] <- 0
# A net-positive signature is "正面" (positive), net-negative is "负面"
# (negative), balanced is "中立" (neutral); after the join, n.x and n.y
# hold the positive and negative counts respectively
df.sentiment <- df.sentiment %>%
  mutate(sentiment = case_when(n.x - n.y > 0 ~ "正面",
                               n.x - n.y == 0 ~ "中立",
                               n.x - n.y < 0 ~ "负面")) %>%
  select(id, sentiment)
table(df.sentiment$sentiment) %>% as.data.frame() %>%
  ggplot(aes(as.factor(Var1), Freq)) +
  geom_col(show.legend = F) +
  labs(x = "", y = "")
(Figure: proportion of each sentiment)
10. Word cloud
p_load(wordcloud2)
temp <- text.df %>% dplyr::count(words) %>% filter(n >= 2)
wordcloud2(temp, size = 1, color = "random-dark", backgroundColor = "gray",
           minRotation = -pi/4, maxRotation = -pi/4, shape = "circle",
           fontFamily = "苹方")
(Figure: word cloud)
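Note that wordcloud2() returns an htmlwidget rather than a static image; a sketch for saving it to disk, assuming the htmlwidgets and webshot packages (webshot also needs PhantomJS, installable with webshot::install_phantomjs()):
# p_load(htmlwidgets, webshot)
# wc <- wordcloud2(temp, size = 1)
# saveWidget(wc, "wordcloud.html", selfcontained = FALSE)
# webshot("wordcloud.html", "wordcloud.png", delay = 5)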
11. Word clouds of the most frequent positive and negative words
p_load(reshape2)
# Tally the positive sentiment words
posi <- text.df %>% dplyr::count(words) %>%
  inner_join(positive, by = "words") %>%
  filter(n >= 2) %>% arrange(-n) %>%
  mutate(sentiment = "positive")
# Tally the negative sentiment words
nega <- text.df %>% dplyr::count(words) %>%
  inner_join(negative, by = "words") %>%
  arrange(-n) %>%
  mutate(sentiment = "negative")
posi %>% rbind(nega) %>%
  acast(words ~ sentiment, value.var = "n", fill = 0) %>%
  wordcloud::comparison.cloud(scale = c(3.3, .3), colors = c("gray80", "gray20"),
                              match.colors = T, rot.per = 0.1, title.size = 2.5,
                              title.bg.colors = c("green", "red"),
                              title.colors = "gray20")
(Figure: comparison cloud of positive and negative sentiment words)