美文网首页R语言与统计分析R语言作业数据科学与R语言
30-tidytext包学习:文本整理与情绪分析

30-tidytext包学习:文本整理与情绪分析

作者: wonphen | 来源:发表于2020-02-07 19:13 被阅读0次

    1、载入数据

    # Attach pacman so that p_load() can install/attach the remaining packages.
    # library() stops with an error if pacman is missing, whereas require() only
    # returns FALSE and lets the next line fail with a less helpful message.
    library(pacman)
    # plyr is attached first so that the dplyr verbs attached with tidyverse
    # (mutate, arrange, count, ...) are not masked by their plyr namesakes.
    p_load(plyr, tidyverse, tidytext, text2vec, jiebaR)

    # Read the signatures and keep only the id and the (renamed) signature column
    df <- read.csv("./signature.csv", stringsAsFactors = FALSE, header = TRUE) %>%
      select("id", signature = "Signature")

    # Convert Traditional Chinese characters to Simplified Chinese
    p_load(ropencc)
    df$signature <- run_convert(converter(TW2S), df$signature)
    

    2、中文分词

    # jiebaR segmentation engine configured with a custom stop-word file.
    # NOTE(review): `lines` looks like a numeric batch-size option in jiebaR; a
    # logical here may have been meant for `bylines` — confirm.
    wk <- worker(stop_word = "./dict/characters-master/stop_words", lines = TRUE)

    # Segment every signature and join its tokens into one space-separated string.
    # map_chr() keeps `words` a plain character column (map() would leave a list
    # column), which is the input type unnest_tokens() works on.
    text <- df %>%
      mutate(words = map_chr(signature, function(s) {
        paste(segment(s, jiebar = wk), collapse = " ")
      })) %>%
      select("id", "words") %>%
      as_tibble()
    

    3、使用unnest_tokens()函数整理为tidy结构

    # Tokenise the `words` column of `text` into one token per row; the output
    # column is again named `words`.
    text.df <- unnest_tokens(text, output = words, input = words)

    # Example of splitting a text by chapter instead:
    # austen_chapters <- austen_books() %>% unnest_tokens(chapter, text, token = "regex", pattern = "Chapter|CHAPTER [\\dIVXLC]")
    

    4、去除停用词

    # Second approach to stop-word removal: filter the tidy tokens against an
    # external stop-word list stored one entry per line.
    # comment.char = "" keeps entries such as "#" that read.table() would
    # otherwise treat as the start of a comment and drop.
    stop.words <- read.table("./dict/停用词表.txt", header = FALSE, sep = "\n",
                             quote = "", comment.char = "",
                             stringsAsFactors = FALSE,
                             fileEncoding = "UTF-8", col.names = "words")

    # Strip whitespace on both sides of every stop word
    stop.words$words <- str_trim(stop.words$words)

    # Add extra entries to the stop-word list
    # stop.words <- bind_rows(tibble(words=c("于","有")),stop.words)

    # Keep only the tokens that do not appear in the stop-word list
    text.df <- text.df %>% anti_join(stop.words, by = "words")
    

    5、使用dplyr::count()函数查找频次最高的词

    # Word frequencies over all signatures, most frequent first
    dplyr::count(text.df, words, sort = TRUE)
    
    ## # A tibble: 1,267 x 2
    ##    words     n
    ##    <chr> <int>
    ##  1 人生     22
    ##  2 心       22
    ##  3 不       17
    ##  4 爱       16
    ##  5 中       15
    ##  6 生活     14
    ##  7 努力     13
    ##  8 有       12
    ##  9 无       11
    ## 10 做       11
    ## # ... with 1,257 more rows
    

    6、根据词频画条形图

    # Horizontal bar chart of the words occurring more than 11 times,
    # with bars ordered by frequency
    ggplot(
      data = filter(dplyr::count(text.df, words, sort = TRUE), n > 11),
      mapping = aes(x = reorder(words, n), y = n)
    ) +
      geom_col() +
      coord_flip() +
      labs(x = "", y = "")
    
    词频条形图

    7、计算词频TF

    # Term frequency: number of occurrences of each word within each signature
    tf <- dplyr::count(text.df, id, words)
    

    8、加载知网(hownet)情感词典

    # Read one sentiment word list into a data frame with a single `words` column.
    # The first line of the file is skipped and every remaining line is taken as
    # one entry. sep = "\n" makes strip.white effective (it is ignored when no
    # separator is given) and quote = "" keeps quote characters literal, matching
    # how the stop-word list is read.
    read_sentiment_words <- function(path) {
      dict <- read.table(path, header = FALSE, sep = "\n", quote = "",
                         stringsAsFactors = FALSE, strip.white = TRUE,
                         skip = 1, col.names = "words")
      # Strip whitespace on both sides of every entry
      dict$words <- str_trim(dict$words)
      dict
    }

    positive <- read_sentiment_words("./dict/情感及修饰词/正面情感词语(中文).txt")

    negative <- read_sentiment_words("./dict/情感及修饰词/负面情感词语(中文).txt")
    

    9、匹配情感词典并可视化

    # Number of positive dictionary hits in each signature (columns: id, n)
    df.positive <- text.df %>%
      inner_join(positive, by = "words") %>%
      dplyr::count(id)

    # Number of negative dictionary hits in each signature (columns: id, n)
    df.negative <- text.df %>%
      inner_join(negative, by = "words") %>%
      dplyr::count(id)

    # Start from every signature that still has tokens, so that signatures with
    # no sentiment word at all are kept (and end up neutral) instead of being
    # dropped. Both joined tables carry a column `n`, so after the two joins
    # n.x is the positive count and n.y the negative count.
    df.sentiment <- text.df %>%
      distinct(id) %>%
      left_join(df.positive, by = "id") %>%
      left_join(df.negative, by = "id")

    # A missing count means "no hit in that dictionary"
    df.sentiment[is.na(df.sentiment)] <- 0

    # More positive than negative hits -> "正面" (positive), fewer -> "负面"
    # (negative), equal counts (including none at all) -> "中立" (neutral)
    df.sentiment <- df.sentiment %>%
      mutate(sentiment = case_when(n.x - n.y > 0 ~ "正面",
                                   n.x - n.y == 0 ~ "中立",
                                   n.x - n.y < 0 ~ "负面")) %>%
      select(id, sentiment)
    
    # Bar chart of the number of signatures in each sentiment class
    df.sentiment %>%
      dplyr::count(sentiment) %>%
      ggplot(aes(as.factor(sentiment), n)) +
      geom_col(show.legend = FALSE) +
      labs(x = "", y = "")
    
    情绪所占比例

    10、词云图

    p_load(wordcloud2)
    # Word cloud of every word that occurs at least twice
    temp <- filter(dplyr::count(text.df, words), n >= 2)
    # NOTE(review): minRotation and maxRotation are both -pi/4 — confirm that a
    # single rotation angle is intended.
    wordcloud2(
      temp,
      size = 1,
      color = "random-dark",
      backgroundColor = "gray",
      minRotation = -pi/4,
      maxRotation = -pi/4,
      shape = "circle",
      fontFamily = "苹方"
    )
    
    词云图

    11、出现频次最高的正、负面情感词词云

    p_load(reshape2)
    # Positive sentiment words that occur at least twice, most frequent first
    posi <- inner_join(dplyr::count(text.df, words), positive, by = "words") %>%
      filter(n >= 2) %>%
      arrange(-n) %>%
      mutate(sentiment = "positive")

    # Negative sentiment words (no minimum frequency), most frequent first
    nega <- inner_join(dplyr::count(text.df, words), negative, by = "words") %>%
      arrange(-n) %>%
      mutate(sentiment = "negative")
    
    # Stack the positive and negative word counts, reshape them into a
    # word x sentiment matrix (0 where a word is absent from a class) and draw a
    # comparison cloud from that matrix.
    bind_rows(posi, nega) %>%
      acast(words ~ sentiment, value.var = "n", fill = 0) %>%
      wordcloud::comparison.cloud(scale = c(3.3, .3),
                                  colors = c("gray80", "gray20"),
                                  match.colors = TRUE,
                                  rot.per = 0.1, title.size = 2.5,
                                  title.bg.colors = c("green", "red"),
                                  title.colors = "gray20")
    
    正负情感词云

    相关文章

      网友评论

        本文标题:30-tidytext包学习:文本整理与情绪分析

        本文链接:https://www.haomeiwen.com/subject/hlkvxhtx.html