31 - Learning the tidytext package: computing TF and TF-IDF

Author: wonphen | Published 2020-02-08 21:17

    1. Reading and tidying the data

    library(pacman)
    p_load(tidytext,tidyverse,stringr)
    
    # Read the raw text file
    txt <- readLines("./data_set/xslm/血色浪漫.txt",encoding = "unknown",n=-1L,ok=T,warn=T,skipNul=T) 
    
    # Remove repeated boilerplate such as the publisher/author line and the "[上一篇]" markers
    txt <- txt %>% gsub("出版社:长江文艺出版社作者:都梁|\\[上一篇\\]","",.) %>% str_trim() 
    
    # Split the text at the "[下一篇]" (next section) markers
    txt2 <- txt %>% paste(collapse = " ") %>% str_split("\\[下一篇\\]") 
    
    # Split each section into chapter title and content, and collect into a data frame
    chapter <- vector("character",244)
    content <- vector("character",244)
    for (line in txt2) {
      temp <- line %>% str_split("引子|\\([0-9]+\\)")
      for (i in seq_along(line)) {
        chapter[i] <- temp[[i]][1] %>% gsub("[《血色浪漫上》]","",.) %>% str_trim(side = "both")
        content[i] <- temp[[i]][2] %>% str_trim(side = "both")
      }
    }
    df <- tibble(chapter=chapter,content=content)
    
    # Give the opening chapter (chapter 0) the title "引子" (prologue)
    df$chapter[1] <- "引子"
    
    # Merge the sections (content) of each chapter into a single string
    txt3 <- aggregate(content~chapter,df,paste,collapse=" ")
    
    # Strip stray characters: percent signs, lowercase letters, backslashes, quotes and parentheses
    txt3$content <- txt3$content %>% gsub("[%a-z\\\\\\\"()]","",.) 
    
    # The list of valid chapter titles, in order
    chapter.levels <- c("引子","第一章","第二章","第三章","第四章","第五章","第六章",
                        "第七章","第八章","第九章","第十章","第十一章","第十二章",
                        "第十三章","第十四章","第十五章","第十六章","第十七章",
                        "第十八章","第十九章","第二十章","第二十一章","第二十二章",
                        "第二十三章","第二十四章","第二十五章","尾声")
    txt3$chapter <- factor(txt3$chapter,levels = chapter.levels)
    # Reorder the rows by chapter
    txt3 <- txt3 %>% arrange(chapter)
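    
    # (Sketch) Any NA left here would mean a chapter title failed to match the levels above
    sum(is.na(txt3$chapter))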
    
    # Write the tidied data to a file
    # write.csv(txt3,"./txt3.csv")
    
    (Figure: the tidied document, one row per chapter)

    2. Chinese word segmentation

    Computing word counts with dplyr::count() kept giving me trouble, so in the end I switched to the text2vec package (a dplyr-style sketch of that approach is shown below).
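
    For reference, here is a rough sketch of the dplyr::count() approach mentioned above; it assumes txt3 from section 1, and the object names (wk0, tokens, word.counts) are my own:

    # Segment each chapter with jiebaR, stack the tokens into a long tibble,
    # then count words per chapter with dplyr::count() (a sketch)
    library(jiebaR)
    library(dplyr)
    library(tidyr)
    wk0 <- worker()                                    # default jiebaR segmenter
    tokens <- tibble(chapter = txt3$chapter,
                     word = lapply(txt3$content, segment, wk0)) %>%
      tidyr::unnest(cols = word)                       # one row per token
    word.counts <- tokens %>% dplyr::count(chapter, word, sort = TRUE)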

    p_load(jiebaR,plyr,text2vec)
    # txt3 <- read.csv("./txt3.csv",header = T,stringsAsFactors = F)
    
    
    # The main characters' names are stored in the user dictionary xslm
    wk <- worker(user = "./dict/characters-master/xslm")
    
    tok.fun <- function(strings) {llply(strings,segment,wk)}
    # Set up the tokenization iterator
    it <- itoken(txt3$content,
                 preprocessor = identity,
                 tokenizer = tok.fun,
                 ids = txt3$chapter,
                 progressbar = interactive())
    
    # Build the vocabulary (per-term corpus counts and document counts)
    vocab <- create_vocabulary(it)
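
    create_vocabulary() returns a data-frame-like vocabulary with one row per term and the columns term, term_count (total occurrences in the corpus) and doc_count (the number of chapters containing the term). A quick peek at the most frequent terms, as a sketch:

    # Show the most frequent terms in the vocabulary (a sketch)
    head(vocab[order(-vocab$term_count), c("term", "term_count", "doc_count")])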
    

    3. Computing TF and TF-IDF

    3.1 Computing TF

    book.words <- tibble(chapter=vocab$doc_count,
                         word=as.character(vocab$term),
                         n=as.integer(vocab$term_count))
    
    # The highest word count within each chapter
    max.word <- aggregate(n~chapter,book.words,max)
    
    # Term frequency: tf = (number of times a word appears in a document) / (count of that document's most frequent word)
    book <- book.words %>% left_join(max.word,by="chapter",copy=T) %>% 
      dplyr::rename(n=n.x,max=n.y) %>% mutate(tf=n/max)
    print(book)
    
    # A tibble: 24,572 x 5
       chapter word         n   max     tf
         <int> <chr>    <int> <int>  <dbl>
     1       1 得太早       1    35 0.0286
     2       1 招待         1    35 0.0286
     3       1 全副武装     1    35 0.0286
     4       1 演义         1    35 0.0286
     5       1 低么         1    35 0.0286
     6       1 黄皮书       1    35 0.0286
     7       1 养人         1    35 0.0286
     8       1 付些         1    35 0.0286
     9       1 解           1    35 0.0286
    10       1 破裂         1    35 0.0286
    # ... with 24,562 more rows
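
    As a quick sanity check of the tf definition (a sketch, assuming the book tibble just built): every tf value should equal n divided by the chapter's maximum count, e.g. 1/35 ≈ 0.0286 in the rows shown above.

    # Verify that tf == n / max in every row (should print TRUE)
    book %>% dplyr::summarise(ok = all(dplyr::near(tf, n / max)))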
    

    3.2 TF distributions for the first six chapters

    book %>% filter(chapter<7) %>%
      ggplot(aes(tf, fill = chapter)) +
        geom_histogram(show.legend = FALSE,bins = 40,na.rm = T,col="white") +
      #  xlim(NA, 1) +
        labs(x=NULL,y=NULL) +
        facet_wrap(~chapter, ncol=2,scales = "free_y")
    
    (Figure: long-tailed distributions of term frequency, chapters 1-6)

    3.3 Zipf's law

    In a natural-language corpus, a word's frequency is inversely proportional to its rank when words are ranked by frequency. Zipf's law is one of the key laws of bibliometrics; together with Lotka's law and Bradford's law, it is known as one of the three fundamental laws of bibliometrics. A quick way to check the law on this corpus is a log-log regression of frequency on rank, sketched below.
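
    A minimal sketch of that check, assuming the book tibble from section 3.1: rank the words by descending frequency and regress log10(n) on log10(rank); under Zipf's law the slope should be close to -1.

    # Estimate the Zipf exponent (a sketch; the slope should be near -1 if the law holds)
    zipf <- book %>% arrange(desc(n)) %>% mutate(rank = seq_along(n))
    lm(log10(n) ~ log10(rank), data = zipf)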

    book %>% select(chapter,n) %>% arrange(desc(n)) %>% mutate(rank=seq_along(n)) %>%
      ggplot(aes(rank,n)) + geom_point(col="red") + scale_x_log10()
    
    (Figure: Zipf's law - term frequency vs. rank on a log scale)

    3.4 The TF-IDF matrix

    book.tfidf <- book %>% select(-max) %>% bind_tf_idf(word,chapter,n) %>%
      arrange(-tf_idf)
    
    # tf_idf measures how important a word is to a document; plot the 13 top-tf_idf words in each of the first two chapters
    book.tfidf %>% mutate(word=factor(word,levels = rev(unique(word)))) %>%
      filter(chapter<3) %>% group_by(chapter) %>% top_n(13,tf_idf) %>%
      ungroup() %>% ggplot(aes(word,tf_idf,fill=chapter)) +
      geom_col(show.legend = F) +
      labs(x=NULL,y="tf_idf") +
      facet_wrap(~chapter,ncol = 2,scales = "free") +
      coord_flip()
    
    (Figure: top TF-IDF words in chapters 1 and 2)
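
    Note that bind_tf_idf() in tidytext normalizes tf as a word's count divided by the total number of words in its document (not by the maximum count used in section 3.1) and takes idf as the natural log of the number of documents divided by the number of documents containing the word. A minimal sketch that redoes the calculation by hand, assuming the book tibble from section 3.1 (the names n.docs, tf2, idf2 and tf_idf2 are mine):

    # Reproduce the bind_tf_idf() definition by hand (a sketch; dplyr:: prefixes
    # avoid plyr's mutate masking the grouped dplyr verb)
    n.docs <- dplyr::n_distinct(book$chapter)                  # number of chapters
    manual.tfidf <- book %>% group_by(chapter) %>%
      dplyr::mutate(tf2 = n / sum(n)) %>%                      # tf: share of the chapter's words
      ungroup() %>% group_by(word) %>%
      dplyr::mutate(idf2 = log(n.docs / dplyr::n_distinct(chapter))) %>%  # idf: log(docs / docs containing the word)
      ungroup() %>% dplyr::mutate(tf_idf2 = tf2 * idf2)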
