Text Tokenization, Frequency Counting, and Correlation

Author: 王吉林 | Published 2018-11-13 22:50

    When we read a book, we generally start from the table of contents or the preface; when we read a paper, we start from the abstract; when we read a leader's speech, we start from the outline. Why do we do this? Many things we have never thought through carefully have nonetheless become habits.
    Books on the market these days are mostly hefty volumes, easily running to around 500 pages. Whether one is worth buying or worth reading is something we can form a rough judgment on with statistical methods. In most people's minds statistics deals with numbers; more precisely, it deals with data.
    Below we build a character vector whose content comes from the familiar novel 倚天屠龙记 (The Heaven Sword and Dragon Saber), and then tokenize it, splitting the text into individual word tokens.

    text <- c("张无忌到底爱谁","小昭去了波斯","周芷若嫁给了宋青书","珠儿死了","只剩下赵敏不离不弃")
    text 
    > text
    [1] "张无忌到底爱谁"     "小昭去了波斯"       "周芷若嫁给了宋青书"
    [4] "珠儿死了"           "只剩下赵敏不离不弃"
    
    library(dplyr)      # data_frame() and %>% come from dplyr
    text_df <- data_frame(line=1:5,text=text)
    library(tidytext)
    # tokenize: one row per word, keyed to the original line number
    text_df %>%
      unnest_tokens(word,text,token = "words")
    # A tibble: 28 x 2
        line word 
       <int> <chr>
     1     1 张   
     2     1 无忌 
     3     1 到底 
     4     1 爱   
     5     1 谁   
     6     2 小   
     7     2 昭   
     8     2 去了 
     9     2 波斯 
    10     3 周   
    # ... with 18 more rows
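
    How did unnest_tokens() segment the Chinese text without us supplying a dictionary? The default word tokenizer (tokenizers::tokenize_words()) delegates to stringi's ICU word-boundary analysis, which has built-in support for Chinese. A minimal sketch of the underlying call; skip_word_none drops punctuation-only segments:

    library(stringi)
    # ICU boundary analysis splits the sentence into word-level pieces
    stri_split_boundaries("张无忌到底爱谁", type = "word", skip_word_none = TRUE)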
    

    Jane Austen is a writer most of us know well; we have read her best-known novels, Pride and Prejudice, Sense and Sensibility, and others. Below we analyze Jane Austen's works to see which words she favored in her writing.

    The works analyzed here are six novels: Sense & Sensibility, Pride & Prejudice, Mansfield Park, Emma, Northanger Abbey, and Persuasion.

    Load the janeaustenr package, read in the corresponding text data, and create two new variables: linenum, the row number within each book, and chapter, obtained with a regular expression (str_detect() tests whether a line begins with a chapter heading, giving TRUE if so and FALSE otherwise, and cumsum() accumulates those values into a running chapter number).
    
    library(janeaustenr)
    library(dplyr)
    library(stringr)
    orig_books <- austen_books() %>%
      group_by(book) %>%
      mutate(linenum=row_number(),
             chapter=cumsum(
               str_detect(text,regex("^chapter [\\divxlc]",
                                     ignore_case = TRUE)))) %>%
      ungroup()
    orig_books
    > orig_books
    # A tibble: 73,422 x 4
       text                  book                linenum chapter
       <chr>                 <fct>                 <int>   <int>
     1 SENSE AND SENSIBILITY Sense & Sensibility       1       0
     2 ""                    Sense & Sensibility       2       0
     3 by Jane Austen        Sense & Sensibility       3       0
     4 ""                    Sense & Sensibility       4       0
     5 (1811)                Sense & Sensibility       5       0
     6 ""                    Sense & Sensibility       6       0
     7 ""                    Sense & Sensibility       7       0
     8 ""                    Sense & Sensibility       8       0
     9 ""                    Sense & Sensibility       9       0
    10 CHAPTER 1             Sense & Sensibility      10       1
    # ... with 73,412 more rows
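
    The chapter counter deserves a second look: str_detect() returns TRUE on every line that opens a chapter, and cumsum() accumulates that logical vector into a running chapter number. A tiny illustration:

    # FALSE counts as 0 and TRUE as 1, so each heading bumps the counter
    cumsum(c(FALSE, TRUE, FALSE, FALSE, TRUE))
    # [1] 0 1 1 1 2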
    

    Load the tidytext package and tidy the data into the one-token-per-row form.

    library(tidytext)
    tidy_books <- orig_books  %>%
      unnest_tokens(word,text,token = "words")
    tidy_books
    > tidy_books
    # A tibble: 725,055 x 4
       book                linenum chapter word       
       <fct>                 <int>   <int> <chr>      
     1 Sense & Sensibility       1       0 sense      
     2 Sense & Sensibility       1       0 and        
     3 Sense & Sensibility       1       0 sensibility
     4 Sense & Sensibility       3       0 by         
     5 Sense & Sensibility       3       0 jane       
     6 Sense & Sensibility       3       0 austen     
     7 Sense & Sensibility       5       0 1811       
     8 Sense & Sensibility      10       1 chapter    
     9 Sense & Sensibility      10       1 1          
    10 Sense & Sensibility      13       1 the        
    # ... with 725,045 more rows
    

    tidy_books contains many words such as to, the, and of. Such words usually contribute little to text analysis, so we need to remove them. Doing so involves two ideas: joins and stop words. Let's first illustrate the joins with a small example.

    ### the various joins
    
    a=data_frame(x=1:4,y=c(1,2,3,4))
    a
    b=data_frame(x=1:3,z=c(1,5,6))
    b
    anti_join(a,b)
    left_join(a,b)
    right_join(a,b)
    semi_join(a,b)
    full_join(a,b)
    > a=data_frame(x=1:4,y=c(1,2,3,4))
    > a
    # A tibble: 4 x 2
          x     y
      <int> <dbl>
    1     1     1
    2     2     2
    3     3     3
    4     4     4
    > b=data_frame(x=1:3,z=c(1,5,6))
    > b
    # A tibble: 3 x 2
          x     z
      <int> <dbl>
    1     1     1
    2     2     5
    3     3     6
    > anti_join(a,b)
    Joining, by = "x"
    # A tibble: 1 x 2
          x     y
      <int> <dbl>
    1     4     4
    > left_join(a,b)
    Joining, by = "x"
    # A tibble: 4 x 3
          x     y     z
      <int> <dbl> <dbl>
    1     1     1     1
    2     2     2     5
    3     3     3     6
    4     4     4    NA
    > right_join(a,b)
    Joining, by = "x"
    # A tibble: 3 x 3
          x     y     z
      <int> <dbl> <dbl>
    1     1     1     1
    2     2     2     5
    3     3     3     6
    > semi_join(a,b)
    Joining, by = "x"
    # A tibble: 3 x 2
          x     y
      <int> <dbl>
    1     1     1
    2     2     2
    3     3     3
    > full_join(a,b)
    Joining, by = "x"
    # A tibble: 4 x 3
          x     y     z
      <int> <dbl> <dbl>
    1     1     1     1
    2     2     2     5
    3     3     3     6
    4     4     4    NA
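
    Each call above prints Joining, by = "x" because the join key was inferred from the shared column name. Spelling the key out silences the message and makes the intent explicit:

    # equivalent to anti_join(a, b), with the key stated explicitly
    anti_join(a, b, by = "x")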
    

    Now use anti_join() to remove the stop words.

    tidy_books <- tidy_books %>%
      anti_join(stop_words)
    > tidy_books
    # A tibble: 217,609 x 4
       book                linenum chapter word       
       <fct>                 <int>   <int> <chr>      
     1 Sense & Sensibility       1       0 sense      
     2 Sense & Sensibility       1       0 sensibility
     3 Sense & Sensibility       3       0 jane       
     4 Sense & Sensibility       3       0 austen     
     5 Sense & Sensibility       5       0 1811       
     6 Sense & Sensibility      10       1 chapter    
     7 Sense & Sensibility      10       1 1          
     8 Sense & Sensibility      13       1 family     
     9 Sense & Sensibility      13       1 dashwood   
    10 Sense & Sensibility      13       1 settled    
    # ... with 217,599 more rows
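
    The stop_words object used above is a data frame shipped with tidytext: one word per row, plus a lexicon column recording which stop-word list each entry came from. It can be inspected directly:

    # peek at the bundled stop-word lists and their sizes
    stop_words
    count(stop_words, lexicon)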
    

    Next, count the words in tidy_books and visualize the result to see which words Jane Austen liked best.

    library(ggplot2)
    tidy_books %>%
      count(word,sort=TRUE)%>%
      filter(n>600)%>%
      arrange(n)%>%
      # fixing the factor levels in ascending order of n makes the bars plot sorted;
      # mutate(word=reorder(word,n)) is the equivalent one-step idiom
      mutate(word=factor(word,levels = word))%>%
      ggplot(aes(word,n))+
      geom_col()+
      xlab(NULL)+
      coord_flip()
    
    The most common words in Austen's novels

    Now let's compare Jane Austen's vocabulary with that of other authors and see how similar they are.

    library(gutenbergr)
    # look up Charlotte Brontë's Project Gutenberg IDs (used further below)
    gutenberg_works()%>%
      filter(author=="Brontë, Charlotte")
    
    # H.G. Wells: The Time Machine, The War of the Worlds,
    # The Invisible Man, The Island of Doctor Moreau
    hgwells <- gutenberg_download(c(35,36,5230,159))
    
    tidy_hgwells <- hgwells %>%
      unnest_tokens(word,text,token = "words") %>%
      anti_join(stop_words)
    
    tidy_hgwells %>% count(word,sort = TRUE)%>%
      filter(n>200)%>%
      arrange(n)%>%
      #mutate(word=reorder(word,n))%>%
      mutate(word=factor(word,levels = word))%>%
      ggplot(aes(word,n))+
      geom_col()+
      xlab(NULL)+
      coord_flip()
    
    The words H.G. Wells used most

    # Brontë sisters: Jane Eyre, Wuthering Heights, The Tenant of Wildfell Hall,
    # Villette, Agnes Grey
    bronte <- gutenberg_download(c(1260,768,969,9182,767))
    tidy_bronte <- bronte%>%
      unnest_tokens(word,text)%>%
      anti_join(stop_words)
     
    tidy_bronte%>%
     count(word,sort = TRUE)
    
    The words the Brontë sisters used most

    Next we combine the works of the three authors and compute each word's relative frequency.
    frequency <- bind_rows(mutate(tidy_bronte,author="Brontë Sisters"),
                           mutate(tidy_hgwells,author="H.G.Wells"),
                           mutate(tidy_books,author="Jane Austen"))%>%
      # keep only the alphabetic part of each token (drops stray markup
      # such as the underscores Project Gutenberg uses for emphasis)
      mutate(word=str_extract(word,"[a-z']+"))%>%
      count(author,word)%>%
      group_by(author)%>%
      mutate(prop=n/sum(n))%>%
      select(-n)%>%
      spread(author,prop)%>%
      gather(author,prop,2:3)
    > frequency
    # A tibble: 56,408 x 4
       word        `Jane Austen` author                prop
       <chr>               <dbl> <chr>                <dbl>
     1 a              0.00000919 Brontë Sisters  0.0000319 
     2 a'most        NA          Brontë Sisters  0.0000159 
     3 a'n't          0.00000460 Brontë Sisters NA         
     4 aback         NA          Brontë Sisters  0.00000398
     5 abaht         NA          Brontë Sisters  0.00000398
     6 abandon       NA          Brontë Sisters  0.0000319 
     7 abandoned      0.00000460 Brontë Sisters  0.0000916 
     8 abandoning    NA          Brontë Sisters  0.00000398
     9 abandonment   NA          Brontë Sisters  0.0000199 
    10 abart         NA          Brontë Sisters NA         
    # ... with 56,398 more rows
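
    Incidentally, spread() and gather() were superseded in tidyr 1.0 by pivot_wider() and pivot_longer(). A sketch of the same pipeline with the newer verbs, assuming tidyr >= 1.0 is installed:

    library(tidyr)
    frequency2 <- bind_rows(mutate(tidy_bronte,author="Brontë Sisters"),
                            mutate(tidy_hgwells,author="H.G.Wells"),
                            mutate(tidy_books,author="Jane Austen"))%>%
      mutate(word=str_extract(word,"[a-z']+"))%>%
      count(author,word)%>%
      group_by(author)%>%
      mutate(prop=n/sum(n))%>%
      select(-n)%>%
      # one column per author, then gather the two comparison authors back,
      # leaving `Jane Austen` as the reference column
      pivot_wider(names_from = author, values_from = prop)%>%
      pivot_longer(c(`Brontë Sisters`,`H.G.Wells`),
                   names_to = "author", values_to = "prop")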
    
    library(scales)   # for percent_format()
    ggplot(frequency,aes(x=prop,y=`Jane Austen`,
                         colour=abs(`Jane Austen`-prop)))+
      geom_abline(colour="gray40",lty=2)+
      geom_jitter(alpha=0.1,size=2.5,width = 0.3,height = 0.3)+
      geom_text(aes(label=word),check_overlap = TRUE,vjust=1.5)+
      scale_x_log10(labels=percent_format())+
      scale_y_log10(labels=percent_format())+
      scale_color_gradient(limit=c(0,0.001),
                           low = "darkslategray4",high = "gray75")+
      facet_wrap(~author,ncol = 2)+
      theme(legend.position = "none")+
      labs(y="Jane Austen",x=NULL)
    
    Word-use similarity between Austen and the other authors
    > cor.test(data=frequency[frequency$author=="Brontë Sisters",],
    +          ~prop+`Jane Austen`)
    
        Pearson's product-moment correlation
    
    data:  prop and Jane Austen
    t = 119.65, df = 10404, p-value < 2.2e-16
    alternative hypothesis: true correlation is not equal to 0
    95 percent confidence interval:
     0.7527869 0.7689641
    sample estimates:
          cor 
    0.7609938 
    
    > cor.test(data=frequency[frequency$author=="H.G.Wells",],
    +          ~prop+`Jane Austen`)
    
        Pearson's product-moment correlation
    
    data:  prop and Jane Austen
    t = 33.134, df = 5365, p-value < 2.2e-16
    alternative hypothesis: true correlation is not equal to 0
    95 percent confidence interval:
     0.3896964 0.4341215
    sample estimates:
          cor 
    0.4121538 
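
    The two tests can also be reproduced in a single dplyr pipe, a compact sketch:

    frequency %>%
      group_by(author) %>%
      summarise(correlation = cor(prop, `Jane Austen`, use = "complete.obs"))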
    

    Both the plot and the correlation tests indicate that Austen's word usage is closer to that of the Brontë sisters than to H.G. Wells's.
