
109 - Text Preprocessing for Text Analysis

Author: wonphen | Published 2022-09-30 09:32

    Reference: 《文本数据挖掘——基于R语言》 (Text Data Mining: Based on R)

    library(pacman)
    p_load(dplyr, stringr, purrr)
    

    1. Reading the Data

    Any text data can be substituted here. The working data frame needs two columns: one with a document name or ID, and one with the text content.

    # read the raw scrape, name the columns, and drop exact duplicates
    storagebottles <- read.csv("dataset/ali/storagebottles0905.csv", 
                               header = F) %>% 
      set_names(c("sku_name", "sku_price", "sku_sale_volume", "sku_score",
                  "sku_ship", "sku_isNewin", "sku_isPromotion", 
                  "sku_isTopselling", "shop_name", "sku_link", "category4")) %>%
      distinct(.keep_all = T)
    
    storagebottles <- storagebottles %>% 
      # keep only well-formed rows
      filter(!is.na(sku_name)) %>%
      filter(str_detect(sku_price, "^US")) %>% 
      filter(str_detect(sku_link, "aliexpress")) %>% 
      filter(str_detect(sku_sale_volume, "sold")) %>% 
      # add category labels
      mutate(category = "home",
             category2 = "Home Storage",
             category3 = "Storage Bottles & Jars")  %>% 
      # extract the 16-digit SKU id from the link
      mutate(sku_id = str_extract(sku_link, "\\d{16}"),
             sku_link = paste0("http:", sku_link)) %>% 
      mutate(sku_id = as.character(sku_id)) %>% 
      # keep one record per SKU: the last row after sorting by sales volume
      arrange(sku_sale_volume) %>% 
      group_by(sku_id, .drop = T) %>% 
      slice_tail(n=1) %>% 
      ungroup()
    
    df <- select(storagebottles, sku_id, sku_name)
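
    If the CSV is not at hand, any two-column stand-in will do. A hypothetical minimal df, only for following along:

    # hypothetical replacement data (not the original dataset)
    df <- tibble::tibble(
      sku_id = c("1", "2"),
      sku_name = c("1000pcs Plastic Centrifuge Tube Test Vial",
                   "Transparent Plastic Storage Bottles & Jars")
    )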
    

    2. Spelling Correction

    p_load(hunspell)
    
    # check whether the text is spelled correctly; hunspell_check()
    # treats each element as a single word, so a full title returns FALSE
    hunspell_check(df$sku_name[1])
    
    ## [1] FALSE
    
    # identify the misspelled words
    bad <- hunspell(df$sku_name[1])
    print(bad[[1]])
    
    ## [1] "pcs"
    
    # suggest corrections
    hunspell_suggest(bad[[1]])
    
    ## [[1]]
    ##  [1] "cps"  "cs"   "pecs" "pics" "pis"  "pas"  "pct"  "pus"  "p cs" "PCs"
    

    3. Tokenization

    p_load(tokenizers, tidytext)
    
    txt <- paste0(df$sku_name[1:2], collapse = "。")
    txt
    
    ## [1] "1000pcs 8*32mm 0.5ml Plastic Centrifuge Tube Test Tubing Vial Clear Plastic Container Home Garden Storage Bottles。1000pcs 6*22mm 0.2ml Plastic Bottles Gardening Storage Container Transparent Plastic Vials PCR Centrifuge Tube"
    
    # paragraph tokenization
    # set the paragraph break marker
    tokenize_paragraphs(txt, paragraph_break = "。")
    
    ## [[1]]
    ## [1] "1000pcs 8*32mm 0.5ml Plastic Centrifuge Tube Test Tubing Vial Clear Plastic Container Home Garden Storage Bottles"
    ## [2] "1000pcs 6*22mm 0.2ml Plastic Bottles Gardening Storage Container Transparent Plastic Vials PCR Centrifuge Tube"
    
    # sentence tokenization
    tokenize_sentences(txt)
    
    ## [[1]]
    ## [1] "1000pcs 8*32mm 0.5ml Plastic Centrifuge Tube Test Tubing Vial Clear Plastic Container Home Garden Storage Bottles。"
    ## [2] "1000pcs 6*22mm 0.2ml Plastic Bottles Gardening Storage Container Transparent Plastic Vials PCR Centrifuge Tube"
    
    # word tokenization: punctuation is stripped and text lowercased
    tokenize_words(txt)
    
    ## [[1]]
    ##  [1] "1000pcs"     "8"           "32mm"        "0.5ml"       "plastic"    
    ##  [6] "centrifuge"  "tube"        "test"        "tubing"      "vial"       
    ## [11] "clear"       "plastic"     "container"   "home"        "garden"     
    ## [16] "storage"     "bottles"     "1000pcs"     "6"           "22mm"       
    ## [21] "0.2ml"       "plastic"     "bottles"     "gardening"   "storage"    
    ## [26] "container"   "transparent" "plastic"     "vials"       "pcr"        
    ## [31] "centrifuge"  "tube"
    
    # alternatively, keep punctuation and strip numbers
    tokenize_words(txt, strip_punct = F, strip_numeric = T, simplify = T)
    
    ##  [1] "1000pcs"     "*"           "32mm"        "0.5ml"       "plastic"    
    ##  [6] "centrifuge"  "tube"        "test"        "tubing"      "vial"       
    ## [11] "clear"       "plastic"     "container"   "home"        "garden"     
    ## [16] "storage"     "bottles"     "。"          "1000pcs"     "*"          
    ## [21] "22mm"        "0.2ml"       "plastic"     "bottles"     "gardening"  
    ## [26] "storage"     "container"   "transparent" "plastic"     "vials"      
    ## [31] "pcr"         "centrifuge"  "tube"
    
    # n-gram tokenization; simplify = T returns a vector instead of a list
    tokenize_ngrams(txt, n = 2, simplify = T)
    
    ##  [1] "1000pcs 8"             "8 32mm"                "32mm 0.5ml"           
    ##  [4] "0.5ml plastic"         "plastic centrifuge"    "centrifuge tube"      
    ##  [7] "tube test"             "test tubing"           "tubing vial"          
    ## [10] "vial clear"            "clear plastic"         "plastic container"    
    ## [13] "container home"        "home garden"           "garden storage"       
    ## [16] "storage bottles"       "bottles 1000pcs"       "1000pcs 6"            
    ## [19] "6 22mm"                "22mm 0.2ml"            "0.2ml plastic"        
    ## [22] "plastic bottles"       "bottles gardening"     "gardening storage"    
    ## [25] "storage container"     "container transparent" "transparent plastic"  
    ## [28] "plastic vials"         "vials pcr"             "pcr centrifuge"       
    ## [31] "centrifuge tube"
    
    4. Character Tokenization

    tokenize_characters(txt, simplify = T)
    
    ##   [1] "1" "0" "0" "0" "p" "c" "s" "8" "3" "2" "m" "m" "0" "5" "m" "l" "p" "l"
    ##  [19] "a" "s" "t" "i" "c" "c" "e" "n" "t" "r" "i" "f" "u" "g" "e" "t" "u" "b"
    ##  [37] "e" "t" "e" "s" "t" "t" "u" "b" "i" "n" "g" "v" "i" "a" "l" "c" "l" "e"
    ##  [55] "a" "r" "p" "l" "a" "s" "t" "i" "c" "c" "o" "n" "t" "a" "i" "n" "e" "r"
    ##  [73] "h" "o" "m" "e" "g" "a" "r" "d" "e" "n" "s" "t" "o" "r" "a" "g" "e" "b"
    ##  [91] "o" "t" "t" "l" "e" "s" "1" "0" "0" "0" "p" "c" "s" "6" "2" "2" "m" "m"
    ## [109] "0" "2" "m" "l" "p" "l" "a" "s" "t" "i" "c" "b" "o" "t" "t" "l" "e" "s"
    ## [127] "g" "a" "r" "d" "e" "n" "i" "n" "g" "s" "t" "o" "r" "a" "g" "e" "c" "o"
    ## [145] "n" "t" "a" "i" "n" "e" "r" "t" "r" "a" "n" "s" "p" "a" "r" "e" "n" "t"
    ## [163] "p" "l" "a" "s" "t" "i" "c" "v" "i" "a" "l" "s" "p" "c" "r" "c" "e" "n"
    ## [181] "t" "r" "i" "f" "u" "g" "e" "t" "u" "b" "e"
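
    For character n-grams rather than single characters, tokenizers also provides tokenize_character_shingles() (a brief aside, not in the original post):

    # character 3-grams ("shingles") of the combined text
    tokenize_character_shingles(txt, n = 3, simplify = T)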
    

    5. Expanding Abbreviations

    p_load(qdap)
    
    # expand contractions
    replace_contraction(c("MR. Jones isn't going."))
    
    ## [1] "MR. Jones is not going."
    
    # expand abbreviations
    replace_abbreviation(c("MR. Jones isn't going."))
    
    ## [1] "Mister Jones isn't going."
    
    # replace numbers with words
    replace_number(c(1))
    
    ## [1] "one"
    
    # replace ordinals
    replace_ordinal(c("3rd"))
    
    ## [1] "third"
    
    # replace symbols
    replace_symbol(c("&"))
    
    ## [1] "and"
    

    6. Stemming

    stem <- tokenize_word_stems(df$sku_name[1], simplify = T)
    stem
    
    ##  [1] "1000pcs"   "8"         "32mm"      "0.5ml"     "plastic"   "centrifug"
    ##  [7] "tube"      "test"      "tube"      "vial"      "clear"     "plastic"  
    ## [13] "contain"   "home"      "garden"    "storag"    "bottl"
    

    7. Lemmatization

    Model download links:
    English: https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/english-ewt-ud-2.5-191206.udpipe
    Chinese: https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/chinese-gsd-ud-2.5-191206.udpipe

    p_load(udpipe)
    
    # download the English lemmatization model from GitHub
    # (a Chinese model is also available); the download may fail
    # on an unstable network connection
    udmodel <- udpipe_download_model(language = "english", 
                                     model_dir = "model/")
    
    # load the model
    en_model <- udpipe_load_model(udmodel$file_model)
    
    # lemmatization
    udpipe_annotate(en_model, stem) %>% 
      as_tibble() %>% 
    # tokens and their lemmas
      select(token, lemma)
    
    ## token  lemma
    ## <chr>  <chr>
    ## 1000pcs  1000pcs         
    ## 8        8           
    ## 32mm 32mm            
    ## 0.5ml    0.5ml           
    ## plastic  plastic         
    ## centrifug    centrifug           
    ## tube tube            
    ## test test            
    ## tube tube            
    ## vial vial
    ## clear    clear           
    ## plastic  plastic         
    ## contain  contain         
    ## home home            
    ## garden   garden          
    ## storag   storag          
    ## bottl    bottl
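
    Because the input here was already stemmed, token and lemma come out identical. Running the annotator on the raw title instead shows lemmatization proper, e.g. "Bottles" should reduce to "bottle" (a sketch; output omitted):

    # lemmatize the raw, unstemmed title
    udpipe_annotate(en_model, df$sku_name[1]) %>% 
      as_tibble() %>% 
      select(token, lemma)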
    
    udmodel <- udpipe_download_model(language = "chinese", 
                                     model_dir = "model/")
    
    # load the Chinese model
    cn_model <- udpipe_load_model(udmodel$file_model)
    
    # Chinese lemmatization; convert the input to UTF-8 first
    udpipe_annotate(cn_model, iconv(c("别人笑我忒疯癫"), to = "UTF-8")) %>% 
      as_tibble() %>% 
    # tokens and their lemmas
      select(token, lemma)
    
    ## token  lemma
    ## <chr>  <chr>
    ## 别    别           
    ## 人笑   人笑          
    ## 我忒   我忒          
    ## 疯    疯           
    ## 癫    癫
    

    8. Part-of-Speech Tagging

    udpipe_annotate(en_model, df$sku_name[1]) %>% 
      as_tibble() %>% 
      select(token, upos)
    
    ## token  upos
    ## <chr>  <chr>
    ## 1000 NUM         
    ## pcs  NOUN            
    ## 8    NUM         
    ## *    PUNCT           
    ## 32   NUM         
    ## mm   NOUN            
    ## 0.5  NUM         
    ## ml   NOUN            
    ## Plastic  PROPN           
    ## Centrifuge   PROPN   
    ... (remaining rows omitted)
    

    PROPN means proper noun, AUX auxiliary, ADJ adjective, DET determiner, NOUN noun, and PUNCT punctuation. The full universal POS (upos) tag set:

    ADJ: adjective
    ADP: adposition
    ADV: adverb
    AUX: auxiliary
    CCONJ: coordinating conjunction
    DET: determiner
    INTJ: interjection
    NOUN: noun
    NUM: numeral
    PART: particle
    PRON: pronoun
    PROPN: proper noun
    PUNCT: punctuation
    SCONJ: subordinating conjunction
    SYM: symbol
    VERB: verb
    X: other
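
    These tags make it easy to filter tokens by part of speech, e.g. keeping only nouns and proper nouns as candidate keywords (a hedged sketch):

    # keep only (proper) nouns from the annotated title
    udpipe_annotate(en_model, df$sku_name[1]) %>% 
      as_tibble() %>% 
      filter(upos %in% c("NOUN", "PROPN")) %>% 
      pull(token)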

    9. Batch Text Preprocessing

    p_load(tidytext)
    
    df <- unnest_tokens(tbl = df,
                        # name of the output column
                        output = stem, 
                        input = sku_name,
                        # use the stemming tokenizer
                        token = tokenize_word_stems)
    df
    
    ## # A tibble: 21,765 × 2
    ##    sku_id           stem     
    ##    <chr>            <chr>    
    ##  1 2251801564728378 1000pcs  
    ##  2 2251801564728378 8        
    ##  3 2251801564728378 32mm     
    ##  4 2251801564728378 0.5ml    
    ##  5 2251801564728378 plastic  
    ##  6 2251801564728378 centrifug
    ##  7 2251801564728378 tube     
    ##  8 2251801564728378 test     
    ##  9 2251801564728378 tube     
    ## 10 2251801564728378 vial     
    ## # … with 21,755 more rows
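
    With one token per row, a term frequency table is a single count() away (a sketch; output omitted):

    # most frequent stems across all product titles
    count(df, stem, sort = TRUE)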
    
