113 - Text Analysis: Supervised Classification

Author: wonphen | Published 2022-10-28 20:32

    1. Binary Classification

    # p_load() comes from the pacman package; dplyr supplies the filter(),
    # mutate(), and select() verbs used throughout
    library(pacman)
    p_load(fastrtext, tidyfst, dplyr)
    
    data("train_sentences")
    data("test_sentences")
    
    # Keep only two classes: "AIMX" and "CONT"
    train_raw <- train_sentences %>% 
      as_tibble() %>% 
      filter(class.text %in% c("AIMX", "CONT"))
    
    test_raw <- test_sentences %>% 
      as_tibble() %>% 
      filter(class.text %in% c("AIMX", "CONT"))
    
    table(train_raw$class.text)
    
    ## 
    ## AIMX CONT 
    ##  149  144
    
    # Preprocessing: tokenize, remove stop words, compute tf-idf
    p_load(tidytext)
    
    sel_word <- train_raw %>% 
      # tokenize
      unnest_tokens(word, text) %>% 
      # remove stop words
      anti_join(stop_words) %>% 
      group_by(class.text) %>% 
      count(word) %>% 
      ungroup() %>% 
      bind_tf_idf(word, class.text, n) %>% 
      distinct(word, .keep_all = T) %>% 
      # keep the top 100 by tf-idf
      top_n(100, tf_idf) %>% 
      select(word, tf_idf)
    
    sel_word
    
    ## # A tibble: 107 × 2
    ##    word           tf_idf
    ##    <chr>           <dbl>
    ##  1 adopted       0.00210
    ##  2 algebraic     0.00210
    ##  3 anchors       0.00140
    ##  4 answer        0.00175
    ##  5 ases          0.00210
    ##  6 attempt       0.00210
    ##  7 avoid         0.00175
    ##  8 balanced      0.00210
    ##  9 bernoulli     0.00210
    ## 10 circumstances 0.00210
    ## # … with 97 more rows
    

    Because of tied tf-idf scores, the final selection contains more than 100 words (107 here).
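
    If exactly 100 words are needed, the ties can be broken explicitly. A
    minimal sketch using dplyr's slice_max() (requires dplyr >= 1.0.0;
    sel_word_100 is a hypothetical name):

    # Sketch: same pipeline, but break ties so exactly 100 words remain
    sel_word_100 <- train_raw %>% 
      unnest_tokens(word, text) %>% 
      anti_join(stop_words) %>% 
      count(class.text, word) %>% 
      bind_tf_idf(word, class.text, n) %>% 
      distinct(word, .keep_all = TRUE) %>% 
      slice_max(tf_idf, n = 100, with_ties = FALSE) %>% 
      select(word, tf_idf)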

    # Rebuild the training and test sets
    train1 <- train_raw %>% 
      # give each sentence its own id
      mutate(id = 1:n()) %>% 
      unnest_tokens(word, text) %>% 
      anti_join(stop_words) %>%
      # keep only the selected words
      inner_join(sel_word) %>% 
      # drop duplicate words within a sentence
      distinct(id, word, .keep_all = T) %>% 
      # reshape long to wide
      wider_dt(name = "word",
               value = "tf_idf",
               fill = 0)
    
    # Some documents may contain none of the selected words, so join back
    # to the full set of sentences and fill the gaps with 0
    train <- train_raw %>% 
      mutate(id = 1:n()) %>% 
      select(id, class.text) %>% 
      left_join(train1) %>% 
      # replace NAs with 0
      replace_na_dt(to = 0) %>% 
      select(-id)
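
    A quick shape check (illustrative) confirms the joins worked: one row
    per sentence, one column per selected word plus class.text.

    dim(train)
    head(names(train))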
    
    # Apply the same steps to the test set
    test1 <- test_raw %>% 
      # give each sentence its own id
      mutate(id = 1:n()) %>% 
      unnest_tokens(word, text) %>% 
      anti_join(stop_words) %>%
      # keep only the selected words
      inner_join(sel_word) %>% 
      # drop duplicate words within a sentence
      distinct(id, word, .keep_all = T) %>% 
      # reshape long to wide
      wider_dt(name = "word",
               value = "tf_idf",
               fill = 0)
    
    # Some documents may contain none of the selected words, so join back
    # to the full set of sentences and fill the gaps with 0
    test2 <- test_raw %>% 
      mutate(id = 1:n()) %>% 
      select(id, class.text) %>% 
      left_join(test1) %>% 
      # replace NAs with 0
      replace_na_dt(to = 0) %>% 
      select(-id)
    
    # The test set must end up with the same columns as the training set,
    # so add every missing word column and fill it with 0.
    # Find the columns that need to be added
    to_add <- setdiff(names(train), names(test2))
    
    # The parentheses are required: they tell data.table to treat to_add
    # as a vector of column names rather than the name of a single column
    test <- test2[, (to_add) := 0]
    
    # check that the number of columns matches
    length(train) == length(test)
    
    ## [1] TRUE
    
    # check that the column names match
    setequal(names(train), names(test))
    
    ## [1] TRUE
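
    setequal() checks membership only, not order. glm() matches columns by
    name, so order does not matter here, but a model fed a numeric matrix
    would need identical ordering; data.table's setcolorder() can align it:

    # Optional: put the test-set columns in the same order as the training set
    setcolorder(test, names(train))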
    
    # Convert the response to a factor; otherwise many machine learning
    # models would treat this as a regression problem
    train <- train %>% 
      mutate_dt(class.text = as.factor(class.text))
    
    test <- test %>% 
      mutate_dt(class.text = as.factor(class.text))
    
    # Model fitting and evaluation
    train_model <- glm(class.text ~ ., data = train,
                       family = "binomial")
    summary(train_model)
    
    ## 
    ## Call:
    ## glm(formula = class.text ~ ., family = "binomial", data = train)
    ## 
    ## Deviance Residuals: 
    ##      Min        1Q    Median        3Q       Max  
    ## -1.62589  -0.00003   0.00000   0.00004   0.78760  
    ## 
    ## Coefficients: (13 not defined because of singularities)
    ##                  Estimate Std. Error z value Pr(>|z|)    
    ## (Intercept)     1.012e+00  2.919e-01   3.465  0.00053 ***
    ## adopted        -1.111e+03  6.440e+06   0.000  0.99986    
    ## al              9.213e+03  6.783e+06   0.001  0.99892    
    ## algebraic      -1.027e+04  6.089e+06  -0.002  0.99865    
    ## anchors         1.443e+04  2.449e+07   0.001  0.99953    
    ## answer          1.306e+04  2.365e+07   0.001  0.99956    
    ## ases           -9.862e+03  5.586e+06  -0.002  0.99859    
    ## aspects         1.086e+04  6.838e+06   0.002  0.99873    
    ## attempt         9.617e+03  1.633e+07   0.001  0.99953    
    ## avoid          -1.333e+03  1.234e+07   0.000  0.99991    
    ## balanced       -2.050e+04  2.912e+07  -0.001  0.99944    
    ## bernoulli      -1.073e+04  5.670e+06  -0.002  0.99849    
    ## biological      1.149e+04  4.592e+07   0.000  0.99980    
    ## bounds         -3.674e+03  4.995e+07   0.000  0.99994    
    ## cdna           -1.073e+03  1.901e+07   0.000  0.99995    
    ## cellular       -3.655e+02  9.442e+06   0.000  0.99997    
    ## choices         6.010e+03  5.164e+06   0.001  0.99907    
    ## circumstances   1.217e+04  3.864e+07   0.000  0.99975    
    ## clinical        1.352e+04  7.915e+07   0.000  0.99986    
    ## computer       -1.514e+04  7.555e+06  -0.002  0.99840    
    ## conditions     -2.507e+01  2.147e+07   0.000  1.00000    
    ## continuous     -6.085e+03  3.183e+06  -0.002  0.99847    
    ## control        -1.609e+04  2.690e+07  -0.001  0.99952    
    ## current        -1.077e+04  2.630e+07   0.000  0.99967    
    ## describe       -1.609e+04  2.083e+07  -0.001  0.99938    
    ## designed       -1.077e+04  2.197e+07   0.000  0.99961    
    ## difficult       3.709e+03  2.250e+06   0.002  0.99868    
    ## discuss        -1.566e+04  1.092e+07  -0.001  0.99886    
    ## drawn                  NA         NA      NA       NA    
    ## easy            1.357e+04  1.206e+07   0.001  0.99910    
    ## empirical       1.369e+04  9.249e+06   0.001  0.99882    
    ## equal           1.144e+04  9.394e+06   0.001  0.99903    
    ## evs            -2.972e+04  4.291e+07  -0.001  0.99945    
    ## examine        -1.216e+04  6.315e+06  -0.002  0.99846    
    ## existing       -1.482e+04  8.713e+07   0.000  0.99986    
    ## expected       -5.598e+03  1.838e+07   0.000  0.99976    
    ## experience     -8.787e-08  1.944e+07   0.000  1.00000    
    ## extends         1.168e+04  4.297e+07   0.000  0.99978    
    ## extensive      -1.135e+03  3.134e+07   0.000  0.99997    
    ## fixed           1.449e+04  4.484e+07   0.000  0.99974    
    ## generalization -2.215e+03  8.198e+07   0.000  0.99998    
    ## generation     -8.624e+02  1.003e+07   0.000  0.99993    
    ## genome         -1.481e+04  8.492e+06  -0.002  0.99861    
    ## implies                NA         NA      NA       NA    
    ## independently   1.505e+04  3.466e+07   0.000  0.99965    
    ## indirect        1.368e+04  9.175e+06   0.001  0.99881    
    ## influence       4.287e+01  2.807e+07   0.000  1.00000    
    ## influences      1.029e+04  2.848e+07   0.000  0.99971    
    ## introduce      -6.291e+03  1.095e+07  -0.001  0.99954    
    ## iterations     -1.870e+02  1.374e+07   0.000  0.99999    
    ## judgments              NA         NA      NA       NA    
    ## libraries              NA         NA      NA       NA    
    ## library        -1.604e+04  2.946e+07  -0.001  0.99957    
    ## limitations     2.338e+02  3.617e+07   0.000  0.99999    
    ## limited         4.238e+03  2.048e+06   0.002  0.99835    
    ## means          -1.467e+04  4.186e+07   0.000  0.99972    
    ## measure         1.442e+04  2.410e+07   0.001  0.99952    
    ## minimization    1.154e+04  2.074e+07   0.001  0.99956    
    ## missing         1.392e+04  1.551e+07   0.001  0.99928    
    ## modified        1.494e+03  3.181e+07   0.000  0.99996    
    ## mouse           1.075e+03  2.722e+07   0.000  0.99997    
    ## network        -7.355e+03  3.956e+06  -0.002  0.99852    
    ## neuronal        4.767e+03  3.914e+06   0.001  0.99903    
    ## openness        9.574e+03  4.421e+07   0.000  0.99983    
    ## optimal         1.430e+04  1.174e+07   0.001  0.99903    
    ## paper          -1.342e+03  4.259e+05  -0.003  0.99749    
    ## parallel       -9.937e+03  6.048e+06  -0.002  0.99869    
    ## parameters      1.383e+04  5.279e+07   0.000  0.99979    
    ## parametric      1.144e+04  1.627e+07   0.001  0.99944    
    ## peptides       -8.782e+03  4.142e+06  -0.002  0.99831    
    ## personality    -1.537e+04  3.681e+07   0.000  0.99967    
    ## pie                    NA         NA      NA       NA    
    ## pml            -2.615e+04  4.132e+07  -0.001  0.99949    
    ## pro                    NA         NA      NA       NA    
    ## probability    -1.171e+04  4.055e+07   0.000  0.99977    
    ## processes              NA         NA      NA       NA    
    ## propose        -9.195e+03  5.952e+06  -0.002  0.99877    
    ## question       -9.195e+03  8.418e+06  -0.001  0.99913    
    ## radically       1.884e+04  2.282e+07   0.001  0.99934    
    ## range                  NA         NA      NA       NA    
    ## regularized            NA         NA      NA       NA    
    ## representative -2.824e+04  2.248e+07  -0.001  0.99900    
    ## require         3.128e+02  8.615e+06   0.000  0.99997    
    ## response        1.078e+04  6.368e+06   0.002  0.99865    
    ## risk           -1.073e+04  1.389e+07  -0.001  0.99938    
    ## sampling               NA         NA      NA       NA    
    ## sces           -1.616e+04  4.698e+07   0.000  0.99973    
    ## selection       4.287e+01  1.965e+07   0.000  1.00000    
    ## sequences      -4.554e+02  1.504e+07   0.000  0.99998    
    ## short          -5.361e+03  9.521e+07   0.000  0.99996    
    ## slightly       -4.269e+02  2.352e+07   0.000  0.99999    
    ## stability       1.701e+04  3.439e+07   0.000  0.99961    
    ## structure      -9.061e+03  5.052e+06  -0.002  0.99857    
    ## students       -1.306e+04  2.365e+07  -0.001  0.99956    
    ## substantial     1.133e+04  7.623e+06   0.001  0.99881    
    ## tailored       -2.747e+02  2.322e+07   0.000  0.99999    
    ## target          1.501e+03  2.632e+07   0.000  0.99995    
    ## taxonomy               NA         NA      NA       NA    
    ## theorem        -5.853e+02  9.615e+06   0.000  0.99995    
    ## therapy         7.685e+03  6.300e+06   0.001  0.99903    
    ## trait          -1.744e+04  1.837e+07  -0.001  0.99924    
    ## transfer       -1.287e+04  1.179e+07  -0.001  0.99913    
    ## type           -1.482e+04  8.563e+06  -0.002  0.99862    
    ## typically       1.105e+04  6.614e+06   0.002  0.99867    
    ## uncertainty    -5.476e-08  1.591e+07   0.000  1.00000    
    ## unlike          1.133e+04  1.548e+07   0.001  0.99942    
    ## weakening              NA         NA      NA       NA    
    ## widely                 NA         NA      NA       NA    
    ## ---
    ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    ## 
    ## (Dispersion parameter for binomial family taken to be 1)
    ## 
    ##     Null deviance: 406.10  on 292  degrees of freedom
    ## Residual deviance:  69.59  on 198  degrees of freedom
    ## AIC: 259.59
    ## 
    ## Number of Fisher Scoring iterations: 20
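
    A caution on reading this summary: the huge coefficients and standard
    errors, p-values near 1, and the coefficients "not defined because of
    singularities" are classic signs of (quasi-)complete separation and
    collinearity. With roughly a hundred sparse word indicators for 293
    sentences, many words occur in only one class, so while the model can
    still rank observations, the individual coefficients and p-values
    should not be interpreted; a penalized logistic regression would
    likely be more stable here.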
    
    obj <- pull(test, class.text)
    
    # predict on the test set
    test_pre <- predict.glm(train_model, 
                            select(test, -class.text),
                            type = "response")
    
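    # glm() with a binomial family models the probability of the second
    # factor level (here "CONT"), so probabilities >= 0.5 map to
    # levels(obj)[2]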
    test_pre <- ifelse(test_pre >= 0.5, 
                       levels(obj)[2], levels(obj)[1]) %>% 
      as.factor()
    
    # Confusion matrix: accuracy, Kappa, and related statistics
    caret::confusionMatrix(test_pre, obj)
    
    ## Confusion Matrix and Statistics
    ## 
    ##           Reference
    ## Prediction AIMX CONT
    ##       AIMX   35    2
    ##       CONT    6   22
    ##                                           
    ##                Accuracy : 0.8769          
    ##                  95% CI : (0.7718, 0.9453)
    ##     No Information Rate : 0.6308          
    ##     P-Value [Acc > NIR] : 8.823e-06       
    ##                                           
    ##                   Kappa : 0.7446          
    ##                                           
    ##  Mcnemar's Test P-Value : 0.2888          
    ##                                           
    ##             Sensitivity : 0.8537          
    ##             Specificity : 0.9167          
    ##          Pos Pred Value : 0.9459          
    ##          Neg Pred Value : 0.7857          
    ##              Prevalence : 0.6308          
    ##          Detection Rate : 0.5385          
    ##    Detection Prevalence : 0.5692          
    ##       Balanced Accuracy : 0.8852          
    ##                                           
    ##        'Positive' Class : AIMX
    
    # ROC curve
    p_load(ROCit)
    
    # the factor variables need to be converted to numeric
    obj_roc <- rocit(score = as.numeric(obj), 
                     class = as.numeric(test_pre))
    
    summary(obj_roc)
    
    ## Method used: empirical               
    ## Number of positive(s): 28                
    ## Number of negative(s): 37                
    ## Area under curve: 0.8658
    
    plot(obj_roc, legend = F, YIndex = F)
    
    [Figure: ROC curve]

    The dashed line marks the chance baseline; if the solid line falls below it, the model performs worse than random guessing.
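
    Note that the rocit() call above passes two vectors of hard labels, so
    the "curve" is effectively built from a single threshold. rocit() is
    normally given the continuous predicted probabilities as score and the
    true labels as class. A minimal sketch (test_prob is a hypothetical
    name for the probabilities saved before thresholding; negref names the
    negative class):

    test_prob <- predict.glm(train_model, 
                             select(test, -class.text),
                             type = "response")
    
    # "CONT" is the modeled (positive) class, so "AIMX" is the reference
    obj_roc2 <- rocit(score = test_prob, class = obj, negref = "AIMX")
    summary(obj_roc2)
    plot(obj_roc2, legend = F, YIndex = F)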

    2. Multiclass Classification

    Multiclass classification works like binary classification but with more than two labels. Suitable algorithms include decision trees, naive Bayes, and support vector machines; decision trees in turn include the C4.5, CART, and C5.0 methods. This example uses CART.

    train_raw <- train_sentences %>% 
      as_tibble() %>% 
      filter(class.text %in% c("AIMX", "CONT", "BASE"))
    
    test_raw <- test_sentences %>% 
      as_tibble() %>% 
      filter(class.text %in% c("AIMX", "CONT", "BASE"))
    
    table(train_raw$class.text)
    
    ## 
    ## AIMX BASE CONT 
    ##  149   48  144
    
    sel_word <- train_raw %>% 
      # tokenize
      unnest_tokens(word, text) %>% 
      # remove stop words
      anti_join(stop_words) %>% 
      group_by(class.text) %>% 
      count(word) %>% 
      ungroup() %>% 
      bind_tf_idf(word, class.text, n) %>% 
      distinct(word, .keep_all = T) %>% 
      # keep the top 100 by tf-idf
      top_n(100, tf_idf) %>% 
      select(word, tf_idf)
    
    sel_word
    
    ## # A tibble: 127 × 2
    ##    word           tf_idf
    ##    <chr>           <dbl>
    ##  1 adopted       0.00334
    ##  2 algebraic     0.00334
    ##  3 attempt       0.00334
    ##  4 balanced      0.00334
    ##  5 bernoulli     0.00334
    ##  6 circumstances 0.00334
    ##  7 continuous    0.00556
    ##  8 drawn         0.00334
    ##  9 experience    0.00389
    ## 10 implies       0.00334
    ## # … with 117 more rows
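
    As before, tied tf-idf scores leave more than 100 words (127 here).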
    
    # Rebuild the training and test sets
    train1 <- train_raw %>% 
      # give each sentence its own id
      mutate(id = 1:n()) %>% 
      unnest_tokens(word, text) %>% 
      anti_join(stop_words) %>%
      # keep only the selected words
      inner_join(sel_word) %>% 
      # drop duplicate words within a sentence
      distinct(id, word, .keep_all = T) %>% 
      # reshape long to wide
      wider_dt(name = "word",
               value = "tf_idf",
               fill = 0)
    
    # Some documents may contain none of the selected words, so join back
    # to the full set of sentences and fill the gaps with 0
    train <- train_raw %>% 
      mutate(id = 1:n()) %>% 
      select(id, class.text) %>% 
      left_join(train1) %>% 
      # replace NAs with 0
      replace_na_dt(to = 0) %>% 
      select(-id)
    
    # Apply the same steps to the test set
    test1 <- test_raw %>% 
      # give each sentence its own id
      mutate(id = 1:n()) %>% 
      unnest_tokens(word, text) %>% 
      anti_join(stop_words) %>%
      # keep only the selected words
      inner_join(sel_word) %>% 
      # drop duplicate words within a sentence
      distinct(id, word, .keep_all = T) %>% 
      # reshape long to wide
      wider_dt(name = "word",
               value = "tf_idf",
               fill = 0)
    
    # Some documents may contain none of the selected words, so join back
    # to the full set of sentences and fill the gaps with 0
    test2 <- test_raw %>% 
      mutate(id = 1:n()) %>% 
      select(id, class.text) %>% 
      left_join(test1) %>% 
      # replace NAs with 0
      replace_na_dt(to = 0) %>% 
      select(-id)
    
    # The test set must end up with the same columns as the training set,
    # so add every missing word column and fill it with 0.
    # Find the columns that need to be added
    to_add <- setdiff(names(train), names(test2))
    
    test <- test2[, (to_add) := 0]
    
    # check that the number of columns matches
    length(train) == length(test)
    
    ## [1] TRUE
    
    # check that the column names match
    setequal(names(train), names(test))
    
    ## [1] TRUE
    
    # Convert the response to a factor; otherwise many machine learning
    # models would treat this as a regression problem
    train <- train %>% 
      mutate_dt(class.text = as.factor(class.text))
    
    test <- test %>% 
      mutate_dt(class.text = as.factor(class.text))
    
    # Model fitting and evaluation
    p_load(rpart)
    rpart_model <- rpart(class.text ~ ., data = train)
    summary(rpart_model)
    
    ## Call:
    ## rpart(formula = class.text ~ ., data = train)
    ##   n= 341 
    ## 
    ##          CP nsplit rel error    xerror       xstd
    ## 1 0.1979167      0 1.0000000 1.0781250 0.04697417
    ## 2 0.0100000      1 0.8020833 0.8020833 0.04786331
    ## 
    ## Variable importance
    ##     paper   adopted   attempt     drawn   implies weakening 
    ##        60         8         8         8         8         8 
    ## 
    ## Node number 1: 341 observations,    complexity param=0.1979167
    ##   predicted class=AIMX  expected loss=0.5630499  P(node) =1
    ##     class counts:   149    48   144
    ##    probabilities: 0.437 0.141 0.422 
    ##   left son=2 (45 obs) right son=3 (296 obs)
    ##   Primary splits:
    ##       paper     < 0.00441169  to the right, improve=23.668390, (0 missing)
    ##       difficult < 0.00398667  to the left,  improve= 7.949241, (0 missing)
    ##       limited   < 0.003701908 to the left,  improve= 7.358934, (0 missing)
    ##       introduce < 0.002779889 to the right, improve= 5.307298, (0 missing)
    ##       neuronal  < 0.003417146 to the left,  improve= 5.033265, (0 missing)
    ##   Surrogate splits:
    ##       adopted   < 0.001667934 to the right, agree=0.886, adj=0.133, (0 split)
    ##       attempt   < 0.001667934 to the right, agree=0.886, adj=0.133, (0 split)
    ##       drawn     < 0.001667934 to the right, agree=0.886, adj=0.133, (0 split)
    ##       implies   < 0.001667934 to the right, agree=0.886, adj=0.133, (0 split)
    ##       weakening < 0.001667934 to the right, agree=0.886, adj=0.133, (0 split)
    ## 
    ## Node number 2: 45 observations
    ##   predicted class=AIMX  expected loss=0.04444444  P(node) =0.1319648
    ##     class counts:    43     2     0
    ##    probabilities: 0.956 0.044 0.000 
    ## 
    ## Node number 3: 296 observations
    ##   predicted class=CONT  expected loss=0.5135135  P(node) =0.8680352
    ##     class counts:   106    46   144
    ##    probabilities: 0.358 0.155 0.486
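
    The tree makes only a single split, on the word "paper": node 2
    collects the 45 sentences containing it (almost all AIMX), while
    everything else falls into node 3 and is labeled CONT. BASE is never
    predicted, which shows up clearly in the confusion matrix below.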
    
    # test-set labels
    obj <- pull(test, class.text)
    
    # predict on the test set
    test_pre <- predict(rpart_model, test, type = "class") %>% 
      as.factor()
    
    # Confusion matrix: accuracy, Kappa, and related statistics
    caret::confusionMatrix(test_pre, obj)
    
    ## Confusion Matrix and Statistics
    ## 
    ##           Reference
    ## Prediction AIMX BASE CONT
    ##       AIMX   15    0    0
    ##       BASE    0    0    0
    ##       CONT   26   13   24
    ## 
    ## Overall Statistics
    ##                                           
    ##                Accuracy : 0.5             
    ##                  95% CI : (0.3846, 0.6154)
    ##     No Information Rate : 0.5256          
    ##     P-Value [Acc > NIR] : 0.7149          
    ##                                           
    ##                   Kappa : 0.2312          
    ##                                           
    ##  Mcnemar's Test P-Value : NA              
    ## 
    ## Statistics by Class:
    ## 
    ##                      Class: AIMX Class: BASE Class: CONT
    ## Sensitivity               0.3659      0.0000      1.0000
    ## Specificity               1.0000      1.0000      0.2778
    ## Pos Pred Value            1.0000         NaN      0.3810
    ## Neg Pred Value            0.5873      0.8333      1.0000
    ## Prevalence                0.5256      0.1667      0.3077
    ## Detection Rate            0.1923      0.0000      0.3077
    ## Detection Prevalence      0.1923      0.0000      0.8077
    ## Balanced Accuracy         0.6829      0.5000      0.6389
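
    Accuracy (0.5) falls below the no-information rate, and BASE has zero
    sensitivity, because the default complexity penalty stopped the tree
    after one split. One option is to let the tree grow deeper and prune
    it with the cross-validated error; a sketch with hypothetical control
    settings:

    rpart_deep <- rpart(class.text ~ ., data = train,
                        control = rpart.control(cp = 0.001, minsplit = 5))
    
    # cross-validated error by complexity parameter, for pruning
    printcp(rpart_deep)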
    
