美文网首页
R|tidyverse|节省管道%>%

R|tidyverse|节省管道%>%

作者: 高大石头 | 来源:发表于2021-04-20 00:27 被阅读0次

    翻看简书的时候偶然发现一篇关于tidyverse管道操作的技术贴:节省tidyverse中的管道 %>% ,还挺实用,实际演练学习下。

    示例数据

    library(tidyverse)
    data("penguins",package = "palmerpenguins")
    penguins <- na.omit(penguins)
    

    1. rename()

    1.1 select()里用rename()

    penguins %>% 
      select(penguins_species = species, #直接进行重命名
             island)
    
    ## # A tibble: 333 x 2
    ##    penguins_species island   
    ##    <fct>            <fct>    
    ##  1 Adelie           Torgersen
    ##  2 Adelie           Torgersen
    ##  3 Adelie           Torgersen
    ##  4 Adelie           Torgersen
    ##  5 Adelie           Torgersen
    ##  6 Adelie           Torgersen
    ##  7 Adelie           Torgersen
    ##  8 Adelie           Torgersen
    ##  9 Adelie           Torgersen
    ## 10 Adelie           Torgersen
    ## # ... with 323 more rows
    

    1.2 count()里面用rename()

    penguins %>% 
      count(species, name = "total") #计数后直接赋值
    
    ## # A tibble: 3 x 2
    ##   species   total
    ##   <fct>     <int>
    ## 1 Adelie      146
    ## 2 Chinstrap    68
    ## 3 Gentoo      119
    

    也可以在计数的同时重命名分组列:

    penguins %>% 
      count(penguins_species = species, name = "total")
    
    ## # A tibble: 3 x 2
    ##   penguins_species total
    ##   <fct>            <int>
    ## 1 Adelie             146
    ## 2 Chinstrap           68
    ## 3 Gentoo             119
    

    注意: 此处传递给name参数的新名称必须用引号引起来,但选定列的新名称不必用引号引起来。

    2. count()内部mutate()

    penguins %>% 
      count(long_beak=bill_length_mm > 50)
    
    ## # A tibble: 2 x 2
    ##   long_beak     n
    ##   <lgl>     <int>
    ## 1 FALSE       281
    ## 2 TRUE         52
    

    也可以指定多个变量

    penguins %>% 
      count(long_beak = bill_length_mm > 50,
            is_adelie = species == "Adelie")
    
    ## # A tibble: 3 x 3
    ##   long_beak is_adelie     n
    ##   <lgl>     <lgl>     <int>
    ## 1 FALSE     FALSE       135
    ## 2 FALSE     TRUE        146
    ## 3 TRUE      FALSE        52
    

    3.transmute()+select()

    实际上,transmute()相当于mutate()+select():结果中只保留参数里列出的列。未修改的列只需写出列名即可保留,也可以在保留的同时对其“重命名”。

    penguins %>% 
      transmute(penguins_species = species,
                island,
                body_mass_kg = body_mass_g/1000)
    
    ## # A tibble: 333 x 3
    ##    penguins_species island    body_mass_kg
    ##    <fct>            <fct>            <dbl>
    ##  1 Adelie           Torgersen         3.75
    ##  2 Adelie           Torgersen         3.8 
    ##  3 Adelie           Torgersen         3.25
    ##  4 Adelie           Torgersen         3.45
    ##  5 Adelie           Torgersen         3.65
    ##  6 Adelie           Torgersen         3.62
    ##  7 Adelie           Torgersen         4.68
    ##  8 Adelie           Torgersen         3.2 
    ##  9 Adelie           Torgersen         3.8 
    ## 10 Adelie           Torgersen         4.4 
    ## # ... with 323 more rows
    

    4. summarize()+ungroup()

    summarize(.groups="drop"):汇总后取消所有分组(分组列本身仍保留在结果中),相当于在summarize()之后再调用ungroup()

    penguins %>% 
      group_by(island, species) %>% 
      summarize(mean_mass = mean(body_mass_g, na.rm = T),.groups = "drop") 
    
    ## # A tibble: 5 x 3
    ##   island    species   mean_mass
    ##   <fct>     <fct>         <dbl>
    ## 1 Biscoe    Adelie        3710.
    ## 2 Biscoe    Gentoo        5092.
    ## 3 Dream     Adelie        3701.
    ## 4 Dream     Chinstrap     3733.
    ## 5 Torgersen Adelie        3709.
    

    5. arrange()+其他功能slice()

    penguins %>% 
      top_n(5, wt=body_mass_g) #按列排序前5行
    
    ## # A tibble: 6 x 8
    ##   species island bill_length_mm bill_depth_mm flipper_length_~ body_mass_g sex  
    ##   <fct>   <fct>           <dbl>         <dbl>            <int>       <int> <fct>
    ## 1 Gentoo  Biscoe           49.2          15.2              221        6300 male 
    ## 2 Gentoo  Biscoe           59.6          17                230        6050 male 
    ## 3 Gentoo  Biscoe           51.1          16.3              220        6000 male 
    ## 4 Gentoo  Biscoe           45.2          16.4              223        5950 male 
    ## 5 Gentoo  Biscoe           49.8          15.9              229        5950 male 
    ## 6 Gentoo  Biscoe           48.8          16.2              222        6000 male 
    ## # ... with 1 more variable: year <int>
    
    penguins %>% 
      slice_max(order_by = body_mass_g,n=5) # slice_*的新功能
    
    ## # A tibble: 6 x 8
    ##   species island bill_length_mm bill_depth_mm flipper_length_~ body_mass_g sex  
    ##   <fct>   <fct>           <dbl>         <dbl>            <int>       <int> <fct>
    ## 1 Gentoo  Biscoe           49.2          15.2              221        6300 male 
    ## 2 Gentoo  Biscoe           59.6          17                230        6050 male 
    ## 3 Gentoo  Biscoe           51.1          16.3              220        6000 male 
    ## 4 Gentoo  Biscoe           48.8          16.2              222        6000 male 
    ## 5 Gentoo  Biscoe           45.2          16.4              223        5950 male 
    ## 6 Gentoo  Biscoe           49.8          15.9              229        5950 male 
    ## # ... with 1 more variable: year <int>
    

    注意:由于体重存在并列值,top_n()和slice_max()都返回了6行而不是5行;不同的是,slice_max()的结果按体重降序排列,而top_n()保持原有的行顺序。

    slice_*()功能最大的变化是为分组数据添加了适当的行为,例如:

    penguins %>% 
      group_by(species) %>% 
      slice_max(body_mass_g,prop=.05) #返回每个物种重量百分比最高的5%的企鹅
    
    ## # A tibble: 16 x 8
    ## # Groups:   species [3]
    ##    species island bill_length_mm bill_depth_mm flipper_length_~ body_mass_g
    ##    <fct>   <fct>           <dbl>         <dbl>            <int>       <int>
    ##  1 Adelie  Biscoe           43.2          19                197        4775
    ##  2 Adelie  Biscoe           41            20                203        4725
    ##  3 Adelie  Torge~           42.9          17.6              196        4700
    ##  4 Adelie  Torge~           39.2          19.6              195        4675
    ##  5 Adelie  Dream            39.8          19.1              184        4650
    ##  6 Adelie  Dream            39.6          18.8              190        4600
    ##  7 Adelie  Biscoe           45.6          20.3              191        4600
    ##  8 Chinst~ Dream            52            20.7              210        4800
    ##  9 Chinst~ Dream            52.8          20                205        4550
    ## 10 Chinst~ Dream            53.5          19.9              205        4500
    ## 11 Gentoo  Biscoe           49.2          15.2              221        6300
    ## 12 Gentoo  Biscoe           59.6          17                230        6050
    ## 13 Gentoo  Biscoe           51.1          16.3              220        6000
    ## 14 Gentoo  Biscoe           48.8          16.2              222        6000
    ## 15 Gentoo  Biscoe           45.2          16.4              223        5950
    ## 16 Gentoo  Biscoe           49.8          15.9              229        5950
    ## # ... with 2 more variables: sex <fct>, year <int>
    

    6. add_count()

    按组计数和求和,add_count()添加一列,其中包含每组(或组的组合)的计数

    penguins %>% 
      add_count(species, name = "count_by_species") %>% 
      select(-contains("mm"))
    
    ## # A tibble: 333 x 6
    ##    species island    body_mass_g sex     year count_by_species
    ##    <fct>   <fct>           <int> <fct>  <int>            <int>
    ##  1 Adelie  Torgersen        3750 male    2007              146
    ##  2 Adelie  Torgersen        3800 female  2007              146
    ##  3 Adelie  Torgersen        3250 female  2007              146
    ##  4 Adelie  Torgersen        3450 female  2007              146
    ##  5 Adelie  Torgersen        3650 male    2007              146
    ##  6 Adelie  Torgersen        3625 female  2007              146
    ##  7 Adelie  Torgersen        4675 male    2007              146
    ##  8 Adelie  Torgersen        3200 female  2007              146
    ##  9 Adelie  Torgersen        3800 male    2007              146
    ## 10 Adelie  Torgersen        4400 male    2007              146
    ## # ... with 323 more rows
    

    也可以使用wt参数按组求和(此处为各物种的体重之和):

    penguins %>% 
      add_count(species,wt=body_mass_g,
                name = "total_weighted_by_species") %>% 
      select(-contains("mm"))
    
    ## # A tibble: 333 x 6
    ##    species island    body_mass_g sex     year total_weighted_by_species
    ##    <fct>   <fct>           <int> <fct>  <int>                     <int>
    ##  1 Adelie  Torgersen        3750 male    2007                    541100
    ##  2 Adelie  Torgersen        3800 female  2007                    541100
    ##  3 Adelie  Torgersen        3250 female  2007                    541100
    ##  4 Adelie  Torgersen        3450 female  2007                    541100
    ##  5 Adelie  Torgersen        3650 male    2007                    541100
    ##  6 Adelie  Torgersen        3625 female  2007                    541100
    ##  7 Adelie  Torgersen        4675 male    2007                    541100
    ##  8 Adelie  Torgersen        3200 female  2007                    541100
    ##  9 Adelie  Torgersen        3800 male    2007                    541100
    ## 10 Adelie  Torgersen        4400 male    2007                    541100
    ## # ... with 323 more rows
    

    默认情况下,add_tally()添加一列总行数,等价于mutate(n = n());指定wt后则添加该列的总和,例如下面所有物种的总体重:

    penguins %>% 
      add_count(species, wt=body_mass_g,
                name = "total_weight_by_speices") %>% 
      add_tally(wt=body_mass_g,
                name = "total_weight_of_all_species") %>% 
      select(1:2,last_col(0):last_col(1))
    
    ## # A tibble: 333 x 4
    ##    species island    total_weight_of_all_species total_weight_by_speices
    ##    <fct>   <fct>                           <int>                   <int>
    ##  1 Adelie  Torgersen                     1400950                  541100
    ##  2 Adelie  Torgersen                     1400950                  541100
    ##  3 Adelie  Torgersen                     1400950                  541100
    ##  4 Adelie  Torgersen                     1400950                  541100
    ##  5 Adelie  Torgersen                     1400950                  541100
    ##  6 Adelie  Torgersen                     1400950                  541100
    ##  7 Adelie  Torgersen                     1400950                  541100
    ##  8 Adelie  Torgersen                     1400950                  541100
    ##  9 Adelie  Torgersen                     1400950                  541100
    ## 10 Adelie  Torgersen                     1400950                  541100
    ## # ... with 323 more rows
    

    参考链接:

    节省tidyverse中的管道%>%

    相关文章

      网友评论

          本文标题:R|tidyverse|节省管道%>%

          本文链接:https://www.haomeiwen.com/subject/wxtglltx.html