美文网首页
Week1: swirl教程 2: Grouping and C

Week1: swirl教程 2: Grouping and Chaining with dplyr

作者: Chamberzero | 来源:发表于2021-10-15 02:41 被阅读0次

    group_by()

    # Group the download log by package so that later summaries are computed
    # once per package instead of once for the whole table.
    by_package <- group_by(cran, package)
    # Show the grouped tibble; its header reports the grouping column.
    print(by_package)
    # A tibble: 225,468 × 11
    # Groups:   package [6,023]
           X date       time        size r_version r_arch r_os      package      version country ip_id
       <int> <chr>      <chr>      <int> <chr>     <chr>  <chr>     <chr>        <chr>   <chr>   <int>
     1     1 2014-07-08 00:54:41   80589 3.1.0     x86_64 mingw32   htmltools    0.2.4   US          1
     2     2 2014-07-08 00:59:53  321767 3.1.0     x86_64 mingw32   tseries      0.10-32 US          2
     3     3 2014-07-08 00:47:13  748063 3.1.0     x86_64 linux-gnu party        1.0-15  US          3
     4     4 2014-07-08 00:48:05  606104 3.1.0     x86_64 linux-gnu Hmisc        3.14-4  US          3
     5     5 2014-07-08 00:46:50   79825 3.0.2     x86_64 linux-gnu digest       0.6.4   CA          4
     6     6 2014-07-08 00:48:04   77681 3.1.0     x86_64 linux-gnu randomForest 4.6-7   US          3
     7     7 2014-07-08 00:48:35  393754 3.1.0     x86_64 linux-gnu plyr         1.8.1   US          3
     8     8 2014-07-08 00:47:30   28216 3.0.2     x86_64 linux-gnu whisker      0.3-2   US          5
     9     9 2014-07-08 00:54:58    5928 NA        NA     NA        Rcpp         0.10.4  CN          6
    10    10 2014-07-08 00:15:35 2206029 3.0.2     x86_64 linux-gnu hflights     0.1     US          7
    # … with 225,458 more rows
    
    # Mean download size per package. The summary column is left unnamed, so
    # dplyr labels it with the expression text `mean(size)`.
    by_package %>%
      summarise(mean(size))
    # A tibble: 6,023 × 2
       package     `mean(size)`
       <chr>              <dbl>
     1 A3                62195.
     2 abc             4826665 
     3 abcdeFBA         455980.
     4 ABCExtremes       22904.
     5 ABCoptim          17807.
     6 ABCp2             30473.
     7 abctools        2589394 
     8 abd              453631.
     9 abf2              35693.
    10 abind             32939.
    # … with 6,013 more rows
    
    # One summary row per package:
    #   count     - number of rows recorded for the package
    #   unique    - number of distinct ip_id values
    #   countries - number of distinct country values
    #   avg_bytes - mean of size
    pack_sum <- by_package %>%
      summarise(
        count = n(),
        unique = n_distinct(ip_id),
        countries = n_distinct(country),
        avg_bytes = mean(size)
      )
    print(pack_sum)
    
    # A tibble: 6,023 × 5
       package     count unique countries avg_bytes
       <chr>       <int>  <int>     <int>     <dbl>
     1 A3             25     24        10    62195.
     2 abc            29     25        16  4826665 
     3 abcdeFBA       15     15         9   455980.
     4 ABCExtremes    18     17         9    22904.
     5 ABCoptim       16     15         9    17807.
     6 ABCp2          18     17        10    30473.
     7 abctools       19     19        11  2589394 
     8 abd            17     16        10   453631.
     9 abf2           13     13         9    35693.
    10 abind         396    365        50    32939.
    # … with 6,013 more rows
    

    quantile()

    # 99th percentile of the per-package download counts. Keep the value
    # instead of discarding it, so the filter below does not rely on a
    # hand-copied constant (the original notes hard-coded 679, presumably the
    # printed quantile rounded down -- confirm against your data).
    count_cutoff <- quantile(pack_sum$count, probs = 0.99)
    count_cutoff
    # Keep only the packages whose download count exceeds that percentile.
    # unname() drops the "99%" label so a plain number is compared.
    top_counts <- filter(pack_sum, count > unname(count_cutoff))
    # An assignment prints nothing; show the result explicitly.
    top_counts
    # A tibble: 61 × 5
       package    count unique countries avg_bytes
       <chr>      <int>  <int>     <int>     <dbl>
     1 bitops      1549   1408        76    28715.
     2 car         1008    837        64  1229122.
     3 caTools      812    699        64   176589.
     4 colorspace  1683   1433        80   357411.
     5 data.table   680    564        59  1252721.
     6 DBI         2599    492        48   206933.
     7 devtools     769    560        55   212933.
     8 dichromat   1486   1257        74   134732.
     9 digest      2210   1894        83   120549.
    10 doSNOW       740     75        24     8364.
    # … with 51 more rows
    
    # Order the top packages from the highest download count to the lowest.
    top_counts_sorted <- top_counts %>%
      arrange(desc(count))
    
    

    pipe (%>%)

    Ctrl+Shift+M : RStudio shortcut that inserts the pipe operator %>%
    GSFA : group_by -> summarize -> filter -> arrange

    # The whole group_by -> summarize -> filter -> arrange sequence as one
    # chain: each step receives the previous step's result as its first
    # argument.
    result3 <- cran %>%
      group_by(package) %>%
      # one row per package with download statistics
      summarise(
        count = n(),
        unique = n_distinct(ip_id),
        countries = n_distinct(country),
        avg_bytes = mean(size)
      ) %>%
      # keep packages seen in more than 60 distinct countries
      filter(countries > 60) %>%
      # most countries first; ties ordered by ascending average size
      arrange(desc(countries), avg_bytes)
    

    相关文章

      网友评论

          本文标题:Week1: swirl教程 2: Grouping and C

          本文链接:https://www.haomeiwen.com/subject/djmpoltx.html