美文网首页
R for data science chap19——使用pur

R for data science chap19——使用pur

作者: 陆慕熙 | 来源:发表于2020-06-21 23:56 被阅读0次

    19.2 列表列

    dataframe

    • data.frame()默认将列表作为列的列表处理
    > data.frame(x=list(1:3,3:5))
      x.1.3 x.3.5
    1     1     3
    2     2     4
    3     3     5
    

    解决:使用I()

    > data.frame(
    +   x=I(list(1:3,3:5)),
    +   y=c("1,2","3,4,5")
    + )
            x     y
    1 1, 2, 3   1,2
    2 3, 4, 5 3,4,5
    

    tibble

    • 不需要修改输入
    > tibble(
    +   x=list(1:3,3:5),
    +   y=c("1,2","3,4,5")
    + )
    # A tibble: 2 x 2
      x         y    
      <list>    <chr>
    1 <int [3]> 1,2  
    2 <int [3]> 3,4,5
    > tb <-  tibble(
    +   x=list(1:3,3:5),
    +   y=c("1,2","3,4,5")
    + )
    > tb$x
    [[1]]
    [1] 1 2 3
    
    [[2]]
    [1] 3 4 5
    
    > tb$y
    [1] "1,2"   "3,4,5"
    

    tribble()

    • tribble()比tibble()更容易,可以自动识别想要的列表
    > trb <-  tribble(
    +   ~x,~y,
    +   1:3,"1,2",
    +   3:5,"3,4,5"
    + )
    > trb
    # A tibble: 2 x 2
      x         y    
      <list>    <chr>
    1 <int [3]> 1,2  
    2 <int [3]> 3,4,5
    

    19.3 创建列表列

    19.3.1 使用嵌套; nest()

    • 嵌套数据框
    嵌套数据框中的每一行都是一个元观测:
    列(列表列):给出组成元观测的具体观测
    列(其他):给出定义观测的变量
    • nest() 的使用
      • 用于分组数据框:保留用于分组的列,而将其他所有数据归并到列表列中
      • 用于未分组数据框: 需要指定嵌套哪些列

    19.3.2 使用向量化函数

    > df <-  tribble(
    +   ~x1,
    +   "a,b,c",
    +   "d,e,f,g"
    + )
    
    > str_split(df$x1,",")
    [[1]]
    [1] "a" "b" "c"
    
    [[2]]
    [1] "d" "e" "f" "g"
    #嵌套:mutate(function()) [function()生成一个list]
    > df %>% 
    +   mutate(x2= str_split(x1,","))
    # A tibble: 2 x 2
      x1      x2       
      <chr>   <list>   
    1 a,b,c   <chr [3]>
    2 d,e,f,g <chr [4]>
    # 还原嵌套:unnest()
    > df %>% 
    +   mutate(x2= str_split(x1,",")) %>% 
    +   unnest()
    # A tibble: 7 x 2
      x1      x2   
      <chr>   <chr>
    1 a,b,c   a    
    2 a,b,c   b    
    3 a,b,c   c    
    4 d,e,f,g d    
    5 d,e,f,g e    
    6 d,e,f,g f    
    7 d,e,f,g g  
    # 调用不同函数
    > sim <- tribble(
    +   ~f, ~params,
    +   "runif", list(min=-1,max=1),
    +   "rnorm", list(sd=5),
    +   "rpois", list(lambda= 10)
    + )
    > sim %>% 
    +   mutate(sims = invoke_map(f,params,n=10))
    # A tibble: 3 x 3
      f     params           sims      
      <chr> <list>           <list>    
    1 runif <named list [2]> <dbl [10]>
    2 rnorm <named list [1]> <dbl [10]>
    3 rpois <named list [1]> <int [10]>
    

    19.3.3 使用多值摘要

    summarize()通常只搭配返回单一值的摘要函数使用;对于返回更长向量的函数(如quantile()),可以将结果包装在一个list中,从而得到列表列

    > mtcars %>% 
    +   group_by(cyl) %>% 
    +   summarize(q = quantile(mpg))
    `summarise()` regrouping output by 'cyl' (override with `.groups` argument)
    # A tibble: 15 x 2
    # Groups:   cyl [3]
         cyl     q
       <dbl> <dbl>
     1     4  21.4
     2     4  22.8
     3     4  26  
     4     4  30.4
     5     4  33.9
     6     6  17.8
     7     6  18.6
     8     6  19.7
     9     6  21  
    10     6  21.4
    11     8  10.4
    12     8  14.4
    13     8  15.2
    14     8  16.2
    15     8  19.2
    

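    我也不知道为啥这里没有报错/(ㄒoㄒ)/~~(原因:从 dplyr 1.0.0 开始,summarise() 允许摘要函数为每个分组返回多个值,每个值各占一行,所以书中会报错的写法在新版本中可以运行;在 dplyr 1.1.0 之后这种用法已被弃用,推荐改用 reframe()。)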

    * 修改代码: 将结果包装为list

    > mtcars %>% 
    +   group_by(cyl) %>% 
    +   summarize(q = list(quantile(mpg)))
    `summarise()` ungrouping output (override with `.groups` argument)
    # A tibble: 3 x 2
        cyl q        
      <dbl> <list>   
    1     4 <dbl [5]>
    2     6 <dbl [5]>
    3     8 <dbl [5]>
    

    酱紫就对啦O(∩_∩)O

    By the way, group_by() 单独使用时只是给数据添加分组信息,不会改变数据本身,需要搭配其他函数(如 summarize())才能发挥作用

    > mtcars %>% group_by(cyl)
    # A tibble: 32 x 11
    # Groups:   cyl [3]
         mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
       <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
     1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
     2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
     3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
     4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
     5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
     6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
     7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
     8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
     9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
    10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
    

    看起来啥也没变,但没有毛病就是酱紫

    直接unnest()显示结果

    > mtcars %>% 
    +   group_by(cyl) %>% 
    +   summarize(q = list(quantile(mpg))) %>% 
    +   unnest()
    `summarise()` ungrouping output (override with `.groups` argument)
    # A tibble: 15 x 2
         cyl     q
       <dbl> <dbl>
     1     4  21.4
     2     4  22.8
     3     4  26  
     4     4  30.4
     5     4  33.9
     6     6  17.8
     7     6  18.6
     8     6  19.7
     9     6  21  
    10     6  21.4
    11     8  10.4
    12     8  14.4
    13     8  15.2
    14     8  16.2
    15     8  19.2
    Warning message:
    `cols` is now required when using unnest().
    Please use `cols = c(q)` 
    

    为了让结果和比率值一同显示(需要先定义 probs <- c(0.01, 0.25, 0.5, 0.75, 0.99))

    > mtcars %>% 
    +   group_by(cyl) %>% 
    +   summarise(p=list(probs),q=list(quantile(mpg,probs))) %>% 
    +   unnest()
    `summarise()` ungrouping output (override with `.groups` argument)
    # A tibble: 15 x 3
         cyl     p     q
       <dbl> <dbl> <dbl>
     1     4  0.01  21.4
     2     4  0.25  22.8
     3     4  0.5   26  
     4     4  0.75  30.4
     5     4  0.99  33.8
     6     6  0.01  17.8
     7     6  0.25  18.6
     8     6  0.5   19.7
     9     6  0.75  21  
    10     6  0.99  21.4
    11     8  0.01  10.4
    12     8  0.25  14.4
    13     8  0.5   15.2
    14     8  0.75  16.2
    15     8  0.99  19.1
    

    19.3.4 使用命名列表

    > x <-  list(
    +   a=1:5,
    +   b=3:4,
    +   c=5:6
    + )
    > x
    $a
    [1] 1 2 3 4 5
    
    $b
    [1] 3 4
    
    $c
    [1] 5 6
    
    > df <-  enframe(x)
    > df
    # A tibble: 3 x 2
      name  value    
      <chr> <list>   
    1 a     <int [5]>
    2 b     <int [2]>
    3 c     <int [2]>
    > library(stringr)
    > ?str_c
    > x <-  list(
    +   a=1:5,
    +   b=3:4,
    +   c=5:6
    + )
    > x
    $a
    [1] 1 2 3 4 5
    
    $b
    [1] 3 4
    
    $c
    [1] 5 6
    
    > df <-  enframe(x)
    > df
    # A tibble: 3 x 2
      name  value    
      <chr> <list>   
    1 a     <int [5]>
    2 b     <int [2]>
    3 c     <int [2]>
    > library(stringr)
    > df %>% 
    +   mutate(
    +     smry= map2_chr(
    +       name,
    +       value,
    +       ~str_c(.x,":",y[1])
    +     )
    +   )
    Error: Problem with `mutate()` input `smry`.
    x object 'y' not found
    i Input `smry` is `map2_chr(name, value, ~str_c(.x, ":", y[1]))`.
    Run `rlang::last_error()` to see where the error occurred.
    > df
    # A tibble: 3 x 2
      name  value    
      <chr> <list>   
    1 a     <int [5]>
    2 b     <int [2]>
    3 c     <int [2]>
    > df %>% 
    +   mutate(
    +     smry= map2_chr(
    +       name,
    +       value,
    +       ~str_c(.x,":",.y[1])
    +     )
    +   )
    # A tibble: 3 x 3
      name  value     smry 
      <chr> <list>    <chr>
    1 a     <int [5]> a:1  
    2 b     <int [2]> b:3  
    3 c     <int [2]> c:5  
    

    注意:公式中的两个参数必须写成 .x 和 .y,即 str_c(.x,":",.y[1]);如果漏掉 .y 前面的点(写成 y),就会报错 object 'y' not found

    异构列表筛选必备!!!(。^▽^)

    • 根据类型筛选
    > df %>% 
    +   mutate(
    +     smry= map2_chr(
    +       name,
    +       value,
    +       ~str_c(.x,":",y[1])
    +     )
    +   )
    Error: Problem with `mutate()` input `smry`.
    x object 'y' not found
    i Input `smry` is `map2_chr(name, value, ~str_c(.x, ":", y[1]))`.
    

    19.4 简化列表列

    > df <-  tribble(
    +   ~x,
    +   letters[1:5],
    +   1:3,
    +   runif(5)
    + )
    > df
    # A tibble: 3 x 1
      x        
      <list>   
    1 <chr [5]>
    2 <int [3]>
    3 <dbl [5]>
    
    > df %>% 
    +   mutate(
    +     type= map_chr(x,typeof),
    +     length= map_int(x,length)
    +   )
    # A tibble: 3 x 3
      x         type      length
      <list>    <chr>      <int>
    1 <chr [5]> character      5
    2 <int [3]> integer        3
    3 <dbl [5]> double         5
    
    • 从列表列x的每个元素中,按名称提取指定成分的内容

    .null=NA_real_ 指定当某个元素中不存在该成分时返回的默认值(这里返回NA,而不是报错);新版purrr推荐改用 .default 参数

    df2 <- tribble(
      ~x,
      list(a=1,b=2),
      list(a=2,c=4)
    )
    > df2 %>% 
    +   mutate(
    +   a= map_dbl(x,"a"),
    +   b= map_dbl(x,"b",.null=NA_real_)
    + )
    # A tibble: 2 x 3
      x                    a     b
      <list>           <dbl> <dbl>
    1 <named list [2]>     1     2
    2 <named list [2]>     2    NA
    

    19.4.2 嵌套还原

    > tibble(
    +   x=1:2,
    +   y=list(1:4,1)
    + )
    # A tibble: 2 x 2
          x y        
      <int> <list>   
    1     1 <int [4]>
    2     2 <dbl [1]>
    
    
    > tibble(
    +   x=1:2,
    +   y=list(1:4,1)
    + ) %>% 
    +   unnest()
    # A tibble: 5 x 2
          x     y
      <int> <dbl>
    1     1     1
    2     1     2
    3     1     3
    4     1     4
    5     2     1
    Warning message:
    `cols` is now required when using unnest().
    Please use `cols = c(y)` 
    

    第一行重复了4次(因为y的第一个元素长度为4),第二行只重复了一次,这意味着:

    • 不能同时还原包含不同数量元素的两个列表列
    df1 <- tribble(
      ~x,~y,~z,
      1,c("a","b"),1:2,
      2,"c",3
    )
    df1
    df1 %>% unnest()
    

    y和z每行中元素数量相等,可以正常运行

    > df1 <- tribble(
    +   ~x,~y,~z,
    +   1,c("a","b"),1:2,
    +   2,c("b","c"),3
    + )
    > df1
    # A tibble: 2 x 3
          x y         z        
      <dbl> <list>    <list>   
    1     1 <chr [2]> <int [2]>
    2     2 <chr [2]> <dbl [1]>
    > df1 %>% unnest()
    # A tibble: 4 x 3
          x y         z
      <dbl> <chr> <dbl>
    1     1 a         1
    2     1 b         2
    3     2 b         3
    4     2 c         3
    Warning message:
    `cols` is now required when using unnest().
    Please use `cols = c(y, z)` 
    

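    (lll¬ω¬)书上这里应该跑不出来……可能更新后level up 了:从 tidyr 1.0.0 开始,unnest() 同时还原多个列表列时,会把同一行中的值循环补齐到共同长度,长度为1的元素会被重复(这里第二行的z只有一个值3,被重复了两次);只有当同一行中各列表列的元素数量既不相等、又不为1时才会报错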

    相关文章

      网友评论

          本文标题:R for data science chap19——使用pur

          本文链接:https://www.haomeiwen.com/subject/hnnoxktx.html