美文网首页
R-tidyr 数据整理

R-tidyr 数据整理

作者: JY_Liu | 来源:发表于2020-03-24 16:36 被阅读0次

    Three interrelated rules which make a dataset tidy:

    1. Each variable must have its own column.
    2. Each observation must have its own row.
    3. Each value must have its own cell.

    1. pivot

    1. One variable might be spread across multiple columns. --> pivot_longer()
    2. One observation might be scattered across multiple rows. --> pivot_wider()

    1.1 pivot_longer

    pivot_longer(data, cols, names_to = "name", values_to = "value", values.drop.na = FALSE)
    data 需要转换的数据
    cols 进行转换的列, the columns to pivot are specified with dplyr::select() style notation.
    names_to 列名转换为
    values_to 值放置于
    values_drop_na 去掉缺失值

    > table4a
    # A tibble: 3 x 3
      country     `1999` `2000`
    * <chr>        <int>  <int>
    1 Afghanistan    745   2666
    2 Brazil       37737  80488
    3 China       212258 213766
    > table4a %>% 
    +   pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")
    # A tibble: 6 x 3
      country     year   cases
      <chr>       <chr>  <int>
    1 Afghanistan 1999     745
    2 Afghanistan 2000    2666
    3 Brazil      1999   37737
    4 Brazil      2000   80488
    5 China       1999  212258
    6 China       2000  213766
    
    # table4a中列名1999与2000都不符合命名标准,所以需要用反括号 ’ 引用
    

    1.2 pivot_wider

    pivot_wider(data, names_from = 'name', values_from = 'values')

    > table2
    # A tibble: 12 x 4
       country      year type            count
       <chr>       <int> <chr>           <int>
     1 Afghanistan  1999 cases             745
     2 Afghanistan  1999 population   19987071
     3 Afghanistan  2000 cases            2666
     4 Afghanistan  2000 population   20595360
     5 Brazil       1999 cases           37737
     6 Brazil       1999 population  172006362
     7 Brazil       2000 cases           80488
     8 Brazil       2000 population  174504898
     9 China        1999 cases          212258
    10 China        1999 population 1272915272
    11 China        2000 cases          213766
    12 China        2000 population 1280428583
    > table2 %>%
    +   pivot_wider(names_from = type, values_from = count)
    # A tibble: 6 x 4
      country      year  cases population
      <chr>       <int>  <int>      <int>
    1 Afghanistan  1999    745   19987071
    2 Afghanistan  2000   2666   20595360
    3 Brazil       1999  37737  172006362
    4 Brazil       2000  80488  174504898
    5 China        1999 212258 1272915272
    6 China        2000 213766 1280428583
    

    exercise:

    > preg <- tribble(
    +   ~pregnant, ~male, ~female,
    +   "yes",     NA,    10,
    +   "no",      20,    12
    + )
    > preg
    # A tibble: 2 x 3
      pregnant  male female
      <chr>    <dbl>  <dbl>
    1 yes         NA     10
    2 no          20     12
    
    > preg %>% 
    +   preg %>% 
      pivot_longer(cols = 2:3,
                   # cols = male:female,
                   # cols = c("male","female"),
                   names_to = "gender", 
                   values_to = "count")
    # A tibble: 4 x 3
      pregnant gender count
      <chr>    <chr>  <dbl>
    1 yes      male      NA
    2 yes      female    10
    3 no       male      20
    4 no       female    12
    

    2 separate and unite

    2.1 separate

    separate(data, col, into, sep=, convert, remove)
    col 需要拆分的列
    into = 拆分为
    sep = 拆分位置(特定符号或制定位置)
    convert 是否转换数据类型(默认FALASE)
    remove 删除原有列(默认TRUE)

    > table3
    # A tibble: 6 x 3
      country      year rate             
    * <chr>       <int> <chr>            
    1 Afghanistan  1999 745/19987071     
    2 Afghanistan  2000 2666/20595360    
    3 Brazil       1999 37737/172006362  
    4 Brazil       2000 80488/174504898  
    5 China        1999 212258/1272915272
    6 China        2000 213766/1280428583
    
    > table3 %>% 
    +   separate(rate, into = c("cases", "population"), sep = "/", convert = TRUE)
    # A tibble: 6 x 4
      country      year  cases population
      <chr>       <int>  <int>      <int>
    1 Afghanistan  1999    745   19987071
    2 Afghanistan  2000   2666   20595360
    3 Brazil       1999  37737  172006362
    4 Brazil       2000  80488  174504898
    5 China        1999 212258 1272915272
    6 China        2000 213766 1280428583
    
    • extract() uses regluar expression to capture groups and turn groups into multiple columns.

    2.2 separate_rows

    separate_rows(data, ..., sep = "[^[:alnum:].]+", convert = FALSE)

    > teacher <- data.frame(
    +   teacher = c("a", "b"),
    +   class = c("1,2", "1,3,5")
    + )
    > teacher 
      teacher class
    1       a   1,2
    2       b 1,3,5
    > teacher %>% separate_rows("class", sep = ",", convert = TRUE)
      teacher class
    1       a     1
    2       a     2
    3       b     1
    4       b     3
    5       b     5
    

    2.3 unite

    unite(data, col, ..., sep = "_", remove = TRUE, na.rm = FALSE)

    > table5
    # A tibble: 6 x 4
      country     century year  rate             
    * <chr>       <chr>   <chr> <chr>            
    1 Afghanistan 19      99    745/19987071     
    2 Afghanistan 20      00    2666/20595360    
    3 Brazil      19      99    37737/172006362  
    4 Brazil      20      00    80488/174504898  
    5 China       19      99    212258/1272915272
    6 China       20      00    213766/1280428583
    
    > table5 %>%
    +   unite(col = new,century, year, sep = "")
    # A tibble: 6 x 3
      country     new   rate             
      <chr>       <chr> <chr>            
    1 Afghanistan 1999  745/19987071     
    2 Afghanistan 2000  2666/20595360    
    3 Brazil      1999  37737/172006362  
    4 Brazil      2000  80488/174504898  
    5 China       1999  212258/1272915272
    6 China       2000  213766/1280428583
    

    相关文章

      网友评论

          本文标题:R-tidyr 数据整理

          本文链接:https://www.haomeiwen.com/subject/laptyhtx.html