美文网首页R语言R语言训练生信小白
R for data science ||使用readr进行数据

R for data science ||使用readr进行数据

作者: 周运来就是我 | 来源:发表于2019-07-19 08:24 被阅读11次

    使用R包提供的数据是学习数据科学工具的好方法,但是到了某个时候,您会希望停止学习,开始处理自己的数据。在本章中,您将学习如何将纯文本矩形文件读入R。在这里,我们只讨论数据导入的皮毛,但其中的许多原则同样适用于其他形式的数据。

    library(tidyverse)
    setwd("D:\\Users\\Administrator\\Desktop\\RStudio\\R-Programming")
    heights <- read_csv("heights.csv")
    
    Parsed with column specification:
    cols(
      earn = col_double(),
      height = col_double(),
      sex = col_character(),
      ed = col_double(),
      age = col_double(),
      race = col_character()
    )
    
    
    ?read_csv()
    ? read_csv2()
    ? read_tsv()
    ? read_delim()
    ?read_fwf()
    ?read_log()
    

    直接创建行内csv文件。

    read_csv("a,b,c
              1,2,3
             4,5,6")
    
    
    # A tibble: 2 x 3
          a     b     c
      <dbl> <dbl> <dbl>
    1     1     2     3
    2     4     5     6
    

    用skip = n来跳过前n行;或者用comment = "#"丢弃所有以#开头的行。

    read_csv("The first line of metadata
      The second line of metadata
             x,y,z
             1,2,3", skip = 2)
    
    # A tibble: 1 x 3
          x     y     z
      <dbl> <dbl> <dbl>
    1     1     2     3
    
    
    read_csv("# A comment I want to skip
      x,y,z
      1,2,3", comment = "#")
    
    # A tibble: 1 x 3
          x     y     z
      <dbl> <dbl> <dbl>
    1     1     2     3
    

    数据没有列名时,可以用col_names = FALSE告诉read_csv()不要把第一行当作列名,而是依次标记为X1到Xn;也可以向col_names传入一个字符向量作为列名。

    read_csv("1,2,3\n4,5,6", col_names = FALSE)
    
    # A tibble: 2 x 3
         X1    X2    X3
      <dbl> <dbl> <dbl>
    1     1     2     3
    2     4     5     6
    
    read_csv("1,2,3\n4,5,6", col_names = c("x", "y", "z"))
    # A tibble: 2 x 3
          x     y     z
      <dbl> <dbl> <dbl>
    1     1     2     3
    2     4     5     6
    
    与R基础包进行比较
    • 速度更快
    • 可以生成tibble,不会将字符串向量转化为因子,不使用行名称,也不会随意改变列名称。
    • 可重复性更好:基础R函数会继承操作系统和环境变量的某些行为,因此在您的电脑上能正常运行的导入代码,在别人的电脑上未必可用。
    解析向量
    str(parse_logical(c("TRUE", "FALSE", "NA")))
    #>  logi [1:3] TRUE FALSE NA
    str(parse_integer(c("1", "2", "3")))
    #>  int [1:3] 1 2 3
    str(parse_date(c("2010-01-01", "1979-10-14")))
    #>  Date[1:2], format: "2010-01-01" "1979-10-14"
    
    
    str(parse_integer(c("1", "2", "a")))
    Warning: 1 parsing failure.
    row col   expected actual
      3  -- an integer      a
    
     int [1:3] 1 2 NA
     - attr(*, "problems")=Classes ‘tbl_df’, ‘tbl’ and 'data.frame':    1 obs. of  4 variables:
      ..$ row     : int 3
      ..$ col     : int NA
      ..$ expected: chr "an integer"
      ..$ actual  : chr "a"
    

    数值

    parse_double("1.23")
    #> [1] 1.23
    parse_double("1,23", locale = locale(decimal_mark = ","))
    #> [1] 1.23
    
    parse_number("$100")
    #> [1] 100
    parse_number("20%")
    #> [1] 20
    parse_number("It cost $123.45")
    #> [1] 123
    
    
    # Used in America
    parse_number("$123,456,789")
    #> [1] 1.23e+08
    
    # Used in many parts of Europe
    parse_number("123.456.789", locale = locale(grouping_mark = "."))
    #> [1] 1.23e+08
    
    # Used in Switzerland
    parse_number("123'456'789", locale = locale(grouping_mark = "'"))
    #> [1] 1.23e+08
    
    

    字符串

    #In R, we can get at the underlying representation of a string using charToRaw():
    
    charToRaw("Hadley")
    #> [1] 48 61 64 6c 65 79
    
    x1 <- "El Ni\xf1o was particularly bad this year"
    x2 <- "\x82\xb1\x82\xf1\x82\xc9\x82\xbf\x82\xcd"
    
    x1
    #> [1] "El Ni\xf1o was particularly bad this year"
    x2
    #> [1] "\x82\xb1\x82\xf1\x82ɂ\xbf\x82\xcd"
    
    parse_character(x1, locale = locale(encoding = "Latin1"))
    #> [1] "El Niño was particularly bad this year"
    parse_character(x2, locale = locale(encoding = "Shift-JIS"))
    #> [1] "こんにちは"
    
    查看编码格式
    guess_encoding(charToRaw(x1))
    #> # A tibble: 2 x 2
    #>   encoding   confidence
    #>   <chr>           <dbl>
    #> 1 ISO-8859-1       0.46
    #> 2 ISO-8859-9       0.23
    guess_encoding(charToRaw(x2))
    #> # A tibble: 1 x 2
    #>   encoding confidence
    #>   <chr>         <dbl>
    #> 1 KOI8-R         0.42
    
    因子
    fruit <- c("apple", "banana")
    parse_factor(c("apple", "banana", "bananana"), levels = fruit)
    #> Warning: 1 parsing failure.
    #> row col           expected   actual
    #>   3  -- value in level set bananana
    #> [1] apple  banana <NA>  
    #> attr(,"problems")
    #> # A tibble: 1 x 4
    #>     row   col expected           actual  
    #>   <int> <int> <chr>              <chr>   
    #> 1     3    NA value in level set bananana
    #> Levels: apple banana
    
    时间
    parse_datetime("2010-10-01T2010")
    #> [1] "2010-10-01 20:10:00 UTC"
    # If time is omitted, it will be set to midnight
    parse_datetime("20101010")
    #> [1] "2010-10-10 UTC"
    
    parse_date("2010-10-01")
    #> [1] "2010-10-01"
    
    library(hms)
    parse_time("01:10 am")
    #> 01:10:00
    parse_time("20:10:01")
    #> 20:10:01
    
    parse_date("01/02/15", "%m/%d/%y")
    #> [1] "2015-01-02"
    parse_date("01/02/15", "%d/%m/%y")
    #> [1] "2015-02-01"
    parse_date("01/02/15", "%y/%m/%d")
    #> [1] "2001-02-15"
    
    parse_date("1 janvier 2015", "%d %B %Y", locale = locale("fr"))
    #> [1] "2015-01-01"
    
    解析文件

    既然您已经了解了如何解析单个向量,现在就回到开始部分,研究readr如何解析文件。在本节中,您将了解两个新内容:

    • readr如何自动猜测每个列的类型。
    • 如何覆盖默认的列规范。

    启发式

    guess_parser("2010-10-01")
    #> [1] "date"
    guess_parser("15:01")
    #> [1] "time"
    guess_parser(c("TRUE", "FALSE"))
    #> [1] "logical"
    guess_parser(c("1", "5", "9"))
    #> [1] "double"
    guess_parser(c("12,352,561"))
    #> [1] "number"
    
    str(parse_guess("2010-10-10"))
    #>  Date[1:1], format: "2010-10-10"
    
    challenge <- read_csv(readr_example("challenge.csv"))
    Parsed with column specification:
    cols(
      x = col_double(),
      y = col_logical()
    )
    Warning: 1000 parsing failures.
     row col           expected     actual                                             file
    1001   y 1/0/T/F/TRUE/FALSE 2015-01-16 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
    1002   y 1/0/T/F/TRUE/FALSE 2018-05-18 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
    1003   y 1/0/T/F/TRUE/FALSE 2015-09-05 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
    1004   y 1/0/T/F/TRUE/FALSE 2012-11-28 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
    1005   y 1/0/T/F/TRUE/FALSE 2020-01-13 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
    .... ... .................. .......... ................................................
    See problems(...) for more details.
    

    这里有两部分打印输出:根据前1000行推测出的列规范,以及前5条解析失败的记录。使用problems()显式地提取这些问题总是一个好主意,这样您就可以更深入地研究它们:

     problems(challenge)
    # A tibble: 1,000 x 5
         row col   expected           actual     file                                            
       <int> <chr> <chr>              <chr>      <chr>                                           
     1  1001 y     1/0/T/F/TRUE/FALSE 2015-01-16 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
     2  1002 y     1/0/T/F/TRUE/FALSE 2018-05-18 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
     3  1003 y     1/0/T/F/TRUE/FALSE 2015-09-05 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
     4  1004 y     1/0/T/F/TRUE/FALSE 2012-11-28 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
     5  1005 y     1/0/T/F/TRUE/FALSE 2020-01-13 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
     6  1006 y     1/0/T/F/TRUE/FALSE 2016-04-17 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
     7  1007 y     1/0/T/F/TRUE/FALSE 2011-05-14 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
     8  1008 y     1/0/T/F/TRUE/FALSE 2020-07-18 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
     9  1009 y     1/0/T/F/TRUE/FALSE 2011-04-30 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
    10  1010 y     1/0/T/F/TRUE/FALSE 2010-05-11 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
    # ... with 990 more rows
    

    一个好的策略是逐列处理,直到没有问题为止。先把x指定为整数列、把y指定为字符列,从下面的输出可以看到x列有很多解析问题——整数值后面有尾随字符。这意味着我们需要使用double(双精度浮点数)解析器。

    challenge <- read_csv(
      readr_example("challenge.csv"), 
      col_types = cols(
        x = col_integer(),
        y = col_character()
      )
    )
    
    Warning: 1000 parsing failures.
     row col               expected             actual                                             file
    1001   x no trailing characters .23837975086644292 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
    1002   x no trailing characters .41167997173033655 'D:/R-3.5.1/library/readr/extdata/challenge.csv'
    1003   x no trailing characters .7460716762579978  'D:/R-3.5.1/library/readr/extdata/challenge.csv'
    1004   x no trailing characters .723450553836301   'D:/R-3.5.1/library/readr/extdata/challenge.csv'
    1005   x no trailing characters .614524137461558   'D:/R-3.5.1/library/readr/extdata/challenge.csv'
    .... ... ...................... .................. ................................................
    See problems(...) for more details.
    
    challenge <- read_csv(
      readr_example("challenge.csv"), 
      col_types = cols(
        x = col_double(),
        y = col_character()
      )
    )
    
    tail(challenge)
    # A tibble: 6 x 2
          x y         
      <dbl> <chr>     
    1 0.805 2019-11-21
    2 0.164 2018-03-29
    3 0.472 2014-08-04
    4 0.718 2015-08-16
    5 0.270 2020-02-04
    6 0.608 2019-01-06
    
    
    challenge <- read_csv(
      readr_example("challenge.csv"), 
      col_types = cols(
        x = col_double(),
        y = col_date()
      )
    )
    tail(challenge)
    #> # A tibble: 6 x 2
    #>       x y         
    #>   <dbl> <date>    
    #> 1 0.805 2019-11-21
    #> 2 0.164 2018-03-29
    #> 3 0.472 2014-08-04
    #> 4 0.718 2015-08-16
    #> 5 0.270 2020-02-04
    #> 6 0.608 2019-01-06
    
    challenge2 <- read_csv(readr_example("challenge.csv"), guess_max = 1001)
    #> Parsed with column specification:
    #> cols(
    #>   x = col_double(),
    #>   y = col_date(format = "")
    #> )
    challenge2
    #> # A tibble: 2,000 x 2
    #>       x y         
    #>   <dbl> <date>    
    #> 1   404 NA        
    #> 2  4172 NA        
    #> 3  3004 NA        
    #> 4   787 NA        
    #> 5    37 NA        
    #> 6  2332 NA        
    #> # … with 1,994 more rows
    
    challenge2 <- read_csv(readr_example("challenge.csv"), 
                           col_types = cols(.default = col_character())
    )
    
    challenge2
    # A tibble: 2,000 x 2
       x     y    
       <chr> <chr>
     1 404   NA   
     2 4172  NA   
     3 3004  NA   
     4 787   NA   
     5 37    NA   
     6 2332  NA   
     7 2489  NA   
     8 1449  NA   
     9 3665  NA   
    10 3863  NA   
    # ... with 1,990 more rows
    
    
    df <- tribble(
      ~x,  ~y,
      "1", "1.21",
      "2", "2.32",
      "3", "4.56"
    )
    df
    #> # A tibble: 3 x 2
    #>   x     y    
    #>   <chr> <chr>
    #> 1 1     1.21 
    #> 2 2     2.32 
    #> 3 3     4.56
    
    # Note the column types
    type_convert(df)
    #> Parsed with column specification:
    #> cols(
    #>   x = col_double(),
    #>   y = col_double()
    #> )
    #> # A tibble: 3 x 2
    #>       x     y
    #>   <dbl> <dbl>
    #> 1     1  1.21
    #> 2     2  2.32
    #> 3     3  4.56
    
    文件写出

    readr还提供了两个将数据写入磁盘的有用函数:write_csv()和write_tsv()。这两个函数都通过以下方式提高了输出文件能够被正确读回的几率:

    • 总是用UTF-8编码字符串。
    • 以ISO8601格式保存日期和日期时间,以便在其他地方轻松解析。
    write_csv(challenge, "challenge.csv")
    
    challenge
    #> # A tibble: 2,000 x 2
    #>       x y         
    #>   <dbl> <date>    
    #> 1   404 NA        
    #> 2  4172 NA        
    #> 3  3004 NA        
    #> 4   787 NA        
    #> 5    37 NA        
    #> 6  2332 NA        
    #> # … with 1,994 more rows
    write_csv(challenge, "challenge-2.csv")
    read_csv("challenge-2.csv")
    #> Parsed with column specification:
    #> cols(
    #>   x = col_double(),
    #>   y = col_logical()
    #> )
    #> # A tibble: 2,000 x 2
    #>       x y    
    #>   <dbl> <lgl>
    #> 1   404 NA   
    #> 2  4172 NA   
    #> 3  3004 NA   
    #> 4   787 NA   
    #> 5    37 NA   
    #> 6  2332 NA   
    #> # … with 1,994 more rows
    
    
    write_rds(challenge, "challenge.rds")
    read_rds("challenge.rds")
    #> # A tibble: 2,000 x 2
    #>       x y         
    #>   <dbl> <date>    
    #> 1   404 NA        
    #> 2  4172 NA        
    #> 3  3004 NA        
    #> 4   787 NA        
    #> 5    37 NA        
    #> 6  2332 NA        
    #> # … with 1,994 more rows
    

    feather包实现了一种快速的二进制文件格式,可以跨编程语言共享:

    library(feather)
    write_feather(challenge, "challenge.feather")
    read_feather("challenge.feather")
    #> # A tibble: 2,000 x 2
    #>       x      y
    #>   <dbl> <date>
    #> 1   404   <NA>
    #> 2  4172   <NA>
    #> 3  3004   <NA>
    #> 4   787   <NA>
    #> 5    37   <NA>
    #> 6  2332   <NA>
    #> # ... with 1,994 more rows
    

    r4ds

    相关文章

      网友评论

        本文标题:R for data science ||使用readr进行数据

        本文链接:https://www.haomeiwen.com/subject/ffxklctx.html