美文网首页
readr进行数据导入

readr进行数据导入

作者: 医科研 | 来源:发表于2019-01-25 18:44 被阅读25次
    解析一些read_csv的参数
    skip=n 跳过前n行
    comment=“#”,丢弃所有以#开头的行
    col_names=FALSE 表示数据没有列名,不把第一行当作列名
    “\n”(反斜杠加n)用于换行
    na:设定哪些值按缺失值处理,例如 na = "." 表示将点(.)视为缺失值
    为什么不用R基础包中的read.csv呢?
    速度快
    可以生成tibble,不会将字符向量转换为因子
    解析向量, parse_*函数族
    str(parse_logical(c("TRUE", "FALSE", "NA")))
    ##  logi [1:3] TRUE FALSE NA
    #>  logi [1:3] TRUE FALSE NA
    str(parse_integer(c("1", "2", "3")))
    ##  int [1:3] 1 2 3
    #>  int [1:3] 1 2 3
    str(parse_date(c("2010-01-01", "1979-10-14")))
    ##  Date[1:2], format: "2010-01-01" "1979-10-14"
    #>  Date[1:2], format: "2010-01-01" "1979-10-14"
    
    #解析失败则输出以缺失值形式存在
    x <- parse_integer(c("123", "345", "abc", "123.45"))
    ## Warning: 2 parsing failures.
    ## row col               expected actual
    ##   3  -- an integer                abc
    ##   4  -- no trailing characters    .45
    x
    ## [1] 123 345  NA  NA
    ## attr(,"problems")
    ## # A tibble: 2 x 4
    ##     row   col expected               actual
    ##   <int> <int> <chr>                  <chr> 
    ## 1     3    NA an integer             abc   
    ## 2     4    NA no trailing characters .45
    problems(x)#获取完整的失败信息合集
    ## # A tibble: 2 x 4
    ##     row   col expected               actual
    ##   <int> <int> <chr>                  <chr> 
    ## 1     3    NA an integer             abc   
    ## 2     4    NA no trailing characters .45
    重要的解析函数
    parse_logical() parse_integer() 分别解析逻辑值和整数
    parse_double严格数值型解析函数 parse_number灵活数值型解析函数
    parse_character 字符编码很重要
    parse_factor 可创建因子,R使用这种数据结构表示分类变量
    parse_datetime parse_date, parse_time 解析日期 时间
    数值
    parse_double("1.23")
    ## [1] 1.23
    # 设置新的地区对象, decimal_mark参数,覆盖.的默认值
    parse_double("1,23", locale = locale(decimal_mark = ","))
    ## [1] 1.23
    # parse_number忽略数值前后的非数值型字符
    parse_number("$100")
    ## [1] 100
    parse_number("20%")
    ## [1] 20
    parse_number("It cost $123.45")
    ## [1] 123.45
    # parse_number忽略分组符号
    parse_number("$123,456,789")
    ## [1] 123456789
    parse_number("123.456.789", locale = locale(grouping_mark = "."))
    ## [1] 123456789
    parse_number("123'456'789", locale = locale(grouping_mark = "'"))
    ## [1] 123456789
    字符串
    # charToRaw获取字符串的底层表示, ASCII码 16进制表示英文字符
    charToRaw("Hadley")
    ## [1] 48 61 64 6c 65 79
    # guess_encoding函数 找到编码方式
    # parse_character设定编码方式
    因子
    fruit <- c("apple", "banana")
    parse_factor(c("apple", "banana", "bananana"), levels = fruit)
    ## Warning: 1 parsing failure.
    ## row col           expected   actual
    ##   3  -- value in level set bananana
    ## [1] apple  banana <NA>  
    ## attr(,"problems")
    ## # A tibble: 1 x 4
    ##     row   col expected           actual  
    ##   <int> <int> <chr>              <chr>   
    ## 1     3    NA value in level set bananana
    ## Levels: apple banana
    日期,日期与时间, 时间
    #parse_datetime 期待的日期时间是符合 ISO 8601标准的日期时间(一种国际标准)
    parse_datetime("2010-10-01T2010")
    ## [1] "2010-10-01 20:10:00 UTC"
    parse_datetime("20101010")
    ## [1] "2010-10-10 UTC"
    # parse_date期待的是四位数的年份,格式如下以- 或/分割
    parse_date("2010-10-01")
    ## [1] "2010-10-01"
    #parse_time 期待的是小时:分钟和秒,:am或pm 标识符
    library(hms)
    parse_time("01:10 am")
    ## 01:10:00
    #> 01:10:00
    parse_time("20:10:01")
    ## 20:10:01
    #> 20:10:01
    
    # 如默认设置不符合,可自行设置,格式如下
    # 解析成三个不同的时间
    parse_date("01/02/15", "%m/%d/%y")
    ## [1] "2015-01-02"
    #> [1] "2015-01-02"
    parse_date("01/02/15", "%d/%m/%y")
    ## [1] "2015-02-01"
    #> [1] "2015-02-01"
    parse_date("01/02/15", "%y/%m/%d")
    ## [1] "2001-02-15"
    #> [1] "2001-02-15"
    Year
    %Y (4 digits).
    %y (2 digits); 00-69 -> 2000-2069, 70-99 -> 1970-1999.
    Month
    %m (2 digits).
    %b (abbreviated name, like “Jan”).
    %B (full name, “January”).
    Day
    %d (2 digits).
    %e (optional leading space).
    Time
    %H 0-23 hour.
    %I 0-12, must be used with %p.
    %p AM/PM indicator.
    %M minutes.
    %S integer seconds.
    %OS real seconds.
    %Z Time zone (as name, e.g. America/Chicago). Beware of abbreviations: if you’re American, note that “EST” is a Canadian time zone that does not have daylight savings time. It is not Eastern Standard Time! We’ll come back to this in the discussion of time zones.
    %z (as offset from UTC, e.g. +0800).
    Non-digits
    %. skips one non-digit character.
    %* skips any number of non-digits.
    
    # readr解析文件
    ## readr以一种启发式过程来确定每列的类型
    ## 先用guess_parser()函数返回readr最可信的猜测
    # 示例
    guess_parser("2010-10-01")
    ## [1] "date"
    #> [1] "date"
    guess_parser("15:01")
    ## [1] "time"
    #> [1] "time"
    guess_parser(c("TRUE", "FALSE"))
    ## [1] "logical"
    #> [1] "logical"
    guess_parser(c("1", "5", "9"))
    ## [1] "double"
    #> [1] "double"
    guess_parser(c("12,352,561"))
    ## [1] "number"
    #> [1] "number"
    
    str(parse_guess("2010-10-10"))
    ##  Date[1:1], format: "2010-10-10"
    默认设置的问题解决 1. 前1000行不能代表所有行,2列中有大量缺失值
    challenge <- read_csv(readr_example("challenge.csv")) #
    ## Parsed with column specification:
    ## cols(
    ##   x = col_double(),
    ##   y = col_logical()
    ## )
    ## Warning: 1000 parsing failures.
    ##  row col           expected     actual                                               file
    ## 1001   y 1/0/T/F/TRUE/FALSE 2015-01-16 'D:/R/R-3.5.2/library/readr/extdata/challenge.csv'
    ## 1002   y 1/0/T/F/TRUE/FALSE 2018-05-18 'D:/R/R-3.5.2/library/readr/extdata/challenge.csv'
    ## 1003   y 1/0/T/F/TRUE/FALSE 2015-09-05 'D:/R/R-3.5.2/library/readr/extdata/challenge.csv'
    ## 1004   y 1/0/T/F/TRUE/FALSE 2012-11-28 'D:/R/R-3.5.2/library/readr/extdata/challenge.csv'
    ## 1005   y 1/0/T/F/TRUE/FALSE 2020-01-13 'D:/R/R-3.5.2/library/readr/extdata/challenge.csv'
    ## .... ... .................. .......... ..................................................
    ## See problems(...) for more details.
    # readr_example返回readr包自带示例文件的路径
    problems(challenge)
    ## # A tibble: 1,000 x 5
    ##      row col   expected        actual    file                              
    ##    <int> <chr> <chr>           <chr>     <chr>                             
    ##  1  1001 y     1/0/T/F/TRUE/F~ 2015-01-~ 'D:/R/R-3.5.2/library/readr/extda~
    ##  2  1002 y     1/0/T/F/TRUE/F~ 2018-05-~ 'D:/R/R-3.5.2/library/readr/extda~
    ##  3  1003 y     1/0/T/F/TRUE/F~ 2015-09-~ 'D:/R/R-3.5.2/library/readr/extda~
    ##  4  1004 y     1/0/T/F/TRUE/F~ 2012-11-~ 'D:/R/R-3.5.2/library/readr/extda~
    ##  5  1005 y     1/0/T/F/TRUE/F~ 2020-01-~ 'D:/R/R-3.5.2/library/readr/extda~
    ##  6  1006 y     1/0/T/F/TRUE/F~ 2016-04-~ 'D:/R/R-3.5.2/library/readr/extda~
    ##  7  1007 y     1/0/T/F/TRUE/F~ 2011-05-~ 'D:/R/R-3.5.2/library/readr/extda~
    ##  8  1008 y     1/0/T/F/TRUE/F~ 2020-07-~ 'D:/R/R-3.5.2/library/readr/extda~
    ##  9  1009 y     1/0/T/F/TRUE/F~ 2011-04-~ 'D:/R/R-3.5.2/library/readr/extda~
    ## 10  1010 y     1/0/T/F/TRUE/F~ 2010-05-~ 'D:/R/R-3.5.2/library/readr/extda~
    ## # ... with 990 more rows
    ## 修改列类型,根据problems()返回的结果
    ## 告诉readr如何加载数据
    challenge<-read_csv(
      readr_example("challenge.csv"),
      col_types = cols(
        x=col_double(),
        y=col_date()
        )
    )
    tail(challenge)#查看后6个
    ## # A tibble: 6 x 2
    ##       x y         
    ##   <dbl> <date>    
    ## 1 0.805 2019-11-21
    ## 2 0.164 2018-03-29
    ## 3 0.472 2014-08-04
    ## 4 0.718 2015-08-16
    ## 5 0.270 2020-02-04
    ## 6 0.608 2019-01-06
    # 先将所有列都作为字符向量输入,发现读取问题
    challenge2 <- read_csv(readr_example("challenge.csv"), 
      col_types = cols(.default = col_character())
    )
    tail(challenge2)##
    ## # A tibble: 6 x 2
    ##   x                   y         
    ##   <chr>               <chr>     
    ## 1 0.805274312151596   2019-11-21
    ## 2 0.1635163405444473  2018-03-29
    ## 3 0.47193897631950676 2014-08-04
    ## 4 0.7183186465408653  2015-08-16
    ## 5 0.26987858884967864 2020-02-04
    ## 6 0.608237189007923   2019-01-06
    # readr的type_convert函数可对字符列重新启动启发式解析;注意下面示例实际调用的是基础R的type.convert()
    df <- tribble(
      ~x,  ~y,
      "1", "1.21",
      "2", "2.32",
      "3", "4.56"
    )
    df
    ## # A tibble: 3 x 2
    ##   x     y    
    ##   <chr> <chr>
    ## 1 1     1.21 
    ## 2 2     2.32 
    ## 3 3     4.56
    type.convert(df)##启发式解析数据
    ## # A tibble: 3 x 2
    ##       x     y
    ##   <int> <dbl>
    ## 1     1  1.21
    ## 2     2  2.32
    ## 3     3  4.56
    写入文件
    write_csv(challenge,"challenge.csv")
    read_csv("challenge.csv")
    ## Parsed with column specification:
    ## cols(
    ##   x = col_double(),
    ##   y = col_logical()
    ## )
    ## Warning: 1000 parsing failures.
    ##  row col           expected     actual            file
    ## 1001   y 1/0/T/F/TRUE/FALSE 2015-01-16 'challenge.csv'
    ## 1002   y 1/0/T/F/TRUE/FALSE 2018-05-18 'challenge.csv'
    ## 1003   y 1/0/T/F/TRUE/FALSE 2015-09-05 'challenge.csv'
    ## 1004   y 1/0/T/F/TRUE/FALSE 2012-11-28 'challenge.csv'
    ## 1005   y 1/0/T/F/TRUE/FALSE 2020-01-13 'challenge.csv'
    ## .... ... .................. .......... ...............
    ## See problems(...) for more details.
    ## # A tibble: 2,000 x 2
    ##        x y    
    ##    <dbl> <lgl>
    ##  1   404 NA   
    ##  2  4172 NA   
    ##  3  3004 NA   
    ##  4   787 NA   
    ##  5    37 NA   
    ##  6  2332 NA   
    ##  7  2489 NA   
    ##  8  1449 NA   
    ##  9  3665 NA   
    ## 10  3863 NA   
    ## # ... with 1,990 more rows
    # 但存在的问题是,保存为csv文件后类型信息就丢失了,读取需要重新解析
    # 暂存结果 write_rds() read_rds函数,实际是对基础函数saveRDS() readRDS的包装
    write_rds(challenge,"challenge.rds")
    read_rds("challenge.rds")#读取后仍保留了格式
    ## # A tibble: 2,000 x 2
    ##        x y         
    ##    <dbl> <date>    
    ##  1   404 NA        
    ##  2  4172 NA        
    ##  3  3004 NA        
    ##  4   787 NA        
    ##  5    37 NA        
    ##  6  2332 NA        
    ##  7  2489 NA        
    ##  8  1449 NA        
    ##  9  3665 NA        
    ## 10  3863 NA        
    ## # ... with 1,990 more rows
    # feather包实现了一种快速二进制格式,可在多个编程语言间共享
    library(feather)
    write_feather(challenge, "challenge.feather")
    read_feather("challenge.feather")
    ## # A tibble: 2,000 x 2
    ##        x y         
    ##    <dbl> <date>    
    ##  1   404 NA        
    ##  2  4172 NA        
    ##  3  3004 NA        
    ##  4   787 NA        
    ##  5    37 NA        
    ##  6  2332 NA        
    ##  7  2489 NA        
    ##  8  1449 NA        
    ##  9  3665 NA        
    ## 10  3863 NA        
    ## # ... with 1,990 more rows
    # feather比RDS格式更快,且可在R之外使用
    

    相关文章

      网友评论

          本文标题:readr进行数据导入

          本文链接:https://www.haomeiwen.com/subject/xrsujqtx.html