解析一些read_csv的参数
skip=n 跳过前n行
comment=“#”,丢弃所有以#开头的行
col_names=FALSE 表示数据没有列名（不把第一行当作列名）
“\n”（反斜杠加n）用于换行
na：设定哪些值按缺失值处理，例如 na = "." 表示把点（.）当作缺失值
为什么不用R基础包中的read.csv呢?
速度快
可以生成tibble，不会将字符向量转换为因子
解析向量, parse_*函数族
str(parse_logical(c("TRUE", "FALSE", "NA")))
## logi [1:3] TRUE FALSE NA
#> logi [1:3] TRUE FALSE NA
str(parse_integer(c("1", "2", "3")))
## int [1:3] 1 2 3
#> int [1:3] 1 2 3
str(parse_date(c("2010-01-01", "1979-10-14")))
## Date[1:2], format: "2010-01-01" "1979-10-14"
#> Date[1:2], format: "2010-01-01" "1979-10-14"
#解析失败则输出以缺失值形式存在
x <- parse_integer(c("123", "345", "abc", "123.45"))
## Warning: 2 parsing failures.
## row col expected actual
## 3 -- an integer abc
## 4 -- no trailing characters .45
x
## [1] 123 345 NA NA
## attr(,"problems")
## # A tibble: 2 x 4
## row col expected actual
## <int> <int> <chr> <chr>
## 1 3 NA an integer abc
## 2 4 NA no trailing characters .45
problems(x)#获取完整的失败信息合集
## # A tibble: 2 x 4
## row col expected actual
## <int> <int> <chr> <chr>
## 1 3 NA an integer abc
## 2 4 NA no trailing characters .45
重要的解析函数
parse_logical() parse_integer() 分别解析逻辑值和整数
parse_double严格数值型解析函数 parse_number灵活数值型解析函数
parse_character 字符编码很重要
parse_factor 可创建因子,R使用这种数据结构表示分类变量
parse_datetime parse_date, parse_time 解析日期 时间
数值
parse_double("1.23")
## [1] 1.23
# 设置新的地区对象, decimal_mark参数,覆盖.的默认值
parse_double("1,23", locale = locale(decimal_mark = ","))
## [1] 1.23
# parse_number忽略数值前后的非数值型字符
parse_number("$100")
## [1] 100
parse_number("20%")
## [1] 20
parse_number("It cost $123.45")
## [1] 123.45
# parse_number忽略分组符号
parse_number("$123,456,789")
## [1] 123456789
parse_number("123.456.789", locale = locale(grouping_mark = "."))
## [1] 123456789
parse_number("123'456'789", locale = locale(grouping_mark = "'"))
## [1] 123456789
字符串
# charToRaw获取字符串的底层表示, ASCII码 16进制表示英文字符
charToRaw("Hadley")
## [1] 48 61 64 6c 65 79
# guess_encoding函数 找到编码方式
# parse_character设定编码方式
因子
fruit <- c("apple", "banana")
parse_factor(c("apple", "banana", "bananana"), levels = fruit)
## Warning: 1 parsing failure.
## row col expected actual
## 3 -- value in level set bananana
## [1] apple banana <NA>
## attr(,"problems")
## # A tibble: 1 x 4
## row col expected actual
## <int> <int> <chr> <chr>
## 1 3 NA value in level set bananana
## Levels: apple banana
日期,日期与时间, 时间
#parse_datetime 期待的日期时间是符合 ISO 8601标准的日期时间(一种国际标准)
parse_datetime("2010-10-01T2010")
## [1] "2010-10-01 20:10:00 UTC"
parse_datetime("20101010")
## [1] "2010-10-10 UTC"
# parse_date期待的格式是:四位数的年份、月、日,三者之间以 - 或 / 分隔,如下
parse_date("2010-10-01")
## [1] "2010-10-01"
#parse_time 期待的格式是 小时:分钟,可选的 :秒,以及可选的 am/pm 标识符
library(hms)
parse_time("01:10 am")
## 01:10:00
#> 01:10:00
parse_time("20:10:01")
## 20:10:01
#> 20:10:01
# 如默认设置不符合,可自行设置,格式如下
# 解析成三个不同的时间
parse_date("01/02/15", "%m/%d/%y")
## [1] "2015-01-02"
#> [1] "2015-01-02"
parse_date("01/02/15", "%d/%m/%y")
## [1] "2015-02-01"
#> [1] "2015-02-01"
parse_date("01/02/15", "%y/%m/%d")
## [1] "2001-02-15"
#> [1] "2001-02-15"
Year
%Y (4 digits).
%y (2 digits); 00-69 -> 2000-2069, 70-99 -> 1970-1999.
Month
%m (2 digits).
%b (abbreviated name, like “Jan”).
%B (full name, “January”).
Day
%d (2 digits).
%e (optional leading space).
Time
%H 0-23 hour.
%I 0-12, must be used with %p.
%p AM/PM indicator.
%M minutes.
%S integer seconds.
%OS real seconds.
%Z Time zone (as name, e.g. America/Chicago). Beware of abbreviations: if you’re American, note that “EST” is a Canadian time zone that does not have daylight savings time. It is not Eastern Standard Time! We’ll come back to this in the discussion of time zones.
%z (as offset from UTC, e.g. +0800).
Non-digits
%. skips one non-digit character.
%* skips any number of non-digits.
# readr解析文件
## readr以一种启发式过程来确定每列的类型
## 先用guess_parser()函数返回readr最可信的猜测
# 示例
guess_parser("2010-10-01")
## [1] "date"
#> [1] "date"
guess_parser("15:01")
## [1] "time"
#> [1] "time"
guess_parser(c("TRUE", "FALSE"))
## [1] "logical"
#> [1] "logical"
guess_parser(c("1", "5", "9"))
## [1] "double"
#> [1] "double"
guess_parser(c("12,352,561"))
## [1] "number"
#> [1] "number"
str(parse_guess("2010-10-10"))
## Date[1:1], format: "2010-10-10"
默认设置可能出现的问题及其解决: 1. 前1000行不能代表所有行; 2. 列中有大量缺失值
challenge <- read_csv(readr_example("challenge.csv")) #
## Parsed with column specification:
## cols(
## x = col_double(),
## y = col_logical()
## )
## Warning: 1000 parsing failures.
## row col expected actual file
## 1001 y 1/0/T/F/TRUE/FALSE 2015-01-16 'D:/R/R-3.5.2/library/readr/extdata/challenge.csv'
## 1002 y 1/0/T/F/TRUE/FALSE 2018-05-18 'D:/R/R-3.5.2/library/readr/extdata/challenge.csv'
## 1003 y 1/0/T/F/TRUE/FALSE 2015-09-05 'D:/R/R-3.5.2/library/readr/extdata/challenge.csv'
## 1004 y 1/0/T/F/TRUE/FALSE 2012-11-28 'D:/R/R-3.5.2/library/readr/extdata/challenge.csv'
## 1005 y 1/0/T/F/TRUE/FALSE 2020-01-13 'D:/R/R-3.5.2/library/readr/extdata/challenge.csv'
## .... ... .................. .......... ..................................................
## See problems(...) for more details.
# readr_example读取R包中的文件路径
problems(challenge)
## # A tibble: 1,000 x 5
## row col expected actual file
## <int> <chr> <chr> <chr> <chr>
## 1 1001 y 1/0/T/F/TRUE/F~ 2015-01-~ 'D:/R/R-3.5.2/library/readr/extda~
## 2 1002 y 1/0/T/F/TRUE/F~ 2018-05-~ 'D:/R/R-3.5.2/library/readr/extda~
## 3 1003 y 1/0/T/F/TRUE/F~ 2015-09-~ 'D:/R/R-3.5.2/library/readr/extda~
## 4 1004 y 1/0/T/F/TRUE/F~ 2012-11-~ 'D:/R/R-3.5.2/library/readr/extda~
## 5 1005 y 1/0/T/F/TRUE/F~ 2020-01-~ 'D:/R/R-3.5.2/library/readr/extda~
## 6 1006 y 1/0/T/F/TRUE/F~ 2016-04-~ 'D:/R/R-3.5.2/library/readr/extda~
## 7 1007 y 1/0/T/F/TRUE/F~ 2011-05-~ 'D:/R/R-3.5.2/library/readr/extda~
## 8 1008 y 1/0/T/F/TRUE/F~ 2020-07-~ 'D:/R/R-3.5.2/library/readr/extda~
## 9 1009 y 1/0/T/F/TRUE/F~ 2011-04-~ 'D:/R/R-3.5.2/library/readr/extda~
## 10 1010 y 1/0/T/F/TRUE/F~ 2010-05-~ 'D:/R/R-3.5.2/library/readr/extda~
## # ... with 990 more rows
## 修改列类型,根据problems()返回的结果
## 告诉readr如何加载数据
challenge<-read_csv(
readr_example("challenge.csv"),
col_types = cols(
x=col_double(),
y=col_date()
)
)
tail(challenge)#查看后6个
## # A tibble: 6 x 2
## x y
## <dbl> <date>
## 1 0.805 2019-11-21
## 2 0.164 2018-03-29
## 3 0.472 2014-08-04
## 4 0.718 2015-08-16
## 5 0.270 2020-02-04
## 6 0.608 2019-01-06
# 先将所有列都作为字符向量输入,发现读取问题
challenge2 <- read_csv(readr_example("challenge.csv"),
col_types = cols(.default = col_character())
)
tail(challenge2)##
## # A tibble: 6 x 2
## x y
## <chr> <chr>
## 1 0.805274312151596 2019-11-21
## 2 0.1635163405444473 2018-03-29
## 3 0.47193897631950676 2014-08-04
## 4 0.7183186465408653 2015-08-16
## 5 0.26987858884967864 2020-02-04
## 6 0.608237189007923 2019-01-06
# type_convert函数启动启发式解析
df <- tribble(
~x, ~y,
"1", "1.21",
"2", "2.32",
"3", "4.56"
)
df
## # A tibble: 3 x 2
## x y
## <chr> <chr>
## 1 1 1.21
## 2 2 2.32
## 3 3 4.56
type.convert(df)##启发式解析数据
## # A tibble: 3 x 2
## x y
## <int> <dbl>
## 1 1 1.21
## 2 2 2.32
## 3 3 4.56
写入文件
write_csv(challenge,"challenge.csv")
read_csv("challenge.csv")
## Parsed with column specification:
## cols(
## x = col_double(),
## y = col_logical()
## )
## Warning: 1000 parsing failures.
## row col expected actual file
## 1001 y 1/0/T/F/TRUE/FALSE 2015-01-16 'challenge.csv'
## 1002 y 1/0/T/F/TRUE/FALSE 2018-05-18 'challenge.csv'
## 1003 y 1/0/T/F/TRUE/FALSE 2015-09-05 'challenge.csv'
## 1004 y 1/0/T/F/TRUE/FALSE 2012-11-28 'challenge.csv'
## 1005 y 1/0/T/F/TRUE/FALSE 2020-01-13 'challenge.csv'
## .... ... .................. .......... ...............
## See problems(...) for more details.
## # A tibble: 2,000 x 2
## x y
## <dbl> <lgl>
## 1 404 NA
## 2 4172 NA
## 3 3004 NA
## 4 787 NA
## 5 37 NA
## 6 2332 NA
## 7 2489 NA
## 8 1449 NA
## 9 3665 NA
## 10 3863 NA
## # ... with 1,990 more rows
# 但存在的问题是,保存为csv文件后类型信息就丢失了,读取需要重新解析
# 暂存结果 write_rds() read_rds函数,实际是对基础函数saveRDS() readRDS的包装
write_rds(challenge,"challenge.rds")
read_rds("challenge.rds")#读取后仍保留了格式
## # A tibble: 2,000 x 2
## x y
## <dbl> <date>
## 1 404 NA
## 2 4172 NA
## 3 3004 NA
## 4 787 NA
## 5 37 NA
## 6 2332 NA
## 7 2489 NA
## 8 1449 NA
## 9 3665 NA
## 10 3863 NA
## # ... with 1,990 more rows
# feather包实现了一种快速二进制格式,可在多个编程语言间共享
library(feather)
write_feather(challenge, "challenge.feather")
read_feather("challenge.feather")
## # A tibble: 2,000 x 2
## x y
## <dbl> <date>
## 1 404 NA
## 2 4172 NA
## 3 3004 NA
## 4 787 NA
## 5 37 NA
## 6 2332 NA
## 7 2489 NA
## 8 1449 NA
## 9 3665 NA
## 10 3863 NA
## # ... with 1,990 more rows
# feather比RDS格式更快,且可在R之外使用
网友评论