《R for Data Science》第十六章 Dates and times 啃书知识点积累
参考链接:R for Data Science
Dates and times are hard because they have to reconcile two physical phenomena
(the rotation of the Earth and its orbit around the sun)
目前已经把R更新到4.0,安装
nycflights13
出了些麻烦,最后用以下命令
install.packages("https://cran.r-project.org/src/contrib/nycflights13_1.0.1.tar.gz",
repos=NULL, method="libcurl")
Creating date/times
A date-time is a date plus a time: it uniquely identifies an instant in time.
Tibbles print this as <dttm>.
Elsewhere in R these are called POSIXct
# Current date and current date-time
library(lubridate)
today()
#> [1] "2020-04-27"
now()
#> [1] "2020-04-27 10:11:40 CST"
# The tzone argument of today() controls which time zone is used
?today()
today("GMT")
today("UTC")
- From strings
# The function name spells out the order of the components in the string
ymd("2020-04-27")
#> [1] "2020-04-27"
mdy("April 27st, 2020")
#> [1] "2020-04-27"
dmy("27-Apr-2020")
#> [1] "2020-04-27"
mdy("Apr-27-2020")
#> [1] "2020-04-27"
# Date-time (dttm) strings can be parsed as well
ymd_hms("2020-04-27 20:11:59")
#> [1] "2020-04-27 20:11:59 UTC"
mdy_hm("04/27/2020 08:01")
#> [1] "2020-04-27 08:01:00 UTC"
# When the vector contains a string that cannot be parsed
ymd(c('20200427','XiChen'))
# [1] "2020-04-27" NA
# Warning message:
# 1 failed to parse.
- These functions also take unquoted numbers
ymd(20200427)
#> [1] "2020-04-27"
ymd(20200427, tz = "UTC")
#> [1] "2020-04-27 UTC"
- From individual components
To create a date/time from this sort of input, use
make_date()
for dates, or make_datetime()
for date-times
library(tidyverse)
library(nycflights13)
library(lubridate)
flights %>%
select(year, month, day, hour, minute) %>%
mutate(departure = make_datetime(year, month, day, hour, minute))
#> # A tibble: 336,776 x 6
#> year month day hour minute departure
#> <int> <int> <int> <dbl> <dbl> <dttm>
#> 1 2013 1 1 5 15 2013-01-01 05:15:00
#> 2 2013 1 1 5 29 2013-01-01 05:29:00
#> 3 2013 1 1 5 40 2013-01-01 05:40:00
#> 4 2013 1 1 5 45 2013-01-01 05:45:00
#> 5 2013 1 1 6 0 2013-01-01 06:00:00
#> 6 2013 1 1 5 58 2013-01-01 05:58:00
#> # … with 3.368e+05 more rows
# make_date() builds a plain Date (no time of day) from year, month and day.
# `day` has to be passed explicitly: when it is omitted it defaults to 1,
# which would place every flight on the first day of its month.
flights %>%
  select(year, month, day, hour, minute) %>%
  mutate(date = make_date(year, month, day))
- 自建函数配合
make_
解析日期时间
# Build a date-time from a calendar date plus a clock time encoded as HHMM
# (for example 517 means 05:17).
#
# year, month, day: calendar components, forwarded unchanged to
#   make_datetime().
# time: number whose hundreds part is the hour and whose remainder modulo 100
#   is the minute.
# Returns the result of make_datetime() for those five components.
make_datetime_100 <- function(year, month, day, time) {
  hour_part <- time %/% 100
  minute_part <- time %% 100
  make_datetime(year, month, day, hour_part, minute_part)
}
# Keep only flights that have both a departure and an arrival time recorded,
# convert the four HHMM clock columns into date-times with
# make_datetime_100(), and keep the airports, the delay columns and the
# time columns.
flights_dt <- flights %>%
  filter(!(is.na(dep_time) | is.na(arr_time))) %>%
  mutate(
    sched_dep_time = make_datetime_100(year, month, day, sched_dep_time),
    sched_arr_time = make_datetime_100(year, month, day, sched_arr_time),
    dep_time = make_datetime_100(year, month, day, dep_time),
    arr_time = make_datetime_100(year, month, day, arr_time)
  ) %>%
  select(origin, dest, ends_with("delay"), ends_with("time"))
- From other types
-
date
和dttm
互换
as_datetime(today())
#> [1] "2020-01-15 UTC"
as_date(now())
#> [1] "2020-01-15"
- “Unix Epoch” 基于1970-01-01
# A bare number is read as an offset from 1970-01-01:
# seconds for as_datetime(), days for as_date()
as_datetime(60 * 60 * 10)
#> [1] "1970-01-01 10:00:00 UTC"
# The ten years starting at 1970 contain two leap years, hence the + 2
as_date(365 * 10 + 2)
#> [1] "1980-01-01"
Date-time components
datetime <- ymd_hms("2016-07-08 12:34:56")
year(datetime)
#> [1] 2016
month(datetime)
#> [1] 7
mday(datetime)
#> [1] 8
yday(datetime)
#> [1] 190
wday(datetime)
#> [1] 6
- 可以设置具体参数优化提取
month(datetime, label = TRUE)
#> [1] Jul
#> 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
wday(datetime, label = TRUE, abbr = FALSE)
#> [1] Friday
#> 7 Levels: Sunday < Monday < Tuesday < Wednesday < Thursday < ... < Saturday
# wday()'s label argument determines the text that gets mapped to the x axis:
# p1 uses the default wday(), p2 uses label = TRUE
p1 <- flights_dt %>%
mutate(wday = wday(dep_time)) %>%
ggplot(aes(x = wday)) +
geom_bar()
p2 <- flights_dt %>%
mutate(wday = wday(dep_time, label = TRUE)) %>%
ggplot(aes(x = wday)) +
geom_bar()
# NOTE(review): `+` between two complete ggplot objects is not provided by
# ggplot2 itself; this presumably relies on the patchwork package being
# attached, which the visible code never loads — confirm.
p1 + p2
- 一个类似于“幸存者偏差”的案例
# Average arrival delay by minute-of-hour of the actual departure time
p1 <- flights_dt %>%
mutate(minute = minute(dep_time)) %>%
group_by(minute) %>%
summarise(
avg_delay = mean(arr_delay, na.rm = TRUE),
n = n()) %>%
ggplot() +
geom_line(aes(minute, avg_delay))
# Same summary, grouped by minute-of-hour of the scheduled departure time
p2 <- flights_dt %>%
mutate(minute = minute(sched_dep_time)) %>%
group_by(minute) %>%
summarise(
avg_delay = mean(arr_delay, na.rm = TRUE),
n = n()) %>%
ggplot() +
geom_line(aes(minute, avg_delay))
# NOTE(review): `+` between two complete ggplot objects is not provided by
# ggplot2 itself; this presumably relies on the patchwork package being
# attached, which the visible code never loads — confirm.
p1 + p2
- Rounding
将时间归并到近似单元
floor_date()
round_date()
-
ceiling_date()
(需要指定unit)
floor_date(today(), unit = "year") + months(0:11)
#> [1] "2019-01-01" "2019-02-01" "2019-03-01" "2019-04-01" "2019-05-01"
#> [6] "2019-06-01" "2019-07-01" "2019-08-01" "2019-09-01" "2019-10-01"
#> [11] "2019-11-01" "2019-12-01"
flights_dt %>%
count(week = floor_date(dep_time, "week")) %>%
ggplot(aes(week, n)) +
geom_line()
- Setting components
# Components can be set one at a time through the accessor functions
(datetime <- ymd_hms("2016-07-08 12:34:56"))
#> [1] "2016-07-08 12:34:56 UTC"
year(datetime) <- 2020
datetime
#> [1] "2020-07-08 12:34:56 UTC"
month(datetime) <- 01
datetime
#> [1] "2020-01-08 12:34:56 UTC"
hour(datetime) <- hour(datetime) + 1
datetime
#> [1] "2020-01-08 13:34:56 UTC"
# update() can set several components in a single call
update(datetime, year = 2020, month = 2, mday = 2, hour = 2)
#> [1] "2020-02-02 02:34:56 UTC"
# Values that are too large roll over into the next unit
ymd("2015-02-01") %>%
update(mday = 30)
#> [1] "2015-03-02"
ymd("2015-02-01") %>%
update(hour = 400)
#> [1] "2015-02-17 16:00:00 UTC"
- Q: How does the distribution of flight times within a day change over the course of the year?
# Overlay, per month, the distribution of departure times within a day.
# update(dep_time, yday = 1) moves every departure onto day 1 of its year so
# that only the time of day varies along the x axis.
flights_dt %>%
  filter(!is.na(dep_time)) %>%
  mutate(dep_hour = update(dep_time, yday = 1)) %>%
  mutate(month = factor(month(dep_time))) %>%
  ggplot(aes(dep_hour, color = month)) +
  # after_stat(density) replaces the deprecated `..density..` notation;
  # binwidth is in seconds, so 60 * 60 gives one-hour bins.
  geom_freqpoly(aes(y = after_stat(density)), binwidth = 60 * 60)
Time spans
- Durations
Durations always record the time span in seconds.
c_age <- today() - ymd(19941027)
c_age
# Time difference of 9314 days
as.duration(c_age) # 先转换为s
# [1] "804729600s (~25.5 years)"
dseconds(15)
#> [1] "15s"
dminutes(10)
#> [1] "600s (~10 minutes)"
dhours(c(12, 24))
#> [1] "43200s (~12 hours)" "86400s (~1 days)"
ddays(0:5)
#> [1] "0s" "86400s (~1 days)" "172800s (~2 days)"
#> [4] "259200s (~3 days)" "345600s (~4 days)" "432000s (~5 days)"
dweeks(3)
#> [1] "1814400s (~3 weeks)"
dyears(1)
#> [1] "31536000s (~52.14 weeks)"
# Durations support arithmetic
2 * dyears(1)
#> [1] "63072000s (~2 years)"
dyears(1) + dweeks(12) + dhours(15)
#> [1] "38847600s (~1.23 years)"
tomorrow <- today() + ddays(1);tomorrow
# [1] "2020-04-28"
last_year <- today() - dyears(1)
# last_year is "2019-04-28" when today() is 2020-04-27 and dyears(1) is
# 31536000s (365 days), as printed for dyears(1) earlier in these notes.
# NOTE(review): newer lubridate releases define dyears(1) as 365.25 days, so
# the value may differ there — verify against the installed version.
- Periods
Periods are time spans but don’t have a fixed length in seconds, instead they work with “human” times, like days and months.
# Durations add a fixed number of seconds, so the clock time shifts when the
# time zone switches from EST to EDT
one_pm <- ymd_hms("2016-03-12 13:00:00", tz = "America/New_York")
one_pm
#> [1] "2016-03-12 13:00:00 EST"
one_pm + ddays(1)
#> [1] "2016-03-13 14:00:00 EDT"
# Periods keep the clock time unchanged across the same switch
one_pm
#> [1] "2016-03-12 13:00:00 EST"
one_pm + days(1)
#> [1] "2016-03-13 13:00:00 EDT"
# Another example
# A leap year
ymd("2016-01-01") + dyears(1)
#> [1] "2016-12-31"
ymd("2016-01-01") + years(1)
#> [1] "2017-01-01"
- 多数时候periods和durations用法类似
但解析的是“human units”而不是durations中的秒
seconds(15)
#> [1] "15S"
minutes(10)
#> [1] "10M 0S"
hours(c(12, 24))
#> [1] "12H 0M 0S" "24H 0M 0S"
days(7)
#> [1] "7d 0H 0M 0S"
months(1:6)
#> [1] "1m 0d 0H 0M 0S" "2m 0d 0H 0M 0S" "3m 0d 0H 0M 0S" "4m 0d 0H 0M 0S"
#> [5] "5m 0d 0H 0M 0S" "6m 0d 0H 0M 0S"
weeks(3)
#> [1] "21d 0H 0M 0S"
years(1)
#> [1] "1y 0m 0d 0H 0M 0S"
# Periods support arithmetic as well
10 * (months(6) + days(1))
#> [1] "60m 10d 0H 0M 0S"
days(50) + hours(25) + minutes(2)
#> [1] "50d 25H 2M 0S"
- Intervals
涉及的符号:
%--%
years(1) / days(1)
# [1] 365.25
next_year <- today() + years(1)
(today() %--% next_year) / ddays(1)
# [1] 365
(today() %--% next_year) %/% days(1)
# [1] 365
- Summary
If you only care about physical time, use a duration;
if you need to add human times, use a period;
if you need to figure out how long a span is in human units, use an interval.
Time zones
用的少,就放两个可能用到的代码
Sys.timezone()
# [1] "Asia/Taipei"
ymd_hms("2020-04-27 12:00:00", tz = Sys.timezone())
# [1] "2020-04-27 12:00:00 CST"
网友评论