Data wrangling

Creating tibbles

- 强转data.frame

# Coerce an existing data.frame to a tibble.
# as.tibble() is deprecated; as_tibble() is the current spelling.
tibble::as_tibble(iris)
#> # A tibble: 150 x 5
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#>          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
#> 1          5.1         3.5          1.4         0.2 setosa 
#> 2          4.9         3            1.4         0.2 setosa 
#> 3          4.7         3.2          1.3         0.2 setosa 
#> 4          4.6         3.1          1.5         0.2 setosa 
#> 5          5           3.6          1.4         0.2 setosa 
#> 6          5.4         3.9          1.7         0.4 setosa 
#> # … with 144 more rows

- 常规创建tibble

# Build a tibble column by column: the scalar y is recycled to the length
# of x, and z is computed from the x and y columns defined just before it.
tibble(x = 1:5, y = 1, z = x^2 + y)
#> # A tibble: 5 x 3
#>       x     y     z
#>   <int> <dbl> <dbl>
#> 1     1     1     2
#> 2     2     1     5
#> 3     3     1    10
#> 4     4     1    17
#> 5     5     1    26

It’s possible for a tibble to have column names that are not valid R variable names, aka non-syntactic names.

# Backticks allow column names that are not valid R identifiers.
tb <- tibble(`:)` = "smile", ` ` = "space", `2000` = "number")
tb
#> # A tibble: 1 x 3
#>   `:)`  ` `   `2000`
#>   <chr> <chr> <chr> 
#> 1 smile space number

- tribble()

允许少量数据横向构建tibble，易于观察
（常规用#注释标注表头）

# Row-wise construction: the ~name formulas form the header, followed by
# one line of values per row. The comment line only separates header from data.
tb1 <- tribble(
  ~x,  ~y, ~z,
  #--|---|----
  "a",  2, 3.6,
  "b",  1, 8.5
)
# Printing tb1 shows:
#> # A tibble: 2 x 3
#>   x         y     z
#>   <chr> <dbl> <dbl>
#> 1 a         2   3.6
#> 2 b         1   8.5

# A tibble is still a data.frame, with two extra classes in front.
class(tb1)
#> [1] "tbl_df"     "tbl"        "data.frame"

Tibbles vs. data.frame

- print()

参数n控制显示的行数

# A 1,000-row tibble with a date-time, date, integer, double and character
# column; printing shows only the first rows plus a "more rows" footer.
tibble(
  a = lubridate::now() + 86400 * runif(1e3),
  b = lubridate::today() + 30 * runif(1e3),
  c = seq_len(1e3),
  d = runif(1e3),
  e = sample(letters, size = 1e3, replace = TRUE)
)
#> # A tibble: 1,000 x 5
#>   a                   b              c     d e    
#>   <dttm>              <date>     <int> <dbl> <chr>
#> 1 2020-01-15 20:43:23 2020-01-22     1 0.368 n    
#> 2 2020-01-16 14:48:32 2020-01-27     2 0.612 l    
#> 3 2020-01-16 09:12:12 2020-02-06     3 0.415 p    
#> 4 2020-01-15 22:33:29 2020-02-05     4 0.212 m    
#> 5 2020-01-15 18:57:45 2020-02-02     5 0.733 i    
#> 6 2020-01-16 05:58:42 2020-01-29     6 0.460 n    
#> # … with 994 more rows

# Print with the default row limit.
print(tb1)

# Print at most the first 3 rows.
print(tb1, n = 3)

参数width设置Inf则显示所有列，默认则为适应宽度
参数n_extra设置不直接显示而在页脚出现的列名个数，默认全显示

width: Width of text output to generate. This defaults to NULL, which means use getOption("tibble.width") or (if also NULL) getOption("width"); the latter displays only the columns that fit on one screen. You can also set options(tibble.width = Inf) to override this default and always print all columns.
n_extra: Number of extra columns to print abbreviated information for, if the width is too small for the entire tibble. If NULL, the default, will print information about at most tibble.max_extra_cols extra columns.

# First 3 rows, default width.
print(nycflights13::flights, n = 3)

# First 3 rows, every column printed regardless of console width.
print(nycflights13::flights, n = 3, width = Inf)

# First 3 rows, with abbreviated info for at most 5 of the extra columns.
# NOTE(review): n_extra appears to be deprecated in recent tibble/pillar
# releases in favour of max_extra_cols - confirm against the installed version.
print(nycflights13::flights, n = 3, n_extra = 5)

You can also control the default print behaviour by setting options:

options(tibble.print_max = n, tibble.print_min = m): if more than n rows, print only m rows. Use options(tibble.print_min = Inf) to always show all rows.

Use options(tibble.width = Inf) to always print all columns, regardless of the width of the screen.

- lubridate包

补充lubridate包用法，部分参考：Learn R | 日期时间处理之lubridate包

(1) 解析时点

library(lubridate)

# Parse a bare number as a year-month-day date.
ymd(20200406)
# [1] "2020-04-06"


# tz sets the time zone of the parsed date-time.
ymd_hms("20200406 21:30:30", tz = "Pacific/Auckland")
# [1] "2020-04-06 21:30:30 NZST"


# ymd() copes with inconsistent separators and surrounding text.
test_date <- c(
  "2020-04-06", "2020 04 06", "2020-4-6",
  "2020-4, 6", "Created on 2020 4 06", "202004 !!! 06"
)
ymd(test_date)
# [1] "2020-04-06" "2020-04-06" "2020-04-06" "2020-04-06" "2020-04-06" "2020-04-06"


# parse_date_time() tries several candidate orders on the same vector.
# The argument is spelled `orders` in full so the call does not rely on
# partial argument matching, and each candidate order is listed once.
test_date <- c(
  "20200406", "120315", "12/17/1996", "09-01-01",
  "2015 12 23", "2009-1, 5", "Created on 2013 4 6"
)
parse_date_time(test_date, orders = c("ymd", "mdy", "dmy"))
# [1] "2020-04-06 UTC" "2015-12-03 UTC" "1996-12-17 UTC" "2009-01-01 UTC"
# [5] "2015-12-23 UTC" "2009-01-05 UTC" "2013-04-06 UTC"

(2) 查改时点

second()，minute()，hour()，day()，wday()，yday()，week()，month()，year()，tz()分别可以提取秒，分，小时，天，周的第几天，年的第几天，年的第几周，月，年和时区的信息

# Accessor functions read one component of a date-time; the matching
# replacement form (`accessor(x) <- value`) overwrites that component.
test_date <- ymd_hms('2020/04/06/21/10/42')

# Read the seconds component.
second(test_date)
# [1] 42

# Overwrite the seconds component in place, then read it back.
second(test_date) <- 32
second(test_date)
# [1] 32

# Day of the week as a number (2020-04-06 is reported as day 2).
wday(test_date)
# [1] 2
# With label = TRUE the result is an ordered factor of weekday names.
# The labels follow the session locale; the English-locale form is shown
# here (the notes were originally run under a Chinese locale).
wday(test_date, label = TRUE)
# [1] Mon
# Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat

(3) 日期计算

# Periods: spans expressed in clock/calendar units.
minutes(1)
# [1] "1M 0S"
# Durations (same helpers with a "d" prefix): spans expressed in seconds.
dminutes(1)
# [1] "60s"

leap_year(2020)
# [1] TRUE
# Adding a period of one year moves the calendar date forward by a year.
ymd(20200101) + years(1)
# [1] "2021-01-01"
# Adding a duration of one year adds a fixed length of time instead; 2020 is
# a leap year, so the result shown lands one day short of 2021-01-01.
# NOTE(review): newer lubridate releases appear to define dyears(1) as
# 365.25 days, which would change this output - confirm against the
# installed version.
ymd(20200101) + dyears(1)
# [1] "2020-12-31"

# test_date is still the date-time from the previous section
# (2020-04-06 21:10:32 UTC); weeks(0:5) adds 0 to 5 weeks in one-week steps.
test_date + weeks(0:5)
# [1] "2020-04-06 21:10:32 UTC" "2020-04-13 21:10:32 UTC" "2020-04-20 21:10:32 UTC"
# [4] "2020-04-27 21:10:32 UTC" "2020-05-04 21:10:32 UTC" "2020-05-11 21:10:32 UTC"

# Use %m+% when adding or subtracting months from a date.
test_date <- as.Date("2020-04-06")
test_date %m+% months(20:30)
# [1] "2021-12-06" "2022-01-06" "2022-02-06" "2022-03-06" "2022-04-06" "2022-05-06"
# [7] "2022-06-06" "2022-07-06" "2022-08-06" "2022-09-06" "2022-10-06"

- Subsetting

# Two random numeric columns to practise extraction on.
df <- tibble(x = runif(5), y = rnorm(5))

# Pull one column out as a plain vector, by name: `$` or `[[` with a string.
df$x
#> [1] 0.7330 0.2344 0.6604 0.0329 0.4605
df[["x"]]
#> [1] 0.7330 0.2344 0.6604 0.0329 0.4605

# The same column, addressed by its position instead.
df[[1]]
#> [1] 0.7330 0.2344 0.6604 0.0329 0.4605

结合管道标识符

# Inside a %>% pipeline, `.` stands for the left-hand side, so the usual
# extraction operators can be applied to it.
# Extract column x with `$`.
df %>% 
  .$x
#> [1] 0.7330 0.2344 0.6604 0.0329 0.4605
# Extract column x with `[[`.
df %>% 
  .[["x"]]
#> [1] 0.7330 0.2344 0.6604 0.0329 0.4605

- Q: Compare and contrast the following operations on a data.frame and equivalent tibble. What is different?

# On a base data.frame, `$` partial-matches column names.
df <- data.frame(abc = 1, xyz = "a")
df$x
# [1] a
# Levels: a
# df has no column `x`, only `xyz`, yet the prefix match still returns data.
# NOTE(review): the factor output implies strings were converted to factors
# (the default before R 4.0); on newer R this presumably prints [1] "a".

# The equivalent tibble refuses to guess.
df <- tibble(abc = 1, xyz = "a")
df$x
# NULL
# Warning message:
# Unknown or uninitialised column: 'x'.
# A tibble does not partial-match column names.

- Q: If you have the name of a variable stored in an object, e.g. var <- "mpg", how can you extract the reference variable from a tibble?

df <- tribble(
  ~x, ~y, ~mpg,
  #--|---|-----
  1,   2,  "a",
  3,   4,  "b"
)

# The column name is held in a variable rather than typed literally.
var <- "mpg"

# `$` looks for a column literally called "var", so this finds nothing.
df$var
# NULL
# Warning message:
# Unknown or uninitialised column: 'var'.

# `[[` uses the value stored in var, so the mpg column is returned.
df[[var]]
# [1] "a" "b"

- Q: Practice referring to non-syntactic names

# A tibble whose column names are the non-syntactic names `1` and `2`.
annoying <- tibble(
  `1` = 1:10,
  `2` = `1` * 2 + rnorm(length(`1`))
)

# Extract the variable called 1: by backticked name ...
annoying$`1`
# ... or, equivalently, by position.
annoying[[1]]
# [1]  1  2  3  4  5  6  7  8  9 10

# Plot a scatterplot of 1 vs 2.
ggplot(annoying, aes(x = `1`, y = `2`)) +
  geom_point()

# Create a new column called 3, which is 2 divided by 1.
mutate(annoying, `3` = `2` / `1`)

# Rename the columns to one, two and three.
annoying %>%
  mutate(`3` = `2` / `1`) %>%
  rename(one = `1`, two = `2`, three = `3`)

- Q: What do tibble::enframe() and deframe() do?

'''
enframe() converts named atomic vectors or lists to one- or two-column data frames. For a list, the result will be a nested tibble with a column of type list. For unnamed vectors, the natural sequence is used as name column.
deframe() converts two-column data frames to a named vector or list, using the first column as name and the second column as value. If the input has only one column, an unnamed vector is returned.
'''

# enframe(): vector or list -> one- or two-column tibble.
# Unnamed vector: the natural sequence is used as the name column.
enframe(1:3)
# Named vector: the names become the name column.
enframe(c(a = 5, b = 7))
# Named list: the value column is a list column (nested tibble).
enframe(list(one = 1, two = 2:3, three = 4:6))

# deframe(): two-column data frame -> named vector or list
# (first column as names, second column as values).
deframe(enframe(1:3))
# One-column input: an unnamed vector (or list) is returned.
deframe(tibble(a = 1:3))
deframe(tibble(a = as.list(1:3)))

# Round trip of a named list; the closing parenthesis of the outer call
# is required for the line to parse.
deframe(enframe(list(one = 1, two = 2:3, three = 4:6)))