《R for Data Science》第十九章 Functions 啃书知识点积累
参考链接:R for Data Science
When should you write a function?
“do not repeat yourself” (or DRY) principle
适用于超过两次的重复代码操作
df <- tibble::tibble(
a = rnorm(10),
b = rnorm(10),
c = rnorm(10),
d = rnorm(10)
)
df$a <- (df$a - min(df$a, na.rm = TRUE)) /
(max(df$a, na.rm = TRUE) - min(df$a, na.rm = TRUE))
df$b <- (df$b - min(df$b, na.rm = TRUE)) /
(max(df$b, na.rm = TRUE) - min(df$a, na.rm = TRUE))
df$c <- (df$c - min(df$c, na.rm = TRUE)) /
(max(df$c, na.rm = TRUE) - min(df$c, na.rm = TRUE))
df$d <- (df$d - min(df$d, na.rm = TRUE)) /
(max(df$d, na.rm = TRUE) - min(df$d, na.rm = TRUE))
-
range
函数:返回给定向量的最小值和最大值
range(c(1,3,5))
# [1] 1 5
range(c(11,32,25))
# [1] 11 32
# 上述代码可以简化为
rescale01 <- function(x) {
rng <- range(x, na.rm = TRUE)
(x - rng[1]) / (rng[2] - rng[1])
}
df$a <- rescale01(df$a)
df$b <- rescale01(df$b)
df$c <- rescale01(df$c)
df$d <- rescale01(df$d)
# 实际上课本中的代码可以进一步简化为
apply(df, 2, rescale01)
- 优化函数,消除无限值的干预
rescale01(x)
# [1] 0 0 0 0 0 0 0 0 0 0 NaN
# 故修改函数体
rescale01 <- function(x) {
rng <- range(x, na.rm = TRUE, finite = TRUE)
(x - rng[1]) / (rng[2] - rng[1])
}
rescale01(x)
# [1] 0.0000000 0.1111111 0.2222222 0.3333333 0.4444444 0.5555556 0.6666667
# [8] 0.7777778 0.8888889 1.0000000 Inf
-
na.rm
和finite
的综合应用
rescale1 <- function(x, na.rm) {
rng <- range(x, na.rm = na.rm)
(x - rng[1]) / (rng[2] - rng[1])
}
rescale1(c(NA, 1:5), na.rm = FALSE)
#> [1] NA NA NA NA NA NA
rescale1(c(NA, 1:5), na.rm = TRUE)
#> [1] NA 0.00 0.25 0.50 0.75 1.00
# 加上finite后na.rm的设置不重要了
rescale2 <- function(x, na.rm) {
rng <- range(x, na.rm = na.rm, finite = TRUE)
(x - rng[1]) / (rng[2] - rng[1])
}
rescale2(c(NA, 1:5), na.rm = FALSE)
#> [1] NA 0.00 0.25 0.50 0.75 1.00
rescale2(c(NA, 1:5), na.rm = TRUE)
#> [1] NA 0.00 0.25 0.50 0.75 1.00
原因:The option
finite = TRUE
torange()
will drop all non-finite elements, andNA
is a non-finite element.
另外可以注意,无限值可以相互比较
-Inf == -Inf
# [1] TRUE
- 两个在环境中实用的脚本函数
# 判断path是否是文件夹
is_directory <- function(x) file.info(x)$isdir
# 判断文件是否可读(存在且有权限打开)
is_readable <- function(x) file.access(x, 4) == 0
- 函数设置关键字变量默认值即可空值调用
greet <- function(time = lubridate::now()) {
hr <- lubridate::hour(time)
# I don't know what to do about times after midnight,
# are they evening or morning?
if (hr < 12) {
print("good morning")
} else if (hr < 17) {
print("good afternoon")
} else {
print("good evening")
}
}
greet()
Functions are for humans and computers
ctrl + shift + R
# - -----------------------------------------------------------------------
Conditional execution
if (condition) {
# code executed when condition is TRUE
} else {
# code executed when condition is FALSE
}
- Conditions
The condition must evaluate to either
TRUE
orFALSE
.
If it’s a vector, you’ll get a warning message; if it’s anNA
, you’ll get an error.
if (c(TRUE, FALSE)) {}
#> Warning in if (c(TRUE, FALSE)) {: the condition has length > 1 and only the
#> first element will be used
#> NULL
if (NA) {}
#> Error in if (NA) {: missing value where TRUE/FALSE needed
- 短路逻辑表达式拼接
||
&&
用于逻辑表达式的判断,不是向量化操作符|
和 &
As soon as
||
sees the firstTRUE
it returnsTRUE
without computing anything else. As soon as&&
sees the firstFALSE
it returnsFALSE
.
- 检查向量相等
# identical是严格检测类型,返回单个逻辑判断
identical(0L, 0)
#> [1] FALSE
- Multiple conditions
else if
if (this) {
# do that
} else if (that) {
# do something else
} else {
#
}
-
switch
适用于判断较多的情况
multi_op <- function(x, y, op) {
switch(op,
plus = x + y,
minus = x - y,
times = x * y,
divide = x / y,
stop("Unknown op!")
)
}
switch
的其他用法
# 如果是整数则按索引返回
switch(1, "apple", "banana", "cantaloupe")
#> [1] "apple"
switch(2, "apple", "banana", "cantaloupe")
#> [1] "banana"
# 如果是非整数则忽略小数部分
switch(1.2, "apple", "banana", "cantaloupe")
#> [1] "apple"
switch(2.8, "apple", "banana", "cantaloupe")
#> [1] "banana"
switch
涉及缺失值和无表达式的情况
switcheroo <- function(x) {
switch(x,
a = ,
b = "ab",
c = NA,
d = "cd"
)
}
# a涉及的表达式为空,则轮次到下一个非空表达式共享给a
switcheroo("a")
#> [1] "ab"
switcheroo("b")
#> [1] "ab"
switcheroo("c")
#> [1] NA
switcheroo("d")
#> [1] "cd"
switcheroo("e")
- Code style
- An opening curly brace should never go on its own line and should always be followed by a new line.
- A closing curly brace should always go on its own line, unless it’s followed by else.
- Always indent the code inside curly braces.
- It’s ok to drop the curly braces if you have a very short if statement that can fit on one line.
y <- 10
x <- if (y < 20) "Too low" else "Too high"
Notice that when you call a function, you should place a space around
=
in function calls, and always put a space after a comma, not before (just like in regular English). Using whitespace makes it easier to skim the function for the important components.
# Good
average <- mean(feet / 12 + inches, na.rm = TRUE)
- “Fizz Buzz” 问题的几种解法
课本给出的这一解法个人认为不应该用短路&&
否则一旦!(x %% 3)
为假时短路判断
下一个else if
跟得依然是!(x %% 3)
,直接调转下一个
实际是多了一步无效代码
应该为:
fizzbuzz <- function(x) {
# these two lines check that x is a valid input
stopifnot(length(x) == 1)
stopifnot(is.numeric(x))
if (!(x %% 3) & !(x %% 5)) {
"fizzbuzz"
} else if (!(x %% 3)) {
"fizz"
} else if (!(x %% 5)) {
"buzz"
} else {
# ensure that the function returns a character vector
as.character(x)
}
}
# 第二种解法用嵌套判断增加科学性
fizzbuzz2 <- function(x) {
# these two lines check that x is a valid input
stopifnot(length(x) == 1)
stopifnot(is.numeric(x))
if (!(x %% 3)) {
if (!(x %% 5)) {
"fizzbuzz"
} else {
"fizz"
}
} else if (!(x %% 5)) {
"buzz"
} else {
# ensure that the function returns a character vector
as.character(x)
}
}
# 第三种解法是经典case_when,向量化
fizzbuzz_vec <- function(x) {
case_when(
!(x %% 3) & !(x %% 5) ~ "fizzbuzz",
!(x %% 3) ~ "fizz",
!(x %% 5) ~ "buzz",
TRUE ~ as.character(x)
)
}
# 第四种解法是利用向量完成向量化
fizzbuzz_vec2 <- function(x) {
y <- as.character(x)
# put the individual cases first - any elements divisible by both 3 and 5
# will be overwritten with fizzbuzz later
y[!(x %% 3)] <- "fizz"
y[!(x %% 3)] <- "buzz"
y[!(x %% 3) & !(x %% 5)] <- "fizzbuzz"
y
}
- cut
函数适用于有序判定
if (temp <= 0) {
"freezing"
} else if (temp <= 10) {
"cold"
} else if (temp <= 20) {
"cool"
} else if (temp <= 30) {
"warm"
} else {
"hot"
}
# 可简化为(右闭)
temp <- seq(-10, 50, by = 5)
cut(temp, c(-Inf, 0, 10, 20, 30, Inf),
right = TRUE,
labels = c("freezing", "cold", "cool", "warm", "hot")
)
#> [1] freezing freezing freezing cold cold cool cool
#> [8] warm warm hot hot hot hot
#> Levels: freezing cold cool warm hot
# 也可以改成右开,即小于号
temp <- seq(-10, 50, by = 5)
cut(temp, c(-Inf, 0, 10, 20, 30, Inf),
right = FALSE,
labels = c("freezing", "cold", "cool", "warm", "hot")
)
#> [1] freezing freezing cold cold cool cool warm
#> [8] warm hot hot hot hot hot
#> Levels: freezing cold cool warm hot
Function arguments
The default value should almost always be the most common value.
The few exceptions to this rule are to do with safety. For example, it makes sense forna.rm
to default toFALSE
because missing values are important. Even thoughna.rm = TRUE
is what you usually put in your code, it’s a bad idea to silently ignore missing values by default.
- Choosing names
- 常默认的简易变量命名
- Checking values
- 要善于施加约束
wt_mean <- function(x, w) {
sum(x * w) / sum(w)
}
wt_var <- function(x, w) {
mu <- wt_mean(x, w)
sum(w * (x - mu) ^ 2) / sum(w)
}
wt_sd <- function(x, w) {
sqrt(wt_var(x, w))
}
# 这里权重和数据长度不同,但由于循环补齐依然能运行
wt_mean(1:6, 1:3)
#> [1] 7.67
It’s good practice to check important preconditions, and throw an error (with
stop()
)
wt_mean <- function(x, w) {
if (length(x) != length(w)) {
stop("`x` and `w` must be the same length", call. = FALSE)
}
sum(w * x) / sum(w)
}
# 如果进一步完善代码会变得更复杂
wt_mean <- function(x, w, na.rm = FALSE) {
if (!is.logical(na.rm)) {
stop("`na.rm` must be logical")
}
if (length(na.rm) != 1) {
stop("`na.rm` must be length 1")
}
if (length(x) != length(w)) {
stop("`x` and `w` must be the same length", call. = FALSE)
}
if (na.rm) {
miss <- is.na(x) | is.na(w)
x <- x[!miss]
w <- w[!miss]
}
sum(w * x) / sum(w)
}
# 可以使用stopifnot函数,断言需要为真表达式
wt_mean <- function(x, w, na.rm = FALSE) {
stopifnot(is.logical(na.rm), length(na.rm) == 1)
stopifnot(length(x) == length(w))
if (na.rm) {
miss <- is.na(x) | is.na(w)
x <- x[!miss]
w <- w[!miss]
}
sum(w * x) / sum(w)
}
wt_mean(1:6, 6:1, na.rm = "foo")
#> Error in wt_mean(1:6, 6:1, na.rm = "foo"): is.logical(na.rm) is not TRUE
- Dot-dot-dot (…)
有些函数允许任意数量输入:
sum(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
#> [1] 55
stringr::str_c("a", "b", "c", "d", "e", "f")
#> [1] "abcdef"
They rely on a special argument:
...
(pronounced dot-dot-dot). This special argument captures any number of arguments that aren’t otherwise matched.
commas <- function(...) stringr::str_c(..., collapse = ", ")
commas(letters[1:10])
#> [1] "a, b, c, d, e, f, g, h, i, j"
rule <- function(..., pad = "-") {
title <- paste0(...)
width <- getOption("width") - nchar(title) - 5
cat(title, " ", stringr::str_dup(pad, width), "\n", sep = "")
}
rule("Important output")
#> Important output -----------------------------------------------------------
...
的代价:关键字参数拼错不会报错,答案可能出错
sum(x, na.rm = TRUE)
# [1] 3
sum(x, na.mr = TRUE)
# [1] 4
Return values
complicated_function <- function(x, y, z) {
if (length(x) == 0 || length(y) == 0) {
return(0)
}
# Complicated code here
}
Environment
lexical scoping
f <- function(x) {
x + y
}
y <- 100
f(10)
#> [1] 110
y <- 1000
f(10)
#> [1] 1010
网友评论