缘起
整理文件时发现之前的笔记,放到这里。
目的
坐车等无聊情景下方便回顾这些技能。
# this is a note about the book [R For Data Science]
# and a little bitter of [R In Action]
# and [Advanced R]
# Rujiang Liang
# Created in 2018
# R FOR DATA SCIENCE ------------------------------------------------------
# DATA TRANSFORMATION -----------------------------------------------------
library(tidyverse)
library(nycflights13)
nycflights13::flights %>%
print(n = 10, width = Inf) # width = Inf to show all variables
iris %>%
.$Species %>%
table() %>%
(function(x) x / 3) %>%
(function(x) x + 2) %>%
`*`(2) %>%
round() %>%
as.character()
diamonds2 <- diamonds %>%
filter(between(y, 3, 20))
iris %>%
.[["Species"]]
jan1 <- flights %>%
filter(month == 1, day == 1)
test <- flights %>%
filter(month == 11 | month == 12)
table(test$month)
filter(flights, month %in% c(11, 12)) %>%
count(month)
# ***FALSE****
filter(flights, month == 11 | 12) %>% count(month)
flights %>%
arrange(year, month, day)
flights %>%
arrange(desc(dep_delay))
# SELECT helper
# starts_with("abc"): matches names that begin with “abc”.
# ends_with("xyz"): matches names that end with “xyz”.
# contains("ijk"): matches names that contain “ijk”.
# matches("(.)\\1"): selects variables that match a regular expression.
# num_range("x", 1:3): matches x1, x2 and x3
flights %>%
rename(YEAR = year, MONTH = month)
select(flights, time_hour, air_time, everything())
# generate variables by existing variables
flights %>%
select(
year:day,
ends_with("delay"),
distance,
air_time
) %>%
mutate(
gain = dep_delay - arr_delay,
hour = air_time / 60,
gaim_per_hour = gain / hour
)
# keep ONLY new variables
transmute(flights,
gain = dep_delay - arr_delay,
hour = air_time / 60,
gain_per_hour = gain / hour
)
transmute(flights,
dep_time, # so that we can keep oringal variables
hour = dep_time %/% 100,
minute = dep_time %% 100
)
# group_by summary
table1 <- tibble(
a = c(1, 2),
b = c(100, 200)
)
table1 %>%
count(a)
table1 %>%
count(a, wt = b)
flights %>%
group_by(year, month, day) %>%
summarise(count = n(), # MEAN or SD calculated by how many cases?
delay = mean(dep_delay, na.rm = T)) # na.rm = T should not be omited
# unless the data have no NA.
flights %>%
group_by(dest) %>%
summarise(
count = n(),
dist = mean(distance, na.rm = T),
delay = mean(arr_delay, na.rm = T)
) %>%
filter(count > 20, dest != "HNL")
not_cancelled <- flights %>%
filter(!is.na(dep_delay), !is.na(arr_delay))
not_cancelled %>%
group_by(year, month, day) %>%
summarise(mean = mean(dep_delay),
n = n())
delays <- not_cancelled %>%
group_by(tailnum) %>%
summarise(delay = mean(arr_delay)) %>%
ggplot(aes(x = delay)) +
geom_freqpoly(binwidth = 10)
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
first = min(dep_time),
last = max(dep_time)
)
#first(x), nth(x, 2), last(x) work similarly to x[1], x[2], x[length(x)]
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
first_dep = first(dep_time),
last_dep = last(dep_time),
quantile = quantile(dep_time, .75)
)
iris %>%
group_by(Species) %>%
summarise(
mean = mean(Sepal.Width),
quantile = quantile(Sepal.Length, .75))
sapply(Filter(is.numeric, iris), function(i) list(mean = mean(i), sd = sd(i)))
funs <- list(min = min, median = median, mean = mean, max = max, sd = sd)
sapply(funs, function(f) f(1:10, na.rm = T))
sapply(Filter(is.numeric, iris),
function(x) sapply(funs, function(f) f(x, na.rm = T)))
iris %>%
split(.$Species) %>%
sapply(., function(df)
list(mean = mean(df$Sepal.Width),
quantile = quantile(df$Sepal.Length, .75)))
iris %>%
summarise(n())
iris %>%
group_by(Species) %>%
summarise(n(),
n_distinct(Sepal.Length),
sum(!is.na(Sepal.Length)))
not_cancelled %>%
group_by(year, month, day) %>%
mutate(r = min_rank(desc(dep_time))) %>%
filter(r %in% range(r))
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
avg_delay1 = mean(arr_delay),
avg_delay2 = mean(arr_delay[arr_delay > 0]) # the average positive delay
)
#n(), sum(!is.na(x)), n_distinct(x)
not_cancelled %>%
group_by(dest) %>%
summarise(carriers = n_distinct(carrier)) %>%
arrange(desc(carriers))
not_cancelled %>%
group_by(year, month, day) %>%
filter(rank((arr_delay)) < 10)
iris %>%
as.tibble %>%
filter(rank(Sepal.Width) < 10)
iris %>%
as.tibble %>%
filter(min_rank(Sepal.Width) < 10)
iris %>%
as.tibble %>%
filter(rank(desc(Sepal.Width)) < 10)
popular_dests <- flights %>%
group_by(dest) %>%
filter(n() > 365)
popular_dests
popular_dests %>%
filter(arr_delay > 0) %>%
mutate(prop_delay = arr_delay / sum(arr_delay)) %>%
select(year:day, dest, arr_delay, prop_delay)
# related database (merge)
library(nycflights13)
# left_join: by = c("dest" = "faa")
top_dest <- flights %>%
count(dest, sort = T) %>%
head(10)
flights %>%
semi_join(top_dest)
## the below part of related database is not important
library(nycflights13)
airlines
airports
planes
planes %>%
count(tailnum) %>%
filter(n > 1)
weather %>%
count(year, month, day, hour, origin) %>%
filter(n > 1)
flights %>%
count(year, month, day, hour, flight) %>%
filter(n > 1)
flights <- flights %>%
mutate(id = row_number())
flights
flights %>%
count(id) %>%
filter(n > 1)
flights2 <- nycflights13::flights %>%
select(year:day, hour, origin, dest, tailnum, carrier)
flights2
flights2 %>%
select(-origin, -dest) %>%
left_join(airlines, by = "carrier")
flights2 %>%
select(-origin, -dest) %>%
mutate(name = airlines$name[match(carrier, airlines$carrier)])
x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
2, "x3",
1, "x4"
)
y <- tribble(
~key, ~val_y,
1, "y1",
2, "y2"
)
left_join(x, y, by = "key")
left_join(y, x, by = "key")
flights2 %>%
left_join(weather)
flights2 %>%
left_join(planes, by = "tailnum")
flights2 %>%
left_join(airports, by = c("dest" = "faa"))
flights2 %>%
left_join(airports, by = c("origin" = "faa"))
airports %>%
semi_join(flights, c("faa" = "dest")) %>%
ggplot(aes(lon, lat)) +
borders("state") +
geom_point(color = "blue") +
coord_quickmap() +
theme_void()
top_dest <- flights %>%
count(dest, sort = T) %>%
head(10)
flights %>%
filter(dest %in% top_dest$dest)
flights %>%
semi_join(top_dest, by = "dest")
# spreading and gathering
library(tidyverse)
table4a
tidy4a <- table4a %>%
gather(`1999`, `2000`, key = "year", value = "cases")
tidy4a
table4b
tidy4b <- table4b %>%
gather(`1999`, `2000`, key = "year", value = "population")
tidy4a %>%
left_join(tidy4b, by = c("country", "year"))
table2
table2 %>%
spread(key = type, value = count)
table3
table3 %>%
separate(rate, into = c("cases", "population"))
table3 %>%
separate(rate, into = c("cases", "population"), sep = "/", convert = T)
table3 %>%
separate(year, into = c("century", "year"), sep = 2)
table5
table5 %>%
unite(new, century, year)
table5 %>%
unite(new, century, year, sep = "")
stocks <- tibble(
year = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
qtr = c( 1, 2, 3, 4, 2, 3, 4),
return = c(1.88, 0.59, 0.35, NA, 0.92, 0.17, 2.66)
)
stocks
stocks %>%
spread(year, return)
stocks %>%
spread(year, return) %>%
gather(`2015`:`2016`, key = "year", value = "return")
stocks %>%
spread(year, return) %>%
gather(`2015`:`2016`, key = "year", value = "return", na.rm = T)
women %>%
mutate(id = row_number()) %>%
gather(-id, key = "key", value = "value")
stocks %>%
complete(year, qtr)
treatment <- tribble(
~ person, ~ treatment, ~response,
"Derrick Whitmore", 1, 7,
NA, 2, 10,
NA, 3, 9,
"Katherine Burke", 1, 4
)
treatment
treatment %>%
fill(person)
# an example of data cleaning
library(tidyverse)
who
who1 <- who %>%
gather(new_sp_m014:newrel_f65, key = "key", value = "case", na.rm = T)
who1
who1 %>%
count(key)
who2 <- who1 %>%
mutate(key = str_replace(key, "newrel", "new_rel"))
who2
who3 <- who2 %>%
separate(key, c("new", "type", "sexage"), sep = "_")
who3 %>%
count(new)
who4 <- who3 %>%
select(-new, -iso2, -iso3)
who4
who5 <- who4 %>%
separate(sexage, c("sex", "age"), sep = 1)
who5
tidyr::who %>%
gather(key, value, new_sp_m014:newrel_f65, na.rm = T) %>%
mutate(key = str_replace(key, "newrel", "new_rel")) %>%
separate(key, c("new", "var", "sexage")) %>%
select(-new, -iso2, -iso3) %>%
separate(sexage, c("sex", "age"), sep = 1)
# INPUT / OUTPUT ----------------------------------------------------------
library(tidyverse)
ggplot(diamonds, aes(carat, price)) +
geom_hex()
ggsave("diamonds.pdf")
plot1 <- ggplot(diamonds, aes(clarity, price)) +
geom_boxplot()
ggsave("diamonds.png", plot = plot1)
write_csv(diamonds, "diamonds.csv")
read_csv("diamonds.csv")
read_csv(
"a, b, c
1, 2, 3
4, 5, 6"
)
read_csv("The first line of metadata
The second line of metadata
x,y,z
1,2,3", skip = 2)
read_csv("# A comment I want to skip
x,y,z
1,2,3", comment = "#")
read_csv("1,2,3\n4,5,6", col_names = FALSE)
read_csv("1,2,3\n4,5,6")
read_csv("a,b,c\n1,2,.", na = ".")
write_rds(iris, "iris.rds")
read_rds("iris.rds")
# save(file, file = "name.RData)
# load("name.Rdata")
# STRINGR & REGULAR EXPRESSION --------------------------------------------
#strings
library(tidyverse)
string1 <- "this is a string"
string2 <- 'if i want to include a "quote" inside a string, i use single quotes'
x <- c("\"", "\\")
x
writeLines(x)
str_length(c("a", "r for data science", NA))
str_c("x", "y", sep = ", ")
x <- c("abc", NA)
x
str_c("|-", x, "-|")
str_c("|-", str_replace_na(x), "-|")
str_c("prefix-", c("a", "b", "c"), "-suffix")
name <- "Hadley"
time_of_day <- "morning"
birthday <- F
str_c("good ", time_of_day, " ", name, if (birthday) " and happy birthday",
".")
str_c(c("x", "y", "z"), collapse = ", ")
#subsetting strings
x <- c("Apple", "Banana", "Pear")
str_sub(x, 1, 3)
str_sub(x, -3, -1)
str_sub("a", 1, 5)
str_sub(x, -3, -1) <- str_to_upper(str_sub(x, -3, -1))
x
#regular expression
x <- c("apple", "banana", "pear")
str_view(x, "an")
str_view(x, ".a.")
str_view(c("abc", "a.c", "bef"), "a\\.c")
x <- "a\\b"
writeLines(x)
str_view(x, "\\\\")
x <- c("apple", "banana", "pear")
str_view(x, "^a")
str_view(x, "a$")
x <- c("apple pie", "apple", "apple cake")
str_view(x, "apple")
str_view(x, "^apple$")
#For example, I’ll search for \bsum\b to
#avoid matching summarise, summary, rowsum and so on.
# \\d: matches any digit.
# \\s: matches any whitespace (e.g. space, tab, newline).
# [abc]: matches a, b, or c.
# [^abc]: matches anything except a, b, or c.
str_view(c("abc", "a.c", "a*c", "a c"), "a[.]c")
str_view(c("abc", "a.c", "a*c", "a c"), "a[ ]c")
str_view(c("abc", "a.c", "a*c", "a c"), "a[b ]c")
# This works for most (but not all)
# regex metacharacters: $ . | ? * + ( ) [ {.
# Unfortunately, a few characters have special meaning
# even inside a character class and
# must be handled with backslash escapes: ] \ ^ and -
str_view(c("grey", "gray"), "gr(e|a)y")
str_view(c("grey", "gray"), "gr[ea]y")
str_view(c("grey", "gray", "ray", "r"), ".*r.*")
# ?: 0 or 1
# +: 1 or more
# *: 0 or more
# precedence of these operators is high
x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
str_view(x, "CC?")
str_view(x, "CC+")
str_view(x, "C[LX]+")
str_view(x, "C{2}")
str_view(x, "C{2,}")
str_view(x, "C{2,3}")
str_view(x, "C{2,3}?")
str_view(x, "C[LX]+?")
str_view(fruit, "(..)\\1", match = T)
x <- c("apple", "banana", "pear")
str_detect(x, "e")
sum(str_detect(words, "^t"))
mean(str_detect(words, "[aeiou]$"))
no_vowels_1 <- !str_detect(words, "[aeiou]")
no_vowels_2 <- str_detect(words, "^[^aeiou]+$")
identical(no_vowels_1, no_vowels_2)
words[str_detect(words, "x$")]
str_subset(words, "x$")
df <- tibble(
word = words,
i = seq_along(word)
)
df %>%
filter(str_detect(word, "x$"))
x <- c("apple", "banana", "pear")
str_count(x, "a")
mean(str_count(x, "a"))
df %>%
mutate(
vowels = str_count(word, "[aeiou]"),
consonants = str_count(word, "[^aeiou]")
)
str_count("abababa", "aba")
str_view_all("abababa", "aba")
length(sentences)
head(sentences)
colors <- c("red", "orange", "yellow", "green", "blue", "purple")
color_match <- str_c(colors, collapse = "|")
color_match
has_color <- str_subset(sentences, color_match)
matchs <- str_extract(has_color, color_match)
more <- sentences[str_count(sentences, color_match) > 1]
more
str_extract(more, color_match)
str_extract_all(more, color_match, simplify = T)
str_extract_all(more, color_match, simplify = F)
x <- c("a", "a b", "abc")
str_extract_all(x, "[a-z]", simplify = T)
noun <- "(a|the) ([^ ]+)"
has_noun <- sentences %>%
str_subset(noun) %>%
head(10)
has_noun %>%
str_extract(noun)
has_noun %>%
str_match(noun)
# FACTORS -----------------------------------------------------------------
library(forcats)
x1 <- c("Dec", "Apr", "Jan", "Mar")
sort(x1)
month_levels <- c(
"Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
)
y1 <- factor(x1, levels = month_levels)
sort(y1)
y1
x2 <- c("Dec", "Apr", "Jam", "Mar")
y2 <- factor(x2, levels = month_levels) # elements not in levels will be replaced with NA
y2
sort(y2)
levels(y2)
factor(x1)
factor(x1, levels = unique(x1)) # waht is this mean?
gss_cat %>%
count(race)
gss_cat %>%
ggplot(aes(race)) +
geom_bar()
gss_cat %>%
ggplot(aes(race)) +
geom_bar() +
scale_x_discrete(drop = F)
relig_summary <- gss_cat %>%
group_by(relig) %>%
summarise(
age = mean(age, na.rm = T),
tvhours = mean(tvhours, na.rm = T),
n = n()
)
ggplot(relig_summary, aes(tvhours, relig)) +
geom_point()
ggplot(relig_summary, aes(tvhours, fct_reorder(relig, tvhours))) +
geom_point()
relig_summary %>%
mutate(relig = fct_reorder(relig, tvhours)) # what is this mean?
rincome_summary <- gss_cat %>%
group_by(rincome) %>%
summarise(
age = mean(age, na.rm = T),
tvhours = mean(tvhours, ra.rm = T),
n = n()
)
ggplot(rincome_summary, aes(age, rincome)) + geom_point()
ggplot(rincome_summary, aes(age, fct_relevel(rincome, "Not applicable"))) + # what is this mean?
geom_point()
# FUNCTION ----------------------------------------------------------------
library(tidyverse)
library(magrittr)
df <- tibble(
a = rnorm(10),
b = rnorm(10),
c = rnorm(10),
d = rnorm(10)
)
x <- df$a
(x - min(x, no.rm = T)) / (max(x, na.rm = T) - min(x, na.rm = T))
rng <- range(x, na.rm = T)
(x - rng[1]) / (rng[2] - rng[1])
rescale01 <- function(x) { # rescale a vector, use for-loop if you want rescale a dataframe
rng <- range(x, na.rm = T)
(x - rng[1]) / (rng[2] - rng[1])
}
rescale01(c(0, 5, 10))
rescale01(c(-10, 0, 10))
rescale01(c(1, 2, 3, NA, 5))
has_name <- function(x) {
nms <- names(x)
if (is.null(nms)) {
rep(FALSE, length(x))
} else {
!is.na(nms) & nms != ""
}
}
# if(length(x) == 0 || lenght(y) == 0), note that "OR AND" in *condition*
# shou be written as "|| &&"
f <- function(x, y, op) {
switch(op,
plus = x + y,
minus = x - y,
times = x * y,
dvide = x / y,
stop("Unknown op")
)
}
f(1, 2, "plus")
f(2, 2, "minus")
y <- 10
x <- if (y < 20) "Too low" else "Too high"
x
mean_ci <- function(x, conf = .95) {
se <- sd(x) / sqrt(length(x))
alpha = 1 - conf
mean(x) + se * qnorm(c(alpha / 2, 1 - alpha / 2))
}
x <- runif(100)
mean_ci(x)
mean_ci(x, .99)
wt_mean <- function(x, w) {
if (length(x) != length(w)) {
stop("'x' and 'w' must be the same length", call. = F) # call. = F what mean?
}
sum(w * x) / sum(x)
}
wt_mean <- function(x, w, na.rm = F) { # how robust the function you want?
if (!is.logical(na.rm)) {
stop("'na.rm' must be logical")
}
if (length(na.rm) != 1) {
stop("'na.rm' must be length 1")
}
if(length(x) != length(w)) {
stop("'x' and 'w' must be the same length", call. = F)
}
if(na.rm) { # impressive if (condition) {} nothing happened if (condition = F)
miss <- is.na(x) | is.na(w)
w <- w[!miss]
x <- x[!miss]
}
sum(x * w) / sum(w)
}
# a shorthand of preview one, but no message to show if there are errors
wt_mean <- function(x, w, na.rm = F) {
stopifnot(is.logical(na.rm), length(na.rm) == 1)
stopifnot(length(x) == length(w)) # stop if not (TURE)
if(na.rm) {
miss <- is.na(x) | is.na(w)
w <- w[!miss]
x <- x[!miss]
}
sum(x * w) / sum(w)
}
commas <- function(...) stringr::str_c(..., collapse = ", ")
commas(letters[1:10])
rule <- function(..., pad = "-") {
title <- paste0(...)
width = getOption("width") - nchar(title) - 5
cat(title, " ", stringr::str_dup(pad, width), "\n", sep = "")
}
x <- c(1, 2)
sum(x, na.mr = T)
sum(x, na.mr = F) # be careful, some function treat T as 1 and F as 0
library(tidyverse)
typeof(letters)
typeof(1:10)
typeof(1)
typeof(1L)
x <- sqrt(2) ^ 2
x
x - 2
x == 2
c(-1, 0, 1) / 0
x <- sample(20, 100, replace = T)
sum(x > 10)
mean(x > 10)
x <- c(10, 3, NA, 5, 8, 1, NA)
x[!is.na(x)]
x[x %% 2 == 0]
x[x %% 2 == 0 & !is.na(x)]
y <- c(x = 1, y = 2, z = 3)
purrr::set_names(1:3, c("a", "b", "c"))
y["x"]
y[c("x", "y")]
x <- list(1, 2, 3)
x
str(x)
x_named <- list(a = 1, b = 2, c = 3)
str(x_named)
y <- list("a", 1L, 1.5, T)
str(y)
z <- list(list(1, 2), list(3, 4))
str(z)
a <- list(a = 1:3, b = "a string", c = pi, d = list(-1, -5))
str(a)
str(a[1:2])
str(a[[1]])
str(a[[4]])
a[[4]][1]
a[[4]][[1]]
df <- tibble(
a = rnorm(10),
b = rnorm(10),
c = rnorm(10),
d = rnorm(10)
)
output <- vector("double", ncol(df))
for (i in seq_along(df)) {
output[[i]] <- median(df[[i]])
}
output
# 1.The output: output <- vector("double", length(x)). Before you start the loop, you must always allocate sufficient space for the output. This is very important for efficiency: if you grow the for loop at each iteration using c() (for example), your for loop will be very slow.
#
# A general way of creating an empty vector of given length is the vector() function. It has two arguments: the type of the vector (“logical”, “integer”, “double”, “character”, etc) and the length of the vector.
#
# 2.The sequence: i in seq_along(df). This determines what to loop over: each run of the for loop will assign i to a different value from seq_along(df). It’s useful to think of i as a pronoun, like “it”.
#
# You might not have seen seq_along() before. It’s a safe version of the familiar 1:length(l), with an important difference: if you have a zero-length vector, seq_along() does the right thing:
#
# y <- vector("double", 0)
# seq_along(y)
# #> integer(0)
# 1:length(y)
# #> [1] 1 0
#
# You probably won’t create a zero-length vector deliberately, but it’s easy to create them accidentally. If you use 1:length(x) instead of seq_along(x), you’re likely to get a confusing error message.
#
# 3.The body: output[[i]] <- median(df[[i]]). This is the code that does the work. It’s run repeatedly, each time with a different value for i. The first iteration will run output[[1]] <- median(df[[1]]), the second will run output[[2]] <- median(df[[2]]), and so on.
rescale01 <- function(x) {
rng <- range(x, na.rm = T)
(x - rng[1]) / (rng[2] - rng[1])
}
df$a <- rescale01(df$a)
df$b <- rescale01(df$b)
df$c <- rescale01(df$c)
df$d <- rescale01(df$d)
for (i in seq_along(df)) {
df[[i]] <- rescale01(df[[i]])
}
# for (x in xs)
#
# for (nm in names(xs))
# x[[nm]]
results <- vector("list", length(iris))
names(results) <- names(iris)
# “logical”, “integer”, “double”, “character”
for (i in seq_along(iris)) { # what is mean?
names <- names(iris)[[i]]
value <- iris[[i]]
}
means <- c(0, 1, 2)
output <- double()
for (i in seq_along(means)) {
n <- sample(100, 1)
output <- c(output, rnorm(n, means[[i]]))
}
str(output)
#save time and space
out <- vector("list", length(means))
for (i in seq_along(means)) {
n <- sample(100, 1)
out[[i]] <- rnorm(n, means[[i]])
}
str(out)
str(unlist(out))
#You might be generating a long string. Instead of paste()ing together each iteration with the previous, save the output in a character vector and then combine that vector into a single string with paste(output, collapse = "").
#You might be generating a big data frame. Instead of sequentially rbind()ing in each iteration, save the output in a list, then use dplyr::bind_rows(output) to combine the output into a single data frame.
# for (i in seq_along(x)) {
# # body
# }
#
# # eauivalent to
# i <- 1
# while (i <= length(x)) {
# # body
# i <- i + 1
# }
flip <- function() sample(c("T", "H"), 1)
flips <- 0
nheads <- 0
while (nheads < 3) {
if (flip() == "H") {
nheads = nheads + 1
} else {
nheads <- 0
}
flips <- flips + 1
}
# files <- dir("data/", pattern = "\\.csv$", full.names = T)
# created by Rujiang Liang, it is not perfect
# show_mean <- function(df) {
# vector <- logical()
# for (i in 1:ncol(df)) {
# vector[i] <- is.numeric(df[[i]])
# }
# df_reduced <- df[, vector]
# out_name <- names(df_reduced)
# out_value <- double()
# for (j in 1:ncol(df_reduced)) {
# out_value[j] <- mean(df_reduced[[j]])
# }
# as.matrix(c(out_name, out_value), ncol = 2)
# }
df <- tibble(
a = rnorm(10),
b = rnorm(10),
c = rnorm(10),
d = rnorm(10)
)
output <- vector("double", length(df))
for (i in seq_along(df)) {
output[i] <- mean(df[[i]])
}
output
col_mean <- function(df) {
output <- vector("double", length(df))
for (i in seq_along(df)){
output[i] <- mean(df[[i]])
}
output
}
col_summary <- function(df, fun) {
out <- vector("double", length(df))
for (i in seq_along(df)) {
out[i] <- fun(df[[i]])
}
out
}
col_summary(df, mean)
col_summary(df, sd)
map_dbl(df, mean)
map_dbl(df, median)
map_dbl(df, sd)
map_dbl(iris[-5], function(x) length(unique(x)))
map_dbl(iris[-5], ~length(unique(.)))
# map() makes a list.
# map_lgl() makes a logical vector.
# map_int() makes an integer vector.
# map_dbl() makes a double vector.
# map_chr() makes a character vector.
df %>% map_dbl(mean)
map_dbl(df, mean, trim = .5)
z <- list(x = 1:3, y = 4:5)
map_int(z, length)
models <- mtcars %>%
split(.$cyl) %>%
map(function(df) lm(mpg ~ wt, data = df))
models <- mtcars %>%
split(.$cyl) %>%
map(~lm(mpg ~ wt, data = .))
models %>%
map(summary) %>%
map_dbl(~.$r.squared)
models %>%
map(summary) %>%
map(~.$coefficients)
models %>%
map(summary) %>%
map(~.$coefficients[2, 1])
models %>%
map(summary) %>%
map(~.$coefficients["wt", ])
models %>%
map(summary) %>%
map_dbl("r.squared")
models %>%
map(summary) %>%
map("coefficients")
mtcars %>%
split(.$cyl) %>%
map(~lm(mpg ~ wt, .)) %>%
map(summary) %>%
map("coefficients") %>%
map(~.["wt", c("Estimate", "t value")]) %>%
reduce(rbind)
x <- list(list(1, 2, 3), list(4, 5, 6), list(7, 8, 9))
x %>% map_dbl(2)
threshold <- function(x, cutoff = .8) x[x > cutoff]
safe_log <- safely(log)
str(safe_log(10))
str(safe_log("a"))
x <- list(1, 10, "a")
y <- x %>% map(safely(log))
str(y)
y <- y %>% transpose()
str(y)
is_ok <- y$error %>% map_lgl(is_null)
x[!is_ok]
y$result[is_ok] %>% flatten_dbl()
x %>% map_dbl(possibly(log, NA))
x <- list(1, -1)
x %>% map(quietly(log)) %>% str()
mu <- tibble(`5` = 5, `10` = 10, `-3` = -3)
mu %>%
map_df(rnorm, n = 5)
mu <- list(`5` = 5, `10` = 10, `-3` = -3)
mu %>%
map(rnorm, n = 5)
mu %>%
map_df(rnorm, n = 5)
sigma <- list(1, 5, 10)
seq_along(mu) %>%
map(~rnorm(5, mu[[.]], sigma[[.]])) %>%
str()
map2(mu, sigma, rnorm, n = 5) %>% str()
n <- list(1, 3, 5)
arg1 <- list(n, mu, sigma)
arg1 %>%
pmap(rnorm) %>%
str()
# better to name the arguments
arg2 <- list(mean = mu, sd = sigma, n = n)
arg2 %>%
pmap(rnorm) %>%
str()
# a data.frame ensures name and some lenght of each column
params <- tibble(
mean = c(5, 10, -3),
sd = c(1, 5, 10),
n = c(1, 3, 5)
)
params %>%
pmap(rnorm)
sim <- tribble(
~f, ~params,
"runif", list(min = -1, max = 1),
"rnorm", list(sd = 5),
"rpois", list(lambda = 10)
)
sim %>%
mutate(sim = invoke_map(f, params, n = 10))
plots <- mtcars %>%
# group_by(cyl) %>% do not work
split(.$cyl) %>%
map(~ggplot(., aes(mpg, wt)) + geom_point())
paths <- str_c(names(plots), ".pdf")
pwalk(list(paths, plots), ggsave)
iris %>%
keep(is.factor) %>%
str()
iris %>%
keep(is.numeric) %>%
str()
iris %>%
discard(is.factor) %>%
str()
x <- list(1:5, letters, list(10))
x %>%
some(is_character)
x %>%
every(is_vector)
#detect() finds the first element where the predicate is true;
#detect_index() returns its position.
x <- sample(10)
x %>%
detect(~ . > 5)
x %>%
detect_index(~ . > 5)
#head_while() and tail_while()
#take elements from the start or end of a vector
#while a predicate is true
x %>%
head_while(~ . > 5)
x %>%
tail_while(~ . > 5)
dfs <- list(
age = tibble(name = "John", age = 30),
sex = tibble(name = c("John", "Marry"), sex = c("m", "f")),
trt = tibble(name = "Marry", treatment = "a")
)
dfs %>% reduce(full_join)
dfs <- list(
c(1, 3, 5, 6, 10),
c(1, 2, 3, 7, 8, 10),
c(1, 2, 3, 4, 8, 9, 10)
)
dfs %>%
reduce(intersect)
#Accumulate keeps all the interim results.
x <- sample(10)
x %>%
accumulate(`+`)
# MODEL -------------------------------------------------------------------
library(tidyverse)
library(modelr)
options(na.action = na.warn)
ggplot(sim1, aes(x, y)) +
geom_point()
models <- tibble(
a1 = runif(250, -20, 40),
a2 = runif(250, -5, 5)
)
ggplot(sim1, aes(x, y)) +
geom_abline(aes(intercept = a1, slope = a2),
data = models, alpha = 1/4) +
geom_point()
model1 <- function(a, data) {
a[1] + data$x * a[2]
}
model1(c(7, 1.5), sim1)
measure_distance <- function(mod, data) {
diff <- data$y - model1(mod, data)
sqrt(mean(diff ^ 2))
}
measure_distance(c(7, 1.5), sim1)
sim1_dist <- function(a1, a2) {
measure_distance(c(a1, a2), sim1)
}
models <- models %>%
mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))
models
ggplot(sim1, aes(x, y)) +
geom_point(size = 2, color = "grey30") +
geom_abline(
aes(intercept = a1, slope = a2, color = -dist),
data = filter(models, rank(dist) <= 10)
)
ggplot(models, aes(a1, a2)) +
geom_point(data = filter(models, rank(dist) <= 10),
size = 4, color = "red") +
geom_point(aes(color = -dist))
grid <- expand.grid(
a1 = seq(-5, 20, length = 25),
a2 = seq(1, 3, length = 25)
) %>%
mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))
grid %>%
ggplot(aes(a1, a2)) +
geom_point(data = filter(grid, rank(dist) <= 10),
size = 4, color = "red") +
geom_point(aes(color = -dist))
ggplot(sim1, aes(x, y)) +
geom_point(size = 2, color = "grey30") +
geom_abline(
aes(intercept = a1, slope = a2, color = -dist),
data = filter(grid, rank(dist) <= 10)
)
best <- optim(c(0, 0), measure_distance, data = sim1)
best$par
ggplot(sim1, aes(x, y)) +
geom_point(size = 2, color = "grey30") +
geom_abline(intercept = best$par[1], slope = best$par[2])
sim1_mod <- lm(y ~ x, data = sim1)
coef(sim1_mod)
ggplot(sim1, aes(x, y)) +
geom_point(size = 2, color = "grey30") +
geom_abline(intercept = best$par[1], slope = best$par[2],
size = 3, alpha = 1 / 4) +
geom_abline(intercept = coef(sim1_mod)[1],
slope = coef(sim1_mod)[2],
color = "red")
# bookmark ----------------------------------------------------------------
grid <- sim1 %>%
data_grid(x)
grid
grid <- grid %>%
add_predictions(sim1_mod)
grid
ggplot(sim1, aes(x)) +
geom_point(aes(y = y)) +
geom_line(aes(y = pred), data = grid, color = "red", size = 1)
sim1 <- sim1 %>%
add_residuals(sim1_mod)
sim1
ggplot(sim1, aes(resid)) +
geom_freqpoly(binwidth = .5)
ggplot(sim1, aes(x, resid)) +
geom_ref_line(h = 0) +
geom_point()
df <- tibble(
y = c(4, 5),
x1 = c(2, 1),
x2 = c(5, 6)
)
model_matrix(df, y ~ x1)
ggplot(sim2) +
geom_point(aes(x, y))
mod2 <- lm(y ~ x, data = sim2)
grid <- sim2 %>%
data_grid(x) %>%
add_predictions(mod2)
grid
ggplot(sim2, aes(x)) +
geom_point(aes(y = y)) +
geom_point(data = grid, aes(y = pred), color = "red", size = 4)
ggplot(sim3, aes(x1, y)) +
geom_point(aes(color = x2))
mod1 <- lm(y ~ x1 + x2, data = sim3)
mod2 <- lm(y ~ x1 * x2, data = sim3)
grid <- sim3 %>%
data_grid(x1, x2) %>%
gather_predictions(mod1, mod2)
grid
ggplot(sim3, aes(x1, y, color = x2)) +
geom_point() +
geom_line(data = grid, aes(y = pred)) +
facet_wrap(~ model)
sim3 <- sim3 %>%
gather_residuals(mod1, mod2)
ggplot(sim3, aes(x1, resid, color = x2)) +
geom_point() +
facet_grid(model ~ x2)
mod1 <- lm(y ~ x1 + x2, data = sim4)
mod2 <- lm(y ~ x1 * x2, data = sim4)
grid <- sim4 %>%
data_grid(
x1 = seq_range(x1, 5),
x2 = seq_range(x2, 5)
) %>%
gather_predictions(mod1, mod2)
grid
ggplot(grid, aes(x1, x2)) +
geom_tile(aes(fill = pred)) +
facet_wrap(~ model)
ggplot(grid, aes(x1, pred, color = x2, group = x2)) +
geom_line() +
facet_wrap(~ model)
ggplot(grid, aes(x2, pred, color = x1, group = x1)) +
geom_line() +
facet_wrap(~ model)
library(splines)
sim5 <- tibble(
x = seq(0, 3.5 * pi, length = 50),
y = 4 * sin(x) + rnorm(length(x))
)
ggplot(sim5, aes(x, y)) +
geom_point()
mod1 <- lm(y ~ ns(x, 1), data = sim5)
mod2 <- lm(y ~ ns(x, 2), data = sim5)
mod3 <- lm(y ~ ns(x, 3), data = sim5)
mod4 <- lm(y ~ ns(x, 4), data = sim5)
mod5 <- lm(y ~ ns(x, 5), data = sim5)
mod6 <- lm(y ~ ns(x, 6), data = sim5)
mod7 <- lm(y ~ ns(x, 7), data = sim5)
grid <- sim5 %>%
data_grid(x = seq_range(x, n = 50, expand = .1)) %>%
gather_predictions(mod1, mod2, mod3, mod4, mod5, mod6, mod7, .pred = "y")
ggplot(sim5, aes(x, y)) +
geom_point() +
geom_line(data = grid, color = "red") +
facet_wrap(~ model)
nobs(mod1)
library(tidyverse)
library(modelr)
options(na.action = na.warn)
library(nycflights13)
library(lubridate)
ggplot(diamonds, aes(cut, price)) + geom_boxplot()
ggplot(diamonds, aes(color, price)) + geom_boxplot()
ggplot(diamonds, aes(clarity, price)) + geom_boxplot()
ggplot(diamonds, aes(carat, price)) + geom_hex(bins = 100)
diamonds2 <- diamonds %>%
filter(carat <= 2.5) %>%
mutate(lprice = log2(price), lcarat = log2(carat))
ggplot(diamonds2, aes(lcarat, lprice)) +
geom_hex(bins = 50)
mod_diamond <- lm(lprice ~ lcarat, data = diamonds2)
grid <- diamonds2 %>%
data_grid(carat = seq_range(carat, 20)) %>%
mutate(lcarat = log2(carat)) %>%
add_predictions(mod_diamond, "lprice") %>%
mutate(price = 2 ^ lprice)
grid
ggplot(diamonds2, aes(carat, price)) +
geom_point(alpha = .1) +
geom_line(data = grid, color = "red", size = 1)
diamonds2 <- diamonds2 %>%
add_residuals(mod_diamond, "lresid")
ggplot(diamonds2, aes(lcarat, lresid)) +
geom_hex(bins = 50)
ggplot(diamonds2, aes(cut, lresid)) + geom_boxplot()
ggplot(diamonds2, aes(color, lresid)) + geom_boxplot()
ggplot(diamonds2, aes(clarity, lresid)) + geom_boxplot()
mod_diamond2 <- lm(lprice ~ lcarat + color + cut + clarity, data = diamonds2)
grid <- diamonds2 %>%
data_grid(cut, .model = mod_diamond2) %>%
add_predictions(mod_diamond2)
grid
ggplot(grid, aes(cut, pred)) +
geom_point()
diamonds2 <- diamonds2 %>%
add_residuals(mod_diamond2, "lresid2")
ggplot(diamonds2, aes(lcarat, lresid2)) +
geom_hex(bins = 50)
diamonds2 %>%
filter(abs(lresid2) > 1) %>%
add_predictions(mod_diamond2) %>%
mutate(pred = round(2 ^ pred)) %>%
select(price, pred, carat:table, x:z) %>%
arrange(price)
daily <- flights %>%
mutate(date = make_date(year, month, day)) %>%
group_by(date) %>%
summarise(n = n())
daily
ggplot(daily, aes(date, n)) +
geom_line()
daily <- daily %>%
mutate(wday = wday(date, label = T))
ggplot(daily, aes(wday, n)) +
geom_boxplot()
mod <- lm(n ~ wday, data = daily)
grid <- daily %>%
data_grid(wday) %>%
add_predictions(mod, "n")
ggplot(daily, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, color = "red", size = 4)
daily <- daily %>%
add_residuals(mod)
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0) +
geom_line()
ggplot(daily, aes(date, resid, color = wday)) +
geom_ref_line(h = 0) +
geom_line()
daily %>%
filter(resid < -100)
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0) +
geom_line(color = "grey50") +
geom_smooth(se = F, span = .3)
daily %>%
filter(wday == "周六") %>%
ggplot(aes(date, n)) +
geom_point() +
geom_line() +
scale_x_date(NULL, date_breaks = "1 month", date_labels = "%b") +
theme_bw()
term <- function(date) {
cut(date,
breaks = ymd(20130101, 20130605, 20130825, 20140101),
labels = c("spring", "summer", "fall"))
}
daily <- daily %>%
mutate(term = term(date))
daily %>%
filter(wday == "周六") %>%
ggplot(aes(date, n, color = term)) +
geom_point(alpha = 1/3) +
geom_line() +
scale_x_date(NULL, date_breaks = "1 month", date_labels = "%b")
daily %>%
ggplot(aes(wday, n, color = term)) +
geom_boxplot()
mod1 <- lm(n ~ wday, data = daily)
mod2 <- lm(n ~ wday * term, data = daily)
daily %>%
gather_residuals(without_term = mod1, with_term = mod2) %>%
ggplot(aes(date, resid, color = model)) +
geom_line(alpha = .75)
grid <- daily %>%
data_grid(wday, term) %>%
add_predictions(mod2, "n")
ggplot(daily, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, color = "red") +
facet_wrap(~ term)
mod3 <- MASS::rlm(n ~ wday * term, data = daily)
daily %>%
add_residuals(mod3, "resid") %>%
ggplot(aes(date, resid)) +
geom_hline(yintercept = 0, color = "grey75") +
geom_line()
#chapter 25
#chapter 27
# R BASE ------------------------------------------------------------------
test <- 1:3
test[1] #ok
test[[1]] #ok
# object[index] "index" can be a vector of numeric
# or a vector of character if nameed
df <- data.frame(a = 1:5, b = 1:5, c = 1:5)
df
df$c(a, b) # not appliable
df$c("a", "b") # not appliable
df[c(a,b)] # not applyable
df[c("a","b")] # ok
my_index <- "a"
df[my_index]
df[!names(df) %in% c("a", "b")] # ok
x <- c("one", "two", "three")
x[c(1, -1)] # it does not work, do not mix negative and positive sub_number
x[rep(1, 10)]
x[0]
x <- c(10, 3, NA, 5, 8, 1, NA)
x[x %% 2 == 0]
x[x %% 2 == 1]
attach(mtcars)
summary(mpg)
plot(mpg, disp)
plot(mpg, wt)
detach(mtcars)
data$gender <- factor(data$gender,
levels = c(1, 2),
labels = c("male", "female"))
data$age[data$age == 99] <- NA
data <- within(data, {
agecat <- NA
agecat[age > 75] <- "Elder"
agecat[age <= 75] <- "Young"
})
data <- transform(data,
sumx = x1 + x2,
meanx = (x1 + x2) / 2)
library(car)
x <- 10:100
df <- data.frame(x = x)
df
df$y <- recode(df$x, "lo:60 = 'C'; 61:80 = 'B'; 81:100 = 'A'")
df$z <- recode(df$x, "lo:60 = 'C'; 60:80 = 'B'; 81:hi='A'; else='NULL'")
df
help(recode)
leadership <- iris
names(leadership)[2] <- "something_you_like"
names(leadership)
names(leadership)[2:4] <- c("something", "you", "like")
names(leadership)
names(leadership)[[2]] <- "something_you_like"
names(leadership)
order(4:6)
(3:1)[order(3:1)]
leadership[order(leadership$Sepal.Length),]
leadership[order(
c(leadership$Sepal.Length, leadership$something_you_like)
),]
# is.na() can be applied to a dataframe, this is awesome
# is.infinite() in.nan()
# na.omit() drops all the obs(rows) that contain NA
# something == NA do not work
# any(c(T, T, F)) all(c(T, T, F))
sum(c(1, 2, NA))
# cbind(a, b) is a relatively safe function,
# to make it work, dataframe a and b must have same variable,
# but do not have to be arranged to same order
# subset
df[6:10, ]
myvars <- c("q1", "q2", "q3", "q4", "q5")
df[myvars]
myvars <- paste("q", 1:5, sep = "")
myvars
df[myvars]
myvars <- names(df) %in% c("q3", "q4")
df[!myvars]
head(df)
df$Sepal.Length <- NULL
df$Sepal.Width <- df$Petal.Length <- NULL # interesting
leadership[leadership$male == 1, ]
df[1, "Species"]
df[1, 2]
subset(leadership, age >= 35 | age < 24,
select = c(q1, q2, q3, q4, q5)) # select = q1:q5
sample(1:10, 3, replace = F)
mean(x, trim = .05, na.rm = T) # drop 10% obs and na
##strings
x <- c("ab", "cde", "fghij")
nchar(x)
length(x)
substr(x, 1, 1)
substr(x, 1, 1) <- "something"
x
substr(x, 1, 2) <- "something"
x
toupper(c("aaa", "bb"))
tolower(c("someTHING", "hErE"))
seq(1, 5, 2)
x <- 1:10
cut(x, 3)
cut(x, 3, ordered_result = T)
cut(x, c(0, 3, 6, 9, 10),
labels = c("not_so_good", "good", "great", "awesome"),
ordered_result = T)
pretty(x, 3)
c <- matrix(runif(12), nrow = 3)
c
mean(c)
apply(c, 1, mean)
apply(c, 2, mean)
apply(c, 2, mean, trim = .2)
apply(iris, 2, mean) # why does not work?
apply(iris[-5], 2, mean) # it works
apply(iris[-5], 1, mean) # it works
## function
mystats <- function(x, parametric = T, print = F) {
if (parametric) {
center = mean(x)
spread = sd(x)
} else {
center = median(x)
spread = mad(x)
}
if (print & parametric) {
cat("Mean =", center, "\n", "SD =", spread, "\n")
} else if (print & !parametric){
cat("Median =", center, "\n", "SD =", spread, "\n")
}
results <- list(center = center, spread = spread)
return(results)
}
set.seed(1234)
x <- rnorm(500)
mystats(x)
y <- mystats(x, print = T)
x <- 1:10
x
names(x) <- c(letters[1:10])
x
attributes(x)
attr(x, "dim") <- c(2, 5)
x
attributes(x)
row.names(x) <- c("a", "b")
attributes(x)
x
attr(x, "dimnames")[[2]] <- c(LETTERS[1:5])
x
attr(x, "names") <- NULL
x
attributes(x)
attr(x, "test") <- "this is a test"
x
attr(x, "dim") <- NULL
x
attributes(iris)
iris <- unclass(iris)
str(iris)
attributes(iris)
data(iris)
set.seed(1234)
fit <- kmeans(iris[1:4], 3)
fit
str(fit)
names(fit)
summary(fit)
unclass(fit)
fit[c(2, 7)]
fit[2]
fit[[2]]
fit[["centers"]]
fit$"centers"
fit$centers
fit[[2]][1, 1:3]
fit[2][1][1][[1]][1, ]
interactive()
args(mean)
#output to file
sink("test.txt", append = T)
sink()
cat("hello")
cat("hello", file = "mytxt.txt", append = T)
#work with dirs
getwd()
list.dirs()
list.files()
dir()
dir(path = "./check")
file.info("./check.readme.txt")
file.info("communication.html")
file.exists("./check/readme.txt")
file.exists("test.txt")
file_test("-f", "check")
file_test("-d", "check")
dir.create("./something")
file.exists("something")
file.create("test.txt")
file.info("test.txt")
file.rename("test.txt", "something.txt")
file.info("something.txt")
file.remove("something.txt")
file.exists(c("something.txt", "test.txt"))
system("tree")
file.rename("something", "something2")
file.copy("diamonds.csv", "./something2/renamed.html")
unlink("something2")
readLines("./check/readme.txt", encoding = "UTF-8")
readLines("temp.R")
# SOME FUNCTION BY ME -----------------------------------------------------
my_sample <- function(x, size = 30, repeats = 100) {
out <- vector("double", length = repeats)
for (i in seq_len(repeats)) {
out[[i]] <- mean(sample(x, size))
}
out
}
sizes <- c(`20` = 20, `30` = 30, `50` = 50, `100` = 100)
sizes %>%
lapply(my_sample, x = 1:999, repeats = 100) %>%
as.tibble() %>%
gather() %>%
qplot(x = value, data = .,
facets = factor(key, levels = sizes) ~ .)
library(tidyverse)
out %>%
as.tibble() %>%
gather() %>%
ggplot(aes(value)) +
geom_histogram(aes(y = ..density..), fill = "blue", alpha = .5) +
geom_density(color = "red", fill = "pink", alpha = .3) +
facet_grid(factor(key, levels = c("size_20", "size_30", "size_50", "size_100"))~.) +
geom_vline(aes(xintercept = 500), color = "orange", size = 1.2) +
theme_bw()
map_dbl(out, mean)
col_num <- function(df) {
index <- vector("logical", ncol(df))
for (i in seq_along(df)) {
index[[i]] <- is.numeric(df[[i]])
}
df[index]
}
col_mean <- function(df) {
results <- vector("double", ncol(df))
names(results) <- names(df)
for (i in seq_along(df)) {
results[[i]] <- mean(df[[i]])
}
results
}
show_vec_mean <- function(x) {
for (i in seq_along(x)) {
cat(names(x)[[i]], ":", "\t", round(x[[i]], 3), "\n")
}
}
show_mean <- function(df) {
show_vec_mean(col_mean(col_num(df)))
}
data(iris)
show_mean(iris)
data(diamonds)
show_mean(diamonds)
chk_outlier <- function(df, min = T, number = 10) {
out <- vector("list", length = length(df))
names(out) <- names(df)
if (min) {
for (i in seq_along(df)) {
out[[i]] <- head(sort(df[[i]]), number)
}
} else {
for (i in seq_along(df)) {
out[[i]] <- rev(tail(sort(df[[i]]), number))
}
}
out
}
num <- col_num(iris)
chk_outlier(num, min = F, number = 4)
library(tidyverse)
split_chr <- function(df, sep = "#") {
vec <- df[[1]]
strlist <- str_split(vec, sep)
unique_chr <- unique(flatten_chr(strlist))
col_name <- unique_chr[order(as.numeric(unique_chr))]
list <- vector("list", length = length(col_name))
names(list) <- col_name
for (i in seq_along(col_name)) {
list[[i]] <- map_lgl(strlist, ~col_name[[i]] %in% .)
}
list
}
list2df <- function(list) {
as.data.frame(do.call(cbind, list))
}
lgl2chr <- function(df, chr = c("1", "")) {
map_df(df, ~ifelse(., chr[1], chr[2]))
}
split_col <- function(df) {
lgl2chr(list2df(split_chr(df)), c("bingo",""))
}
test <- tibble(a = c("1#3#2", "1#4#something", "2#3#@"))
split_col(test)
keep_na <- function(df, start_num, end_num) {
#start_num, end_num for cols that must have their value keeped,
#like ID, name....
right_df <- df[-(start_num:end_num)]
na_rows <- apply(right_df, 1, function(x) {any(is.na(x))})
na_cols <- apply(right_df, 2, function(x) {any(is.na(x))})
left_df <- df[start_num:end_num][na_rows, ]
right_df_na_keeped <- right_df[na_rows, na_cols]
logical <- is.na(right_df_na_keeped)
chr_na_keeped <- ifelse(logical, "missing", "")
cbind(left_df, chr_na_keeped)
}
data(iris)
names(iris) <- c("a", "b", "c", "d", "e")
head(iris)
variable <- c("a", "b", "d")
conds <- list(c(4.5, 7.5),
c(2.3, 4),
c(.2, 2.4))
chk_rng <- function(vec, cond) {
between(vec, cond[1], cond[2])
}
chk_rng(1:10, c(3,7))
map2(iris[variable], conds, chk_rng)
chk_range <- function(df, var, cond) {
temp_list <- map2(df[var], cond, chk_rng)
df_lgl <- list2df(temp_list)
}
head(chk_range(iris, variable, conds))
keep_FALSE <- function(df, id) {
right_df <- df[-(id[1]:id[2])]
na_rows <- apply(right_df, 1, function(x) {any(!x)})
na_cols <- apply(right_df, 2, function(x) {any(!x)})
left_df <- df[id[1]:id[2]][na_rows, ]
right_df_na_keeped <- right_df[na_rows, na_cols]
cbind(left_df, right_df_na_keeped)
}
test_data <- data.frame(a = 1:5,
b = c(T, F, T, T, T),
c = c(T, T, F, T, T))
keep_FALSE(test_data, id = c(1, 1))
chk_range_id <- function(df, var, cond, id) {
right_df <- chk_range(df, var, cond)
full_df_interested <- cbind(df[id[1]:id[2]], right_df)
newid <- c(1, (id[2] - id[1]) + 1)
keep_FALSE(full_df_interested, newid)
}
results <- chk_range_id(iris, variable, conds, c(5,5))
results
lgl2chr(results[-1])
cbind(results[1], lgl2chr(results[-1]))
chk_lgl_vecout <- function(df, vec, cond, skip) {
temp_df <- df[skip]
temp_df <- is.na(temp_df)
logical_value <- df[[vec]] == cond
out <- vector("logical", length(logical_value))
for (i in seq_along(logical_value)) {
if (logical_value[[i]]) {
out[[i]] <- all(unlist(temp_df[i, ]))
} else {
out[[i]] <- all(!unlist(temp_df[i, ]))
}
}
out
}
test_data <- tibble(id = 1:10,
name = c(rep(letters[1:3], 3), "a"),
a1 = c(rep(1:3, each = 3), 1),
a2 = c("A", NA, NA, "A", NA, NA, "A", NA, NA, "A"),
a3 = c(1, 1, NA, 1, 1, NA, 1, 1, NA, 1))
test_data
(sub <- chk_lgl_vecout(test_data, "a1", 3, c("a2", "a3")))
test_data[!sub, ]
test_data1 <- test_data %>%
mutate(b1 = as.character(a1),
b2 = a2,
b3 = rep(1, 10))
test_data1
vec <- list("a1", "b1")
cond <- list(3, "3")
skip <- list(c("a2", "a3"), c("b2", "b3"))
vec; cond; skip
chk_lgl_vecout_with_df <- function(a, b, c) {
chk_lgl_vecout(test_data1, a, b, c)
}
chk_lgl_vecout_with_df("b1", "3", c("b2", "b3"))
pars <- list(vec, cond, skip)
pmap(pars, chk_lgl_vecout_with_df)
chk_lgl_id <- function(df, id, vec, cond, skip) {
pars <- list(vec, cond, skip)
right_df <- pmap(pars, function(a, b, c) chk_lgl_vecout(df, a, b, c))
names(right_df) <- unlist(vec)
left_df <- df[id]
full_df <- cbind(left_df, right_df)
keep_FALSE(full_df, id)
}
test_data1[9, "b3"] <- NA
test_data1[2, "a2"] <- "B"
test_data1[3, "b2"] <- "c"
test_data1
chk_lgl_id(test_data1, 1:2, vec, cond, skip)
chk_lgl_id(test_data1, 1:2, vec, cond, skip) %>%
select(-(1:2)) %>%
lgl2chr(c("", "1"))
test_data <- tibble(a = 1:5,
b = c("1", "2", "3", "4", "5"),
c = c("one", "2", "3", "4#5", "5"))
test_data
map_df(test_data, as.numeric)
chk_illegal_type <- function(df, id) {
right_df <- df[-id]
right_df <- map_df(right_df, as.numeric)
na_rows <- apply(right_df, 1, function(x) {any(is.na(x))})
na_cols <- apply(right_df, 2, function(x) {any(is.na(x))})
left_df <- df[id][na_rows, ]
right_df_na_keeped <- right_df[na_rows, na_cols]
logical <- is.na(right_df_na_keeped)
chr_na_keeped <- ifelse(logical, "wrong_type_or_missing", "")
cbind(left_df, chr_na_keeped)
}
test_data <- test_data %>%
mutate(id = row_number(),
d = lag(c)) %>%
select(id, everything())
test_data
chk_illegal_type(test_data, 1)
# A LITTLE BITTER OF VISUALITION ------------------------------------------
data <- map_dbl(col_num(iris), mean)
#par(mfrow = c(2, 2))
par(mfrow = c(1, 1))
barplot(data)
pie(data)
boxplot(iris$Sepal.Length)
hist(iris$Sepal.Length, col = "gray", cex = 1.5)
plot(iris$Sepal.Length, iris$Sepal.Width, cex = 1.5)
plot(women$height, women$weight, type = "b")
plot(density(iris$Sepal.Length))
boxplot(mtcars$mpg ~ mtcars$cyl)
mtcars <- mtcars[order(mtcars$mpg),]
dotchart(mtcars$mpg, labels = row.names(mtcars), pch = 19, cex = 1.3)
library(rgl)
with(mtcars, {
plot3d(wt, disp, mpg, col = "red", size = 5)
})
library("GGally")
ggpairs(iris[iris$Species %in% c("versicolor", "virginica"), ], aes(color = Species))
+ theme_bw()
ggpairs(mtcars[c("wt", "disp", "mpg")])
# ADVANCED R --------------------------------------------------------------
# SECTION_1 basic ---------------------------------------------------------
a <- matrix(1:4, ncol = 2)
is.vector(a)
is.atomic(a)
is.list(a)
typeof(a)
class(a)
attributes(a)
x <- c(1, 2, 3)
typeof(x)
x <- c(1L, 2L, 3L)
typeof(x)
x
structure(1:20, my_attribute = "this is a vecgtor")
# name a vector
x <- c(a = 1, b = 2, c = 3)
x <- 1:3; names(x) <- c("a", "b", "c")
x <- setNames(1:3, c("a", "b", "c"))
# drop name
unname(x)
names(x) <- NULL
# factor
x <- factor(c("a", "b", "b", "a"))
class(x)
levels(x)
x[2] <- "c"
x
c(factor("a"), factor("b")) # do not work
sex_char <- c("m", "m", "m")
sex_factor <- factor(sex_char, levels = c("m", "f"))
table(sex_factor)
f1 <- factor(letters)
levels(f1) <- rev(levels(f1))
f2 <- rev(factor(letters))
f3 <- factor(letters, levels = rev(letters))
f1; f2; f3
l <- list(1:3, "a", T, 1.0)
dim(l) <- c(2, 2)
l
dim(1:3)
as.data.frame(l)
as.data.frame(list(a = 1:4, b = 2:5))
df <- data.frame(x = 1:3)
df$y <- list(1:2, 1:3, 1:4)
df
data.frame(x = 1:3, y = list(1:2, 1:3, 1:4))
data.frame(x = 1:3, y = I(list(1:2, 1:3, 1:4)))
dfm <- data.frame(x = 1:3, y = I(matrix(1:9, nrow = 3)))
dim(dfm)
names(dfm)
str(dfm)
x <- c(2.1, 4.2, 3.3, 5,4)
x[c(1, 1)]
x[c(2.1, 2.9)]
x[c(-1, 2)]
x[c(T, F)]
x[c(T, F, T, F)] #same as above one
x[c(T, F, NA, F)]
x[]
x[0]
y <- setNames(x, letters[1:5])
y
y[c("a", "a", "a")]
y[c("a", "aa")] #matched precisely
a <- matrix(1:9, nrow = 3)
colnames(a) <- c("A", "B", "C")
a[c(T, F, T), c("B", "B")]
a[0, -2]
vals <- outer(1:5, 1:5, FUN = "paste", sep = ",")
vals
vals[c(4, 15)]
select <- matrix(ncol = 2, byrow = T, c(
1, 1,
3, 1,
2, 4
))
select
vals[select]
vals[1]
z <- as.data.frame(vals)
z <- setNames(z, letters[1:5]); z
z[, "a"]
z[, "a", drop = F]
a <- list(a = 1, b = 2)
a[[1]]
a[["a"]]
b <- list(a = list(b = list(c = list(d = 1))))
b[[c("a", "b", "c", "d")]] #recursive, very useful
b[["a"]][["b"]][["c"]][["d"]] #same as above
x <- 1:5
x[-1] <- 4:1; x #length(left_hand = right_hand)
x[c(1, 1)] <- 2:3; x # legal
x[c(1, NA)] <- c(1:2) # illegal
x[c(T, F, NA)] <- 1 #legal, NA is treated as FALSE
data(mtcars)
mtcars <- lapply(mtcars, as.integer); mtcars
data(mtcars)
mtcars[] <- lapply(mtcars, as.integer); mtcars
x <- list(a = 1, b = 2)
x[["a"]] <- NULL; str(x)
x <- list(a = 1)
x["b"] <- list(NULL); str(x)
x <- c("m", "f", "u", "f", "f", "m", "u")
lookup <- c(m = "male", f = "female", u = NA)
lookup[x]
unname(lookup[x])
c(m = "known", f = "known", u = "unkame")[x]
grades <- c(1, 2, 2, 3, 1)
info <- data.frame( grade = 3:1,
desc = c("Excellent", "Good", "Poor"),
fail = c(F, F, T) )
id <- match(grades, info$grade)
info[id, ]
rownames(info) <- info$grade
info[as.character(grades), ]
df <- data.frame(x = rep(1:3, each = 2), y = 6:1, z = letters[1:6])
df[sample(nrow(df)), ]
df[sample(nrow(df), 3), ]
df[sample(nrow(df), 6, rep = T), ]
df2 <- df[sample(nrow(df)), 3:1]
df2[order(df2$x), ]
df2[, order(names(df2))]
df <- data.frame(x = c(2, 4, 1), y = c(9, 11, 6), n = c(3, 5, 1))
df
rep(1:nrow(df), df$n)
df[rep(1:nrow(df), df$n), ]
subset(mtcars, gear == 5)
subset(mtcars, gear == 5 & cyl == 4)
x <- sample(10) < 4
which(x)
unwhich <- function(x, n) {
out <- rep_len(F, n)
out[x] <- T
out
}
unwhich(which(x), 10)
(x1 <- 1:10 %% 2 == 0)
(x2 <- which(x1))
(y1 <- 1:10 %% 5 == 0)
(y2 <- which(y1))
x1 & y1
intersect(x2, y2)
x1 | y1
union(x2, y2)
x1 & !y1
setdiff(x2, y2)
xor(x1, y1)
setdiff(union(x2, y2), intersect(x2, y2))
# FUNCTION ----------------------------------------------------------------
# install.packages("pryr")
library("pryr")
library("purrr")
f <- function(x) x^2 # three parts
f
formals(f)
body(f)
environment(f)
sum
formals(sum)
body(sum)
environment(sum)
objs <- mget(ls("package:base"), inherits = T)
funs <- Filter(is.function, objs)
map(map(funs, formals), length)
x <- 1
h <- function() {
y <- 2
i <- function() {
z <- 3
c(x, y, z)
}
i()
}
h()
rm(x, h)
j <- function(x) {
y <- 2
function() {
c(x, y)
}
}
k <- j(1)
k()
k
rm(j, k)
l <- function(x) x + 1
m <- function() {
l <- function(x) x * 2
l(10)
}
m()
rm(l, m)
n <- function(x) x / 2
o <- function() {
n <- 10
n(n)
}
o()
rm(o, n)
j <- function() {
if(!exists("a")) {
a <- 1
} else {
a <- a + 1
}
print(a)
}
j()
f <- function() x
x <- 15
f()
x <- 30
f()
#above function f should be avoid because of outer dependency
f <- function() x + 1
codetools::findGlobals(f)
environment(f) <- emptyenv()
f()
c <- 10
c(c = c)
add <- function(x, y) x + y
sapply(1:10, add, 3)
sapply(1:10, `+`, 3)
sapply(1:10, "+", 3)
x <- list(1:3, 4:9, 10:12)
sapply(x, "[", 2)
sapply(x, function(x) x[2])
library(purrr)
map(x, function(x) x[2])
f <- function(abcdef, bcde1, bcde2) {
list(a = abcdef, b1 = bcde1, b2 = bcde2)
}
str(f(1, 2, 3))
str(f(2, 3, a = 1))
str(f(1, 3, b = 1))
args <- list(1:10, na.rm = T)
do.call(mean, args) #divery the formals to the functions
g <- function(a = 1, b = a * 2) {
c(a, b)
}
g()
g(109)
f <- function(a = 1, b = d) {
d <- (a + 1) ^ 2
c(a, b)
}
f()
f(10)
i <- function(a, b) {
c(missing(a), missing(b))
}
i()
i(a = 1)
i(b = 2)
i(a = 1, b = 2)
f <- function(x) {
10
}
f(stop("this is an error!"))
f <- function(x) {
force(x)
10
}
f(stop("this is an error"))
add <- function(x) {
function(y) x + y
}
adders <- lapply(1:10, add)
adders[[1]](10)
adders[[10]](10)
add <- function(x) {
force(x)
function(y) x + y
}
adders2 <- lapply(1:10, add)
adders2[[1]](10)
adders2[[10]](10)
f <- function(x = ls()) {
a <- 1
x
}
f()
f(ls())
`&&` <- function(x, y) {
if (!x) return(FALSE)
if (!y) return(FALSE)
TRUE
}
a <- NULL
!is.null(a) && a > 0
a > 0 && !is.null(a)
if(is.null(a)) stop("a is null")
!is.null(a) || stop("a is null")
barplot(1:5, col = "red", pch = 20)
plot(1:5, bty = "u")
plot(1:5, labels = F)
f <- function(...) {
names(list(...))
}
f(a = 1, b = 2)
`%+%` <- function(a, b) paste(a, b, sep = "")
"new" %+% "string"
`second<-` <- function(x, value) {
x[2] <- value
x
}
x <- 1:10
second(x) <- 5
x
library(pryr)
x <- 1:10
address(x)
second(x) <- 6
address(x)
`modify<-` <- function(x, position, value) {
x[position] <- value
x
}
modify(x, 1) <- 10
modify(get("x"), 1) <- 10 # do not work,
# get("x") <- `modify<-`(get("x"), 1, 10)
replace_fun <- function(fun = F) {
objs <- mget(ls("package:base"), inherits = T)
fun_names <- names(Filter(is.function, objs))
replace_fun_names <- fun_names[str_detect(fun_names, "<-")]
if (fun) objs[replace_fun_names] else replace_fun_names
}
replace_fun(fun = T)
f1 <- function() 1
f1()
f2 <- function() invisible(1)
f2()
f1() == 1
f2() == 1
(f2())
a <- 2
(a <- 2) # <- is one of the most common used function that
# return invisible value, so, it can be used below
a <- b <- c <- d <- 2
in_dir <- function(dir, code) {
old <- setwd(dir)
on.exit(setwd(old))
force(code)
}
getwd()
in_dir("~", getwd())
# OO GUIDE ----------------------------------------------------------------
f <- function() {}
typeof(f)
is.function(f)
typeof(sum)
is.primitive(sum)
df <- data.frame(x = 1:10, y = letters[1:10])
otype(df)
otype(df$x)
otype(df$y)
mean
UseMethod("mean")
ftype(mean)
ftype(t.data.frame)
ftype(t.test)
methods("mean")
methods(class = "list")
foo <- structure(list(), class = "foo")
foo
foo <- list()
class(foo) <- "foo"
foo
class(foo)
inherits(foo, "foo")
foo <- function(x) {
if(!is.numeric(x)) stop("X must be numeric")
structure(list(x), class = "foo")
}
mod <- lm(log(mpg) ~ log(disp), data = mtcars)
class(mod)
print(mod)
class(mod) <- "data.frame"
print(mod)
mod$coefficients
f <- function(x) UseMethod("f")
f.a <- function(x) "class a"
a <- structure(list(), class = "a")
class(a)
f(a)
mean.a <- function(x) "a"
mean(a)
f <- function(x) UseMethod("f")
f.a <- function(x) "class a"
f.default <- function(x) "unknown class"
f(structure(list(), class = "a"))
f(structure(list(), class = "b"))
f(structure(list(), class = c("b", "a")))
library(methods)
library(stats4)
y <- c(26, 17, 13, 12,20, 5, 9, 8, 5, 4, 8)
nLL <- function(lambda) -sum(dpois(y, lambda, log = T))
fit <- mle(nLL, start = list(lambda = 5), nobs = length(y))
isS4(fit)
otype(fit)
isS4(nobs)
ftype(nobs)
# s4 object ---------------------------------------------------------------
mle_nobs <- method_from_call(nobs(fit))
isS4(mle_nobs)
ftype(mle_nobs)
is(fit)
class(fit)
is(fit, "mle")
getGenerics()
getclass()
setClass("Person",
slots = list(name = "character", age = "numeric"))
setClass("Employee",
slots = list(boss = "Person"),
contains = "Person")
alice <- new("Person", name = "Alice", age = 40)
john <- new("Employee", name = "John", age = 20, boss = alice)
alice@age
slot(john, "boss")
setClass("RangedNumeric",
contains = "numeric",
slots = list(min = "numeric", max = "numeric"))
rn <- new("RangedNumeric", 1:10, min = 1, max = 10)
rn@min
rn@.Data
setGeneric("union")
setMethod("union",
c(x = "data.frame", y = "data.frame"),
function(x, y) {
unique(rbind(x, y))
})
setGeneric("myGeneric", function(x) {
standardGeneric("myGeneric")
})
# reference class ---------------------------------------------------------
Account <- setRefClass("Account")
Account$new()
Account <- setRefClass("Account",
fields = list(balance = "numeric"))
a <- Account$new(balance = 100)
a$balance
a$balance <- 200
a$balance
b <- a
b$balance
a$balance <- 0
b$balance
c <- a$copy()
c$balance
a$balance <- 100
c$balance
Account <- setRefClass("Account",
fields = list(balance = "numeric"),
methods = list(
withdraw = function(x) {
balance <<- balance - x
},
deposit = function(x) {
balance <<- balance + x
}
))
a <- Account$new(balance = 100)
a$deposit(100)
a$balance
NoOverdraft <- setRefClass("NoOverdraft",
contains = "Account",
methods = list(
withdraw = function(x) {
if (balance < x) stop("Not enough money")
balance <<- balance - x
}
))
accountJohn <- NoOverdraft$new(balance = 100)
accountJohn$deposit(50)
accountJohn
accountJohn$withdraw(1000)
pryr::otype(accountJohn)
# env ---------------------------------------------------------------------
e <- new.env()
e$a <- FALSE
e$b <- "a"
e$c <- 2.3
e$d <- 1:3
e$a <- e$d
e$a <- 1:3
search()
as.environment("package:stats")
ls(e)
parent.env(e)
e$.a <- 2
ls(e)
ls(e, all.names = T)
str(e)
ls.str(e)
e$c <- 3
e$c
e[["c"]]
get("c", envir = e)
e <- new.env()
e$a <- 1
e$a <- NULL
ls(e)
e$a
rm("a", envir = e)
ls(e)
x <- 10
exists("x", envir = e)
exists("x", envir = e, inherits = F)
identical(globalenv(), environment())
library(pryr)
x <- 5
where("x")
where("mean")
where <- function(name, env = parent.env()) {
if(identical(env, emptyenv())) {
stop("can not find", name, call. = FALSE)
} else if (exists(name, envir = env, inherits = FALSE))
env
} else {
where(name,parent.env(env))
}
y <- 1
f <- function(x) x + y
environment(f)
e <- new.env()
e$g <- function() 1
environment(e$g)
environment(sd)
where("sd")
g <- function(x) {
if(!exists("a", inherits = F)) {
message("define a")
a <- 1
} else {
a <- a + 1
}
a
}
g(10)
g(10)
h <- function() {
x <- 10
function() {
x
}
}
i <- h()
x <- 20
i()
#assign
x <- 0
f <- function() {
x <<- 1
}
f()
x
library(pryr)
system.time(b%<d-%{Sys.sleep(1); 1})
system.time(b)
x %<a-% runif(1)
x
x
modify <- function(x) {
x$a <- 2
invisible()
}
x_1 <- list()
x_1$a <- 1
modify(x_1)
x_1$a
x_e <- new.env()
x_e$a <- 1
modify(x_e)
x_e$a
remove(x)
x <- 1
e1 <- new.env()
get("x", envir = e1)
get("x", envir = e1, inherits = F)
e2 <- new.env(parent = emptyenv())
get("x", envir = e2)
# debug & defensive programing (skip some subcharpter) --------------------
f <- function(a) g(a)
g <- function(b) h(b)
h <- function(c) i(c)
i <- function(d) "a" + d
f(10)
f1 <- function(x) {
log(x)
10
}
f1("x")
f2 <- function(x) {
try(log(x))
10
}
f2("x")
try({
a <- 1
b <- "x"
a + b
})
success <- try(1 + 2)
failure <- try("a" + "b")
class(success)
class(failure)
elements <- list(1:10, c(-1, 10), c(T, F), letters)
rm("results")
results <- lapply(elements, log)
results
results <- lapply(elements, function(x) try(log(x)))
results
is.error <- function(x) inherits(x, "try-error")
succeeded <- !sapply(results, is.error)
str(results[succeeded])
str(elements[!succeeded])
default <- NULL
try(default <- read.csv("possibly-bad-input.csv"), silent = T)
show_condition <- function(code) {
tryCatch(code,
error = function(c) "error",
warning = function(c) "warning",
message = function(c) "message")
}
show_condition(stop("!"))
show_condition(warning("?!"))
show_condition(10)
try2 <- function(code, silent = F) {
tryCatch(code, error = function(c) {
msg <- conditionMessage(c)
if(!silent) message(c)
invisible(structure(msg, class = "try-error"))
})
}
try2(1)
try2(stop("hi"))
try2(stop("hi"), silent = T)
i <- 1
while(i < 3) {
tryCatch({
Sys.sleep(.5)
message("try to escape")
}, interrupt = function(x) {
message("try again!")
i <<- i + 1
})
}
#defensive programing
col_means <- function(df) {
numeric <- sapply(df, is.numeric)
numeric_cols <- df[, numeric]
data.frame(lapply(numeric_cols, mean))
}
if (T) {
col_means <- function(df) {
numeric <- lapply(df, is.numeric)
numeric_cols <- df[, unlist(numeric), drop = FALSE]
data.frame(lapply(numeric_cols, mean))
}
}
col_means(mtcars)
col_means(mtcars[, 0])
col_means(mtcars[0, ])
col_means(mtcars[, "mpg", drop = F])
col_means(1:10)
col_means(as.matrix(mtcars))
col_means(as.list(mtcars))
mtcars2 <- mtcars
mtcars2[-1] <- lapply(mtcars2[-1], as.character)
col_means(mtcars2)
# SECTION_2 functional programing -----------------------------------------
set.seed(1014)
df <- data.frame(replicate(6, sample(c(1:10, -99), 6, rep = TRUE)))
names(df) <- letters[1:6]
df
fix_missing <- function(x) {
x[x == -99] <- NA
x
}
df$a <- fix_missing(df$a)
df$b <- fix_missing(df$b)
df$c <- fix_missing(df$c)
df$d <- fix_missing(df$d)
df[] <- lapply(df, fix_missing)
df[1:5] <- lapply(df[1:5], fix_missing)
fix_missing_999 <- function(x) {
x[x == -999] <- NA
x
}
fix_missing_9999 <- function(x) {
x[x == -9999] <- NA
x
}
missing_fixer <- function(na_value) {
function(x) {
x[x == na_value] <- NA
x
}
}
fix_missing_88 <- missing_fixer(88)
fix_missing_88(c(88, 89))
fix_missing <- function(x, na.value) {
x[x == na.value] <- NA
x
}
summary <- function(x) {
c(mean(x, na.rm = T),
median(x, na.rm = T),
sd(x, na.rm = T),
mad(x, na.rm = T),
IQR(x, na.rm = T))
}
lapply(df, summary)
summary <- function(x) {
funs <- c(mean, median, sd, mad, IQR)
lapply(funs, function(f) f(x, na.rm = T))
}
lapply(df, summary)
lapply(mtcars, function(x) length(unique(x)))
Filter(function(x) !is.numeric(x), mtcars)
integrate(function(x) sin(x)^2, 0, pi)
function(x) 3()
(function(x) 3)()
(function(x) x + 10)(3)
power <- function(exponent) {
function(x) {
x ^ exponent
}
}
square <- power(2)
square(4)
cube <- power(3)
cube(2)
square
cube
as.list(environment(square))
as.list(environment(cube))
library(pryr)
unenclose(square)
unenclose(cube)
new_counter <- function() {
i <- 0
function() {
i <<- i + 1
i
}
}
count_one <- new_counter()
count_two <- new_counter()
count_one()
count_one()
count_one()
count_two()
count_two()
i <- 0
new_counter2 <- function() {
i <<- i + 1
i
}
new_counter3 <- function() {
i <- 0
function() {
i <- i + 1
i
}
}
new_counter2()
new_counter2()
new_count <- new_counter3()
new_count()
compute_mean <- list(
base = function(x) mean(x),
sum = function(x) sum(x) / length(x),
manual = function(x) {
total <- 0
n <- length(x)
for (i in seq_along(x)) {
total <- total + x[1] / n
}
total
}
)
x <- runif(1e5)
system.time(compute_mean$base(x))
system.time(compute_mean[[2]](x))
system.time(compute_mean[["manual"]](x))
lapply(compute_mean, function(f) f(x))
call_fun <- function(f, ...) f(...)
lapply(compute_mean, call_fun, x)
lapply(compute_mean, function(x) system.time(f(x)))
x <- 1:10
funs <- list(
sum = sum,
mean = mean,
median = median
)
lapply(funs, function(f) f(x))
funs2 <- list(
sum = function(x, ...) sum(x, ..., na.rm = TRUE),
mean = function(x, ...) mean(x, ..., na.rm = TRUE),
median = function(x, ...) median(x, ..., na.rm = TRUE)
)
lapply(funs2, function(f) f(x))
lapply(funs, function(f) f(x, na.rm = T))
simple_tag <- function(tag) {
force(tag)
function(...) {
paste0("<", tag, ">", paste0(...), "</", tag, ">")
}
}
tags <- c("p", "b", "i")
html <- lapply(setNames(tags, tags), simple_tag)
html$p("this is ", html$b("bold"))
with(html, p("this is", b("bold"), "text."))
attach(html)
p("this is", b("bold"), "text.")
detach()
list2env(html, environment())
p("This is ", b("bold"), " text.")
rm(list = names(html), envir = environment())
midpoint <- function(f, a, b) {
(b - a) * f((a + b) / 2)
}
trapezoid <- function(f, a, b) {
(b - a) / 2 * (f(a) + f(b))
}
midpoint(sin, 0, pi)
trapezoid(sin, 0, pi)
midpoint_composite <- function(f, a, b, n = 10) {
points <- seq(a, b, length = n + 1)
h <- (b - a) / n
area <- 0
for (i in seq_len(n)) {
area <- area + h * f((points[i] + points[i + 1]) / 2)
}
area
}
trapezoid_composite <- function(f, a, b, n = 10) {
points <- seq(a, b, length = n + 1)
h <- (b - a) / n
area <- 0
for (i in seq_len(n)) {
area <- area + h / 2 * (f(points[i]) + f(points[i + 1]))
}
area
}
midpoint_composite(sin, 0, pi, n = 10)
midpoint_composite(sin, 0, pi, n = 100)
trapezoid_composite(sin, 0, pi, n = 10)
trapezoid_composite(sin, 0, pi, n = 100)
composite <- function(f, a, b, n = 10, rule) {
points <- seq(a, b, length = n + 1)
area <- 0
for (i in seq_len(n)) {
area <- area + rule(f, points[i], points[i + 1])
}
area
}
composite(sin, 0, pi, n = 10, rule = midpoint)
composite(sin, 0, pi, n = 10, rule = trapezoid)
simpson <- function(f, a, b) {
(b - a) / 6 * (f(a) + 4 * f((a + b) / 2) + f(b))
}
boole <- function(f, a, b) {
pos <- function(i) a + i * (b - a) / 4
fi <- function(i) f(pos(i))
(b - a) / 90 *
(7 * fi(0) + 32 * fi(1) + 12 * fi(2) + 32 * fi(3) + 7 * fi(4))
}
composite(sin, 0, pi, n = 10, rule = simpson)
composite(sin, 0, pi, n = 10, rule = boole)
newton_cotes <- function(coef, open = FALSE) {
n <- length(coef) + open
function(f, a, b) {
pos <- function(i) a + i * (b - a) / n
points <- pos(seq.int(0, length(coef) - 1))
(b - a) / sum(coef) * sum(f(points) * coef)
}
}
boole <- newton_cotes(c(7, 32, 12, 32, 7))
milne <- newton_cotes(c(2, -1, 2), open = TRUE)
composite(sin, 0, pi, n = 10, rule = milne)
randomise <- function(f) f(runif(1e3))
randomise(mean)
randomise(mean)
randomise(sum)
# functional --------------------------------------------------------------
lapply2 <- function(x, f, ...) {
out <- vector("list", length(x))
for (i in seq_along(x)) {
out[[i]] <- f(x[[i]], ...)
}
out
}
l <- replicate(20, runif(sample(1:10, 1)), simplify = F)
out <- vector("list", length(l))
for (i in seq_along(l)) {
out[[i]] <- length(l[[i]])
}
unlist(out)
unlist(lapply(l, length))
unlist(lapply(mtcars, class))
mtcars[] <- lapply(mtcars, function(x) x / mean(x))
trims <- c(0, .1, .2, .5)
x <- rcauchy(1000)
unlist(lapply(trims, function(trim) mean(x, trim = trim))) # beautiful
lapply(trims, mean, x = x)
# 3 ways to for-loop 1: for (x in xs) 2: for (i in seq_along(xs))
# 3:for(nm in names(xs))
xs <- runif(1e3)
res <- c()
for (x in xs) res <- c(res, sqrt(x)) #it is slowl
res
# 3 ways to using lapply
lapply(xs, function(x) {}) # most frequently used methods
lapply(seq_along(xs), function(i) {})
lapply(names(xs), function(nm) {})
formulas <- list(
mpg ~ disp,
mpg ~ I(1 / disp),
mpg ~ disp + wt,
mpg ~ I(1 / disp) + wt
)
lapply(formulas, function(formula) lm(formula = formula, data = mtcars))
bootstraps <- lapply(1:10, function(i) {
rows <- sample(1:nrow(mtcars), rep = T)
mtcars[rows, ]
})
out <- vector("list", 10)
for (i in 1:10) {
rows <- sample(1:nrow(mtcars), rep = T)
out[[i]] <- mtcars[rows, ]
}
out
sapply(mtcars, is.numeric)
vapply(mtcars, is.numeric, logical(1))
sapply(list(), is.numeric) # better in interactive
vapply(list(), is.numeric, logical(1)) #better in program
df <- data.frame(x = 1:10, y = letters[1:10])
sapply(df, class)
vapply(df, class, character(1))
df2 <- data.frame(x = 1:10, y = Sys.time() + 1:10)
sapply(df2, class)
vapply(df2, class, character(1))
xs <- replicate(5, runif(10), simplify = F)
ws <- replicate(5, rpois(10, 5) + 1, simplify = F)
unlist(lapply(xs, mean))
unlist(lapply(seq_along(xs), function(i) {
weighted.mean(xs[[i]], ws[[i]])
}))
unlist(Map(weighted.mean, xs, ws))
mtmeans <- lapply(mtcars, mean)
mtmeans[] <- Map(`/`, mtcars, mtmeans)
Map(function(x, w) weighted.mean(x, w, na.rm = T), xs, ws)
rollmean <- function(x, n) {
out <- rep(NA, length(x))
offset <- trunc(n / 2)
for (i in (offset + 1):(length(x) - n + offset - 1)) {
out[i] <- mean(x[(i - offset):(i + offset - 1)])
}
out
}
x <- seq(1, 3, length = 1e2) + runif(1e2)
plot(x)
lines(rollmean(x, 5), col = "blue", lwd = 2)
lines(rollmean(x, 10), col = "red", lwd = 2)
rollapply <- function(x, n, f, ...) {
out <- rep(NA, length(x))
offset <- trunc(n / 2)
for (i in (offset = 1):(length(x) - n + offset + 1)) {
out[i] <- f(x[(i - offset):(i + offset)], ...)
}
out
}
x <- seq(1, 3, length = 1e2) + rt(1e2, df = 2) / 3
plot(x)
lines(rollmean(x, 5), col = "red", lwd = 2)
lines(rollapply(x, 5, median), col = "blue", lwd = 2)
# which element be computed first is not important when you using apply
lapply3 <- function(x, f, ...) {
out <- vector("list", length(x))
for (i in sample(seq_along(x))) {
out[[i]] <- f(x[[i]], ...)
}
out
}
unlist(lapply(1:10, sqrt))
unlist(lapply3(1:10, sqrt))
library(parallel)
boot_df <- function(x) x[sample(nrow(x), rep = T), ]
rsquared <- function(mod) summary(mod)$r.square
boot_lm <- function(i) {
rsquared(lm(mpg ~ wt + disp, data = boot_df(mtcars)))
}
system.time(lapply(1:500, boot_lm))
system.time(mclapply(1:500, boot_lm, mc.cores = 2)) # unfortunately, it does not work on windows,try parLapply() instead.
a <- matrix(1:20, nrow = 5)
a1 <- apply(a, 1, identity)
a1
identical(a, a1)
identical(a, t(a1))
x <- matrix(rnorm(20, 0, 1), nrow = 4)
x1 <- sweep(x, 1, apply(x, 1, min), `-`)
x2 <- sweep(x1, 1, apply(x1, 1, max), `/`)
outer(1:3, 1:10, `*`)
pulse <- round(rnorm(22, 70, 10 / 3)) + rep(c(0, 5), c(10, 12))
group <- rep(c("A", "B"), c(10, 12))
tapply(pulse, group, length)
tapply(pulse, group, mean)
split(pulse, group)
tapply2 <- function(x, group, f, ..., simplify = TRUE) {
pieces <- split(x, group)
sapply(pieces, f, simplify = simplify)
}
tapply2(pulse, group, mean)
tapply2(pulse, group, length)
Reduce2 <- function(f, x) {
out <- x[[1]]
for (i in seq(2, length(x))) {
out <- f(out, x[[i]])
}
out
}
l <- replicate(5, sample(1:10, 15, replace = T), simplify = F)
Reduce(intersect, l)
where <- function(f, x) {
vapply(x, f, logical(1))
}
df <- data.frame(x = 1:3, y = c("a", "b", "c"))
where(is.factor, df)
str(Filter(is.factor, df))
Find(is.numeric, df)
Position(is.factor, df)
# integrate() uniroot() potimise() are skiped
trans <- list(
disp = function(x) x * 0.0163871,
am = function(x) factor(x, levels = c("auto", "manual"))
)
for (var in names(trans)) {
mtcars[[var]] <- trans[[var]](mtcars[[var]])
}
# recursively using for-loop is skiped
#11.7
add <- function(x, y) {
stopifnot(length(x) == 1, length(y) == 1,
is.numeric(x), is.numeric(y))
x + y
}
rm_na <- function(x, y, identity) {
if(is.na(x) && is.na(y)) {
identity
} else if (is.na(x)) {
y
} else {
x
}
}
rm_na(NA, 10, 0)
rm_na(10, NA, 0)
rm_na(NA, NA, NA)
add <- function(x, y, na.rm = F) {
if (na.rm && (is.na(x) || is.na(y))) rm_na(x, y, 0) else x + y
}
add(10, NA)
add(10, NA, na.rm = T)
add(NA, NA)
add(NA, NA, na.rm = T)
r_add <- function(xs, na.rm = T) {
Reduce(function(x, y) add(x, y, na.rm = na.rm), xs)
}
r_add(1:10)
r_add(NA, na.rm = T)
r_add(numeric())
r_add <- function(xs, na.rm = T) {
Reduce(function(x, y) add(x, y, na.rm = na.rm), xs, init = 0)
}
r_add(1:10)
r_add(NA, na.rm = T)
r_add(numeric())
v_add1 <- function(x, y, na.rm = FALSE) {
stopifnot(length(x) == length(y), is.numeric(x), is.numeric(y))
if (length(x) == 0) return(numeric())
simplify2array(Map(function(x, y) add(x, y, na.rm = na.rm), x, y))
}
v_add1(1:10, 1:10)
v_add1(numeric(), numeric())
v_add1(c(1, NA), c(1, NA))
v_add1(c(1, NA), c(1, NA), na.rm = T)
c_add <- function(xs, na.rm = FALSE) {
Reduce(function(x, y) add(x, y, na.rm = na.rm), xs, accumulate = TRUE)
}
c_add(1:10)
row_sum <- function(x, na.rm = FALSE) {
apply(x, 1, add, na.rm = na.rm)
}
col_sum <- function(x, na.rm = FALSE) {
apply(x, 2, add, na.rm = na.rm)
}
# 12 ----------------------------------------------------------------------
chatty <- function(f) {
function(x, ...) {
res <- f(x, ...)
cat("processing", x, "\n", sep = "")
res
}
}
f <- function(x) x ^ 2
s <- 3:1
chatty(f)(1)
vapply(s, chatty(f), numeric(1))
delay_by <- function(delay, f) {
function(...) {
Sys.sleep(delay)
f(...)
}
}
system.time(runif(100))
system.time(delay_by(.1, runif)(100))
dot_every <- function(n, f) {
i <- 1
function(...) {
if (i %% n == 0) cat(".")
i <<- i + 1
f(...)
}
}
x <- lapply(1:100, runif)
x <- lapply(1:100, dot_every(10, runif))
library(memoise)
slow_func <- function(x) {
Sys.sleep(1)
10
}
system.time(slow_func())
system.time(slow_func())
fast_func <- memoise(slow_func)
system.time(fast_func())
system.time(fast_func())
fib <- function(n) {
if(n < 2) return(1)
fib(n -2) + fib(n - 1)
}
system.time(fib(25))
system.time(fib(26))
fib2 <- memoise(function(n) {
if (n < 2) return(1)
fib2(n - 2) + fib2(n - 1)
})
system.time(fib2(23))
system.time(fib2(24))
runif <- memoise(runif)
runif(5)
runif(5)
# a lot of contents skiped
# some interesting FUNCs by me --------------------------------------------
library(tidyverse)
get_coefs <- function(model) {
summary(model)[["coefficients"]]
}
line_two <- function(matrix, n = 2) {
matrix[n, , drop = F]
}
vstack <- function(list) {
do.call(rbind, list)
}
do_model <- function(response, data, model = lm, ...) {
formula <- as.formula(paste(response, "~", "."))
model(formula, data = data, ...)
}
extend_scaler <- function(x) {
c(x, unlist(lapply(0:99, function(i) str_c(x, i))))
}
extend_chr <- function(chr) {
unlist(lapply(chr, extend_scaler))
}
keep_row <- function(matrix, vras, extend = T) {
if (extend) vars = extend_chr(vars)
vars = intersect(rownames(matrix), vars)
matrix[vars, , drop = F]
}
univar <- function(response, vars_interested, controls = NA, data, model = lm, ...) {
entry_name <- c(response, vars_interested, controls)
entry_name <- entry_name[!is.na(entry_name)]
wrong_name <- setdiff(entry_name, names(data))
if (length(wrong_name) != 0) stop("wrong variable name in you entry")
out <- vector("list", length = length(vars_interested))
for (i in seq_along(vars_interested)) {
new_data <- data[c(response, vars_interested[[i]], controls[!is.na(controls)])]
fit <- do_model(response, new_data, model, ...)
out[[i]] <- get_coefs(fit)
}
out
}
data(mtcars); names(mtcars)
vars <- c("cyl", "disp", "hp")
univar("am", vars, c("qsec", "gear"), mtcars)
univar("am", vars, c("qsec", "gear"), mtcars) %>% lapply(line_two)
univar("am", vars, c("qsec", "gear"), mtcars) %>% lapply(line_two) %>% vstack
mtcars$cyl <- as.factor(mtcars$cyl)
univar("vs", vars, "gear", data = mtcars, model = glm, family = binomial())
univar("vs", vars, "gear", data = mtcars, model = glm, family = binomial()) %>%
vstack %>% keep_row(vars)
library(reshape)
library(tidyverse)
library(tableone)
q_plot <- function(data, ...) qplot(data = data, ...)
to_draw <- function(data, group_var) {
nms <- names(data)
nms2 <- setdiff(nms, group_var)
melt(data, id = group_var)
}
keep_num <- function(data, group) {
out <- keep(data, is.numeric)
if (is.numeric(data[[group]])) out else cbind(out, data[group])
}
data(iris); names(iris) <- letters[1:5]; iris$group <- c(1, 0); str(iris); head(iris)
iris %>% keep_num("group") %>% to_draw("group") %>%
q_plot(x = value, facets = variable ~ group, geom = "density")
CreateTableOne(c("a", "b", "c", "d", "e"), "group", data = iris) %>%
print(nonnormal = c("c", "d"))
网友评论