美文网首页
Notes of R for data science_07Ju

Notes of R for data science_07Ju

作者: liang_rujiang | 来源:发表于2020-07-07 12:02 被阅读0次

    缘起

    整理文件时发现之前的笔记,放到这里。

    目的

    坐车等无聊情景下方便回顾这些技能。

    # this is a note about the book [R For Data Science] 
    # and  a little bitter of [R In Action]
    # and [Advanced R]
    
    # Rujiang Liang
    # Created in 2018
    
    
    # R FOR DATA SCIENCE ------------------------------------------------------
    # DATA TRANSFORMATION -----------------------------------------------------
    
    library(tidyverse)
    library(nycflights13)
    
    nycflights13::flights %>%
      print(n = 10, width = Inf) # width = Inf to show all variables
    
    iris %>%
      .$Species %>% 
      table() %>%
      (function(x) x / 3) %>%
      (function(x) x + 2) %>%
      `*`(2) %>%
      round() %>%
      as.character()
    
    diamonds2 <- diamonds %>%
      filter(between(y, 3, 20))
    
    iris %>%
      .[["Species"]]
    
    jan1 <- flights %>%
      filter(month == 1, day == 1)
    
    test <- flights %>%
      filter(month == 11 | month == 12)
    
    table(test$month)
    
    filter(flights, month %in% c(11, 12)) %>%
      count(month)
    
    # ***FALSE****
     filter(flights, month == 11 | 12) %>% count(month)
    
    flights %>%
      arrange(year, month, day) 
    
    flights %>%
      arrange(desc(dep_delay))
    
    # SELECT helper
    # starts_with("abc"): matches names that begin with “abc”.
    # ends_with("xyz"): matches names that end with “xyz”.
    # contains("ijk"): matches names that contain “ijk”.
    # matches("(.)\\1"): selects variables that match a regular expression. 
    # num_range("x", 1:3): matches x1, x2 and x3
    
    flights %>%
      rename(YEAR = year, MONTH = month)
    
    select(flights, time_hour, air_time, everything())
    
    # generate variables by existing variables
    flights %>%
      select(
        year:day,
        ends_with("delay"),
        distance,
        air_time
      ) %>%
      mutate(
        gain = dep_delay - arr_delay,
        hour = air_time / 60,
        gaim_per_hour = gain / hour
      )
    
    # keep ONLY new variables
    transmute(flights, 
              gain = dep_delay - arr_delay,
              hour = air_time / 60,
              gain_per_hour = gain / hour
    )
    
    transmute(flights, 
              dep_time, # so that we can keep oringal variables
              hour = dep_time %/% 100,
              minute = dep_time %% 100
    )
    
    
    # group_by summary
    table1 <- tibble(
      a = c(1, 2),
      b = c(100, 200)
    )
    
    table1 %>%
      count(a)
    
    table1 %>%
      count(a, wt = b)
    
    flights %>%
      group_by(year, month, day) %>%
      summarise(count = n(), # MEAN or SD calculated by how many cases?
                delay = mean(dep_delay, na.rm = T)) # na.rm = T should not be omited 
                                                    # unless the data have no NA.
    
    flights %>%
      group_by(dest) %>%
      summarise(
        count = n(),
        dist = mean(distance, na.rm = T),
        delay = mean(arr_delay, na.rm = T)
      ) %>%
      filter(count > 20, dest != "HNL")
    
    not_cancelled <- flights %>%
      filter(!is.na(dep_delay), !is.na(arr_delay))
    
    not_cancelled %>%
      group_by(year, month, day) %>%
      summarise(mean = mean(dep_delay),
                n = n())
    
    delays <- not_cancelled %>%
      group_by(tailnum) %>%
      summarise(delay = mean(arr_delay)) %>%
      ggplot(aes(x = delay)) + 
      geom_freqpoly(binwidth = 10)
    
    not_cancelled %>% 
    group_by(year, month, day) %>% 
      summarise(
        first = min(dep_time),
        last = max(dep_time)
      )
    
    #first(x), nth(x, 2), last(x) work similarly to x[1], x[2], x[length(x)]
    not_cancelled %>% 
      group_by(year, month, day) %>% 
      summarise(
        first_dep = first(dep_time), 
        last_dep = last(dep_time),
        quantile = quantile(dep_time, .75)
      )
    
    iris %>%
      group_by(Species) %>%
      summarise(
        mean = mean(Sepal.Width),
        quantile = quantile(Sepal.Length, .75))
    
    sapply(Filter(is.numeric, iris), function(i) list(mean = mean(i), sd = sd(i)))
    funs <- list(min = min, median = median, mean = mean, max = max, sd = sd)
    sapply(funs, function(f) f(1:10, na.rm = T))
    sapply(Filter(is.numeric, iris), 
           function(x) sapply(funs, function(f) f(x, na.rm = T)))
    
    iris %>%
      split(.$Species) %>%
      sapply(., function(df) 
        list(mean = mean(df$Sepal.Width), 
             quantile = quantile(df$Sepal.Length, .75)))
    iris %>% 
      summarise(n())
    iris %>%
     group_by(Species) %>%
      summarise(n(),
                n_distinct(Sepal.Length),
                sum(!is.na(Sepal.Length)))
    
    not_cancelled %>% 
      group_by(year, month, day) %>% 
      mutate(r = min_rank(desc(dep_time))) %>%
      filter(r %in% range(r))
    
    not_cancelled %>% 
      group_by(year, month, day) %>% 
      summarise(
        avg_delay1 = mean(arr_delay),
        avg_delay2 = mean(arr_delay[arr_delay > 0]) # the average positive delay
      )
    
    #n(), sum(!is.na(x)), n_distinct(x)
    not_cancelled %>% 
      group_by(dest) %>% 
      summarise(carriers = n_distinct(carrier)) %>% 
      arrange(desc(carriers))
    
    not_cancelled %>% 
      group_by(year, month, day) %>%
      filter(rank((arr_delay)) < 10)
    iris %>%
      as.tibble %>%
      filter(rank(Sepal.Width) < 10)
    
    iris %>%
      as.tibble %>%
      filter(min_rank(Sepal.Width) < 10)
    
    iris %>%
      as.tibble %>%
      filter(rank(desc(Sepal.Width)) < 10)
    
    popular_dests <- flights %>% 
      group_by(dest) %>% 
      filter(n() > 365)
    popular_dests
    
    popular_dests %>% 
      filter(arr_delay > 0) %>% 
      mutate(prop_delay = arr_delay / sum(arr_delay)) %>% 
      select(year:day, dest, arr_delay, prop_delay)
    
    # related database (merge)
    library(nycflights13)
    # left_join:  by = c("dest" = "faa")
    top_dest <- flights %>%
      count(dest, sort = T) %>%
      head(10)
    
    flights %>%
      semi_join(top_dest)
    
    ## the below part of related database is not important
    library(nycflights13)
    airlines
    airports
    planes
    planes %>%
      count(tailnum) %>%
      filter(n > 1)
    
    weather %>% 
      count(year, month, day, hour, origin) %>% 
      filter(n > 1)
    
    flights %>%
      count(year, month, day, hour, flight) %>%
      filter(n > 1)
    
    flights <- flights %>%
      mutate(id = row_number())
    flights
    
    flights %>%
      count(id) %>%
      filter(n > 1)
    
    flights2 <- nycflights13::flights %>%
      select(year:day, hour, origin, dest, tailnum, carrier) 
    flights2 
    
    flights2 %>%
      select(-origin, -dest) %>%
      left_join(airlines, by = "carrier")
    
    flights2 %>%
      select(-origin, -dest) %>%
      mutate(name = airlines$name[match(carrier, airlines$carrier)])
    
    
    x <- tribble(
      ~key, ~val_x,
      1, "x1",
      2, "x2",
      2, "x3",
      1, "x4"
    )
    y <- tribble(
      ~key, ~val_y,
      1, "y1",
      2, "y2"
    )
    left_join(x, y, by = "key")
    left_join(y, x, by = "key")
    
    flights2 %>%
      left_join(weather)
    
    flights2 %>%
      left_join(planes, by = "tailnum")
    
    flights2 %>%
      left_join(airports, by = c("dest" = "faa"))
    
    flights2 %>%
      left_join(airports, by = c("origin" = "faa"))
    
    airports %>%
      semi_join(flights, c("faa" = "dest")) %>%
      ggplot(aes(lon, lat)) +
      borders("state") +
      geom_point(color = "blue") +
      coord_quickmap() + 
      theme_void()
    
    top_dest <- flights %>%
      count(dest, sort = T) %>%
      head(10)
    
    flights %>%
      filter(dest %in% top_dest$dest)
    
    flights %>%
      semi_join(top_dest, by = "dest")
    
    # spreading and gathering 
    library(tidyverse)
    table4a
    tidy4a <- table4a %>%
      gather(`1999`, `2000`, key = "year", value = "cases")
    tidy4a
    
    table4b
    tidy4b <- table4b %>%
      gather(`1999`, `2000`, key = "year", value = "population")
    
    tidy4a %>%
      left_join(tidy4b, by = c("country", "year"))
    
    table2
    table2 %>%
      spread(key = type, value = count)
    
    table3
    table3 %>%
      separate(rate, into = c("cases", "population"))
    
    table3 %>%
      separate(rate, into = c("cases", "population"), sep = "/", convert = T)
    
    table3 %>%
      separate(year, into = c("century", "year"), sep = 2)
    
    table5
    table5 %>%
      unite(new, century, year)
    
    table5 %>%
      unite(new, century, year, sep = "")
    stocks <- tibble(
      year   = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
      qtr    = c(   1,    2,    3,    4,    2,    3,    4),
      return = c(1.88, 0.59, 0.35,   NA, 0.92, 0.17, 2.66)
    )
    stocks
    stocks %>%
      spread(year, return)
    
    stocks %>% 
      spread(year, return) %>% 
      gather(`2015`:`2016`, key = "year", value = "return")
    stocks %>% 
      spread(year, return) %>% 
      gather(`2015`:`2016`, key = "year", value = "return", na.rm = T)
    
    women %>% 
      mutate(id = row_number()) %>% 
      gather(-id, key = "key", value = "value")
    
    stocks %>%
      complete(year, qtr)
    
    treatment <- tribble(
      ~ person,           ~ treatment, ~response,
      "Derrick Whitmore", 1,           7,
      NA,                 2,           10,
      NA,                 3,           9,
      "Katherine Burke",  1,           4
    )
    treatment
    treatment %>%
      fill(person)
    
    # an example of data cleaning
    library(tidyverse)
    who
    who1 <- who %>%
      gather(new_sp_m014:newrel_f65, key = "key", value = "case", na.rm = T)
    who1
    who1 %>%
      count(key)
    who2 <- who1 %>%
      mutate(key = str_replace(key, "newrel", "new_rel"))
    who2
    who3 <- who2 %>%
      separate(key, c("new", "type", "sexage"), sep = "_")
    who3 %>%
      count(new)
    who4 <- who3 %>%
      select(-new, -iso2, -iso3)
    who4
    
    who5 <- who4 %>%
      separate(sexage, c("sex", "age"), sep = 1)
    who5
    
    
    tidyr::who %>%
      gather(key, value, new_sp_m014:newrel_f65, na.rm = T) %>%
      mutate(key = str_replace(key, "newrel", "new_rel")) %>%
      separate(key, c("new", "var", "sexage")) %>%
      select(-new, -iso2, -iso3) %>%
      separate(sexage, c("sex", "age"), sep = 1)
    
    # INPUT / OUTPUT ----------------------------------------------------------
    
    library(tidyverse)
    ggplot(diamonds, aes(carat, price)) + 
      geom_hex()
    ggsave("diamonds.pdf")
    
    plot1 <- ggplot(diamonds, aes(clarity, price)) + 
      geom_boxplot()
    ggsave("diamonds.png", plot = plot1)
    
    write_csv(diamonds, "diamonds.csv")
    
    read_csv("diamonds.csv")
    read_csv(
      "a, b, c
      1, 2, 3
      4, 5, 6"
    )
    
    read_csv("The first line of metadata
      The second line of metadata
      x,y,z
      1,2,3", skip = 2)
    
    read_csv("# A comment I want to skip
      x,y,z
      1,2,3", comment = "#")
    
    read_csv("1,2,3\n4,5,6", col_names = FALSE)
    read_csv("1,2,3\n4,5,6")
    
    read_csv("a,b,c\n1,2,.", na = ".")
    
    write_rds(iris, "iris.rds")
    read_rds("iris.rds")
    
    # save(file, file = "name.RData)
    # load("name.Rdata")
    
    # STRINGR & REGULAR EXPRESSION --------------------------------------------
    
    #strings
    library(tidyverse)
    
    string1 <- "this is a string"
    string2 <- 'if i want to include a "quote" inside a string, i use single quotes'
    x <- c("\"", "\\")
    x
    writeLines(x)
    str_length(c("a", "r for data science", NA))
    
    str_c("x", "y", sep = ", ")
    
    x <- c("abc", NA)
    x
    str_c("|-", x, "-|")
    str_c("|-", str_replace_na(x), "-|")
    str_c("prefix-", c("a", "b", "c"), "-suffix")
    
    name <- "Hadley"
    time_of_day <- "morning"
    birthday <- F
    
    str_c("good ", time_of_day, " ", name, if (birthday) "   and happy birthday",
          ".")
    str_c(c("x", "y", "z"), collapse = ", ")
    
    #subsetting strings
    x <- c("Apple", "Banana", "Pear")
    str_sub(x, 1, 3)
    str_sub(x, -3, -1)
    str_sub("a", 1, 5)
    str_sub(x, -3, -1) <- str_to_upper(str_sub(x, -3, -1))
    x
    
    #regular expression
    x <- c("apple", "banana", "pear")
    str_view(x, "an")
    
    str_view(x, ".a.")
    str_view(c("abc", "a.c", "bef"), "a\\.c")
    x <- "a\\b"
    writeLines(x)
    str_view(x, "\\\\")
    
    x <- c("apple", "banana", "pear")
    str_view(x, "^a")
    str_view(x, "a$")
    
    x <- c("apple pie", "apple", "apple cake")
    str_view(x, "apple")
    str_view(x, "^apple$")
    
    #For example, I’ll search for \bsum\b to 
    #avoid matching summarise, summary, rowsum and so on.
    
    # \\d: matches any digit.
    # \\s: matches any whitespace (e.g. space, tab, newline).
    # [abc]: matches a, b, or c.
    # [^abc]: matches anything except a, b, or c.
    
    str_view(c("abc", "a.c", "a*c", "a c"), "a[.]c")
    str_view(c("abc", "a.c", "a*c", "a c"), "a[ ]c")
    str_view(c("abc", "a.c", "a*c", "a c"), "a[b ]c")
    # This works for most (but not all) 
    # regex metacharacters: $ . | ? * + ( ) [ {. 
    # Unfortunately, a few characters have special meaning 
    # even inside a character class and 
    # must be handled with backslash escapes: ] \ ^ and -
    
    str_view(c("grey", "gray"), "gr(e|a)y")
    str_view(c("grey", "gray"), "gr[ea]y")
    str_view(c("grey", "gray", "ray", "r"), ".*r.*")
    
    # ?: 0 or 1
    # +: 1 or more
    # *: 0 or more
    # precedence of these operators is high
    
    x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
    str_view(x, "CC?")
    str_view(x, "CC+")
    str_view(x, "C[LX]+")
    
    str_view(x, "C{2}")
    str_view(x, "C{2,}")
    str_view(x, "C{2,3}")
    str_view(x, "C{2,3}?")
    str_view(x, "C[LX]+?")
    
    str_view(fruit, "(..)\\1", match = T)
    
    
    x <- c("apple", "banana", "pear")
    str_detect(x, "e")
    
    sum(str_detect(words, "^t"))
    mean(str_detect(words, "[aeiou]$"))
    
    
    no_vowels_1 <- !str_detect(words, "[aeiou]")
    no_vowels_2 <- str_detect(words, "^[^aeiou]+$")
    identical(no_vowels_1, no_vowels_2)
    
    words[str_detect(words, "x$")]
    str_subset(words, "x$")
    
    df <- tibble(
      word = words,
      i = seq_along(word)
    )
    df %>%
      filter(str_detect(word, "x$"))
    
    x <- c("apple", "banana", "pear")
    str_count(x, "a")
    mean(str_count(x, "a"))
    
    df %>%
      mutate(
        vowels = str_count(word, "[aeiou]"),
        consonants = str_count(word, "[^aeiou]")
      )
    
    str_count("abababa", "aba")
    str_view_all("abababa", "aba")
    
    length(sentences)
    head(sentences)
    colors <- c("red", "orange", "yellow", "green", "blue", "purple")
    color_match <- str_c(colors, collapse = "|")
    color_match
    has_color <- str_subset(sentences, color_match)
    matchs <- str_extract(has_color, color_match)
    
    more <- sentences[str_count(sentences, color_match) > 1] 
    more
    str_extract(more, color_match)
    str_extract_all(more, color_match, simplify = T)
    str_extract_all(more, color_match, simplify = F)
    x <- c("a", "a b", "abc")
    str_extract_all(x, "[a-z]", simplify = T)
    noun <- "(a|the) ([^ ]+)"
    has_noun <- sentences %>%
      str_subset(noun) %>%
      head(10)
    has_noun %>%
      str_extract(noun)
    has_noun %>%
      str_match(noun)
    
    # FACTORS -----------------------------------------------------------------
    
    library(forcats)
    x1 <- c("Dec", "Apr", "Jan", "Mar")
    sort(x1)
    month_levels <- c(
      "Jan", "Feb", "Mar", "Apr", "May", "Jun", 
      "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
    )
    y1 <- factor(x1, levels = month_levels)
    sort(y1)
    y1
    x2 <- c("Dec", "Apr", "Jam", "Mar")
    y2 <- factor(x2, levels = month_levels) # elements not in levels will be replaced with NA
    y2
    sort(y2)
    levels(y2)
    factor(x1)
    factor(x1, levels = unique(x1)) # waht is this mean?
    
    gss_cat %>%
      count(race)
    
    gss_cat %>%
      ggplot(aes(race)) + 
      geom_bar()
    
    gss_cat %>%
      ggplot(aes(race)) + 
      geom_bar() +
      scale_x_discrete(drop = F)
    
    relig_summary <- gss_cat %>%
      group_by(relig) %>%
        summarise(
          age = mean(age, na.rm = T),
          tvhours = mean(tvhours, na.rm = T),
          n = n()
        ) 
    
    ggplot(relig_summary, aes(tvhours, relig)) +
      geom_point()
    
    ggplot(relig_summary, aes(tvhours, fct_reorder(relig, tvhours))) + 
      geom_point()
    
    relig_summary %>%
      mutate(relig = fct_reorder(relig, tvhours)) # what is this mean?
    
    rincome_summary <- gss_cat %>%
      group_by(rincome) %>%
      summarise(
        age = mean(age, na.rm = T), 
        tvhours = mean(tvhours, ra.rm = T),
        n = n()
      )
    ggplot(rincome_summary, aes(age, rincome)) + geom_point()
    ggplot(rincome_summary, aes(age, fct_relevel(rincome, "Not applicable"))) +  # what is this mean?
      geom_point()
    
    
    
    # FUNCTION ----------------------------------------------------------------
    library(tidyverse)
    library(magrittr)
    
    df <- tibble(
      a = rnorm(10),
      b = rnorm(10),
      c = rnorm(10), 
      d = rnorm(10)
    )
    
    x <- df$a
    (x - min(x, no.rm = T)) / (max(x, na.rm = T) - min(x, na.rm = T))
    rng <- range(x, na.rm = T)
    (x - rng[1]) / (rng[2] - rng[1])
    
    rescale01 <- function(x) {   # rescale a vector, use for-loop if you want rescale a dataframe
      rng <- range(x, na.rm = T)
      (x - rng[1]) / (rng[2] - rng[1])
    }
    
    rescale01(c(0, 5, 10))
    rescale01(c(-10, 0, 10))
    rescale01(c(1, 2, 3, NA, 5))
    
    has_name <- function(x) {
      nms <- names(x)
      if (is.null(nms)) {
        rep(FALSE, length(x))
      } else {
        !is.na(nms) & nms != ""
      }
    }
    
    # if(length(x) == 0 || lenght(y) == 0), note that "OR AND" in *condition*
    # shou be written as "|| &&"
    
    f <- function(x, y, op) {
      switch(op, 
             plus = x + y,
             minus = x - y,
             times = x * y,
             dvide = x / y,
             stop("Unknown op")
      )
    }
    
    f(1, 2, "plus")
    f(2, 2, "minus")
    y <- 10
    x <- if (y < 20) "Too low" else "Too high"
    x
    
    mean_ci <- function(x, conf = .95) {
      se <- sd(x) / sqrt(length(x))
      alpha = 1 - conf
      mean(x) + se * qnorm(c(alpha / 2, 1 - alpha / 2))
    }
    
    x <- runif(100)
    mean_ci(x)
    mean_ci(x, .99)
    
    wt_mean <- function(x, w) {
      if (length(x) != length(w)) {
        stop("'x' and 'w' must be the same length", call. = F) # call. = F what mean?
      }
      sum(w * x) / sum(x)
    }
    
    wt_mean <- function(x, w, na.rm = F) { # how robust the function you want?
      if (!is.logical(na.rm)) {
        stop("'na.rm' must be logical")
      }
      if (length(na.rm) != 1) {
        stop("'na.rm' must be length 1")
      }
      if(length(x) != length(w)) {
        stop("'x' and 'w' must be the same length", call. = F)
      }
      
      if(na.rm) {         # impressive   if (condition) {}    nothing happened if (condition = F)
        miss <- is.na(x) | is.na(w)
        w <- w[!miss]
        x <- x[!miss]
      }
      sum(x * w) / sum(w)
    }
    
    # a shorthand of preview one, but no message to show if there are errors
    wt_mean <- function(x, w, na.rm = F) { 
      stopifnot(is.logical(na.rm), length(na.rm) == 1)
      stopifnot(length(x) == length(w))  # stop if not (TURE) 
      
      if(na.rm) {
        miss <- is.na(x) | is.na(w)
        w <- w[!miss]
        x <- x[!miss]
      }
      sum(x * w) / sum(w)
    }
    
    commas <- function(...) stringr::str_c(..., collapse = ", ")
    commas(letters[1:10])
    
    rule <- function(..., pad = "-") {
      title <- paste0(...)
      width = getOption("width") - nchar(title) - 5
      cat(title, " ", stringr::str_dup(pad, width), "\n", sep = "")
    }
    
    x <- c(1, 2)
    sum(x, na.mr = T) 
    sum(x, na.mr = F) # be careful, some function treat T as 1 and F as 0
    
    library(tidyverse)
    typeof(letters)
    typeof(1:10)
    typeof(1)
    typeof(1L)
    
    x <- sqrt(2) ^ 2
    x
    x - 2
    x == 2
    
    c(-1, 0, 1) / 0
    
    x <- sample(20, 100, replace = T)
    sum(x > 10)
    mean(x > 10)
    
    x <- c(10, 3, NA, 5, 8, 1, NA)
    x[!is.na(x)]
    x[x %% 2 == 0]
    x[x %% 2 == 0 & !is.na(x)]
    
    y <- c(x = 1, y = 2, z = 3)
    purrr::set_names(1:3, c("a", "b", "c"))
    y["x"]
    y[c("x", "y")]
    
    x <- list(1, 2, 3)
    x
    str(x)
    x_named <- list(a = 1, b = 2, c = 3)
    str(x_named)
    y <- list("a", 1L, 1.5, T)
    str(y)
    z <- list(list(1, 2), list(3, 4))
    str(z)
    
    a <- list(a = 1:3, b = "a string", c = pi, d = list(-1, -5))
    str(a)
    str(a[1:2])
    str(a[[1]])
    str(a[[4]])
    a[[4]][1]
    a[[4]][[1]]
    
    df <- tibble(
      a = rnorm(10),
      b = rnorm(10), 
      c = rnorm(10),
      d = rnorm(10)
    )
    
    output <- vector("double", ncol(df))
    for (i in seq_along(df)) {
      output[[i]] <- median(df[[i]])
    }
    output
    
    
    # 1.The output: output <- vector("double", length(x)). Before you start the loop, you must always allocate sufficient space for the output. This is very important for efficiency: if you grow the for loop at each iteration using c() (for example), your for loop will be very slow.
    # 
    # A general way of creating an empty vector of given length is the vector() function. It has two arguments: the type of the vector (“logical”, “integer”, “double”, “character”, etc) and the length of the vector.
    # 
    # 2.The sequence: i in seq_along(df). This determines what to loop over: each run of the for loop will assign i to a different value from seq_along(df). It’s useful to think of i as a pronoun, like “it”.
    # 
    # You might not have seen seq_along() before. It’s a safe version of the familiar 1:length(l), with an important difference: if you have a zero-length vector, seq_along() does the right thing:
    #   
    #   y <- vector("double", 0)
    # seq_along(y)
    # #> integer(0)
    # 1:length(y)
    # #> [1] 1 0
    # 
    # You probably won’t create a zero-length vector deliberately, but it’s easy to create them accidentally. If you use 1:length(x) instead of seq_along(x), you’re likely to get a confusing error message.
    # 
    # 3.The body: output[[i]] <- median(df[[i]]). This is the code that does the work. It’s run repeatedly, each time with a different value for i. The first iteration will run output[[1]] <- median(df[[1]]), the second will run output[[2]] <- median(df[[2]]), and so on.
    
    rescale01 <- function(x) {
      rng <- range(x, na.rm = T)
      (x - rng[1]) / (rng[2] - rng[1])
    }
    
    df$a <- rescale01(df$a)
    df$b <- rescale01(df$b)
    df$c <- rescale01(df$c)
    df$d <- rescale01(df$d)
    
    for (i in seq_along(df)) {
      df[[i]] <- rescale01(df[[i]])
    }
    
    # for (x in xs)
    #   
    #   for (nm in names(xs))
    #     x[[nm]]
    
    results <- vector("list", length(iris))
    names(results) <- names(iris)
    
    # “logical”, “integer”, “double”, “character”
    
    for (i in seq_along(iris)) {  # what is mean?
      names <- names(iris)[[i]]
      value <- iris[[i]]
    }
    
    means <- c(0, 1, 2)
    output <- double()
    for (i in seq_along(means)) {
      n <- sample(100, 1)
      output <- c(output, rnorm(n, means[[i]]))
    }
    str(output)
    
    #save time and space
    out <- vector("list", length(means))
    for (i in seq_along(means)) {
      n <- sample(100, 1)
      out[[i]] <- rnorm(n, means[[i]])
    }
    str(out)
    str(unlist(out))
    
    #You might be generating a long string. Instead of paste()ing together each iteration with the previous, save the output in a character vector and then combine that vector into a single string with paste(output, collapse = "").
    #You might be generating a big data frame. Instead of sequentially rbind()ing in each iteration, save the output in a list, then use dplyr::bind_rows(output) to combine the output into a single data frame.
    
    # for (i in seq_along(x)) {
    #   # body
    # }
    # 
    # # eauivalent to 
    # i <- 1
    # while (i <= length(x)) {
    # # body  
    # i <- i + 1
    # }
    
    flip <- function() sample(c("T", "H"), 1)
    flips <- 0
    nheads <- 0
    
    while (nheads < 3) {
      if (flip() == "H") {
        nheads = nheads + 1
      } else {
        nheads <- 0
      }
      flips <- flips + 1
    }
    
    # files <- dir("data/", pattern = "\\.csv$", full.names = T)
    
    # created by Rujiang Liang, it is not perfect
    # show_mean <- function(df) {
    #   vector <- logical()
    #   for (i in 1:ncol(df)) {
    #     vector[i] <- is.numeric(df[[i]])  
    #   }
    #   df_reduced <- df[, vector]
    #   out_name <- names(df_reduced)
    #   out_value <- double()
    #   for (j in 1:ncol(df_reduced)) {
    #     out_value[j] <- mean(df_reduced[[j]])
    #   }
    #   as.matrix(c(out_name, out_value), ncol = 2)
    # }
    
    df <- tibble(
      a = rnorm(10),
      b = rnorm(10), 
      c = rnorm(10), 
      d = rnorm(10)
    )
    
    output <- vector("double", length(df))
    for (i in seq_along(df)) {
      output[i] <- mean(df[[i]])
    }
    output
    
    col_mean <- function(df) {
      output <- vector("double", length(df))
      for (i in seq_along(df)){
        output[i] <- mean(df[[i]])
      }
      output
    }
    
    col_summary <- function(df, fun) {
      out <- vector("double", length(df))
      for (i in seq_along(df)) {
        out[i] <- fun(df[[i]])
      }
      out
    }
    
    col_summary(df, mean)
    col_summary(df, sd)
    
    map_dbl(df, mean)
    map_dbl(df, median)
    map_dbl(df, sd)
    map_dbl(iris[-5], function(x) length(unique(x)))
    map_dbl(iris[-5], ~length(unique(.)))
    
    # map() makes a list.
    # map_lgl() makes a logical vector.
    # map_int() makes an integer vector.
    # map_dbl() makes a double vector.
    # map_chr() makes a character vector.
    
    df %>% map_dbl(mean)
    
    map_dbl(df, mean, trim = .5)
    
    z <- list(x = 1:3, y = 4:5)
    map_int(z, length)
    
    models <- mtcars %>%
      split(.$cyl) %>%
      map(function(df) lm(mpg ~ wt, data = df)) 
    
    models <- mtcars %>%
      split(.$cyl) %>%
      map(~lm(mpg ~ wt, data = .))
    
    models %>%
      map(summary) %>%
      map_dbl(~.$r.squared)
    
    models %>%
      map(summary) %>%
      map(~.$coefficients)
    
    models %>%
      map(summary) %>%
      map(~.$coefficients[2, 1])
    
    
    models %>%
      map(summary) %>%
      map(~.$coefficients["wt", ]) 
    
    models %>%
      map(summary) %>%
      map_dbl("r.squared")
    
    models %>%
      map(summary) %>%
      map("coefficients")
    
    mtcars %>%
      split(.$cyl) %>%
      map(~lm(mpg ~ wt, .)) %>%
      map(summary) %>%
      map("coefficients") %>%
      map(~.["wt", c("Estimate", "t value")]) %>%
      reduce(rbind)
    
    x <- list(list(1, 2, 3), list(4, 5, 6), list(7, 8, 9))
    x %>% map_dbl(2)
    
    threshold <- function(x, cutoff = .8) x[x > cutoff]
    
    safe_log <- safely(log)
    str(safe_log(10))
    str(safe_log("a"))
    x <- list(1, 10, "a")
    y <- x %>% map(safely(log))
    str(y)
    y <- y %>% transpose()
    str(y)
    is_ok <- y$error %>% map_lgl(is_null)
    x[!is_ok]
    y$result[is_ok] %>% flatten_dbl()
    
    x %>% map_dbl(possibly(log, NA))
    x <- list(1, -1)
    x %>% map(quietly(log)) %>% str()
    
    mu <- tibble(`5` = 5, `10` = 10, `-3` = -3)
    mu %>%
      map_df(rnorm, n = 5)
    
    mu <- list(`5` = 5, `10` = 10, `-3` = -3)
    mu %>% 
      map(rnorm, n = 5)  
    
    mu %>%
      map_df(rnorm, n = 5)
    
    sigma <- list(1, 5, 10)
    seq_along(mu) %>%
      map(~rnorm(5, mu[[.]], sigma[[.]])) %>%
      str()
    
    map2(mu, sigma, rnorm, n = 5) %>% str()
    
    n <- list(1, 3, 5)
    arg1 <- list(n, mu, sigma)
    arg1 %>%
      pmap(rnorm) %>%
      str()
    # better to name the arguments
    arg2 <- list(mean = mu, sd = sigma, n = n)
    arg2 %>%
      pmap(rnorm) %>%
      str()
    
    # a data.frame ensures name and some lenght of each column
    params <- tibble( 
      mean = c(5, 10, -3),
      sd = c(1, 5, 10),
      n = c(1, 3, 5)
    )
    params %>%
      pmap(rnorm)
    
    sim <- tribble(
      ~f,      ~params,
      "runif", list(min = -1, max = 1),
      "rnorm", list(sd = 5),
      "rpois", list(lambda = 10)
    )
    sim %>%
      mutate(sim = invoke_map(f, params, n = 10))
    
    plots <- mtcars %>%
      # group_by(cyl) %>% do not work
      split(.$cyl) %>%
      map(~ggplot(., aes(mpg, wt)) + geom_point())
    paths <- str_c(names(plots), ".pdf")
    pwalk(list(paths, plots), ggsave)
    
    iris %>%
      keep(is.factor) %>%
      str()
    iris %>%
      keep(is.numeric) %>%
      str()
    iris %>%
      discard(is.factor) %>%
      str()
    
    x <- list(1:5, letters, list(10))
    x %>%
      some(is_character)
    x %>%
      every(is_vector)
    
    #detect() finds the first element where the predicate is true; 
    #detect_index() returns its position.
    x <- sample(10)
    x %>%
      detect(~ . > 5)
    x %>%
      detect_index(~ . > 5)
    
    #head_while() and tail_while() 
    #take elements from the start or end of a vector 
    #while a predicate is true
    x %>%
      head_while(~ . > 5)
    x %>%
      tail_while(~ . > 5)
    
    dfs <- list(
      age = tibble(name = "John", age = 30),
      sex = tibble(name = c("John", "Marry"), sex = c("m", "f")),
      trt = tibble(name = "Marry", treatment = "a")
    )
    dfs %>% reduce(full_join)
    
    dfs <- list(
      c(1, 3, 5, 6, 10),
      c(1, 2, 3, 7, 8, 10),
      c(1, 2, 3, 4, 8, 9, 10)
    )
    dfs %>%
      reduce(intersect)
    #Accumulate keeps all the interim results.
    x <- sample(10)
    x %>%
      accumulate(`+`)
    
    # MODEL -------------------------------------------------------------------
    
    library(tidyverse)
    library(modelr)
    options(na.action = na.warn)
    
    ggplot(sim1, aes(x, y)) + 
      geom_point()
    
    models <- tibble(
      a1 = runif(250, -20, 40),
      a2 = runif(250, -5, 5)
    )
    
    ggplot(sim1, aes(x, y)) + 
      geom_abline(aes(intercept = a1, slope = a2), 
                  data = models, alpha = 1/4) + 
      geom_point()
    
    model1 <- function(a, data) {
      a[1] + data$x * a[2]
    }
    
    model1(c(7, 1.5), sim1)
    
    measure_distance <- function(mod, data) {
      diff <- data$y - model1(mod, data)
      sqrt(mean(diff ^ 2))
    }
    
    measure_distance(c(7, 1.5), sim1)
    
    sim1_dist <- function(a1, a2) {
      measure_distance(c(a1, a2), sim1)
    }
    
    models <- models %>%
      mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))
    
    models
    
    ggplot(sim1, aes(x, y)) + 
      geom_point(size = 2, color = "grey30") + 
      geom_abline(
        aes(intercept = a1, slope = a2, color = -dist), 
        data = filter(models, rank(dist) <= 10)
      )
    
    ggplot(models, aes(a1, a2)) + 
      geom_point(data = filter(models, rank(dist) <= 10), 
                 size = 4, color = "red") + 
      geom_point(aes(color = -dist))
    
    grid <- expand.grid(
      a1 = seq(-5, 20, length = 25),
      a2 = seq(1, 3, length = 25)
    ) %>%
      mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))
    
    grid %>%
      ggplot(aes(a1, a2))  + 
      geom_point(data = filter(grid, rank(dist) <= 10), 
                 size = 4, color = "red") + 
      geom_point(aes(color = -dist))
    
    ggplot(sim1, aes(x, y)) + 
      geom_point(size = 2, color = "grey30") + 
      geom_abline(
        aes(intercept = a1, slope = a2, color = -dist),
        data = filter(grid, rank(dist) <= 10)
      )
    
    best <- optim(c(0, 0), measure_distance, data = sim1)
    best$par
    
    ggplot(sim1, aes(x, y)) + 
      geom_point(size = 2, color = "grey30") + 
      geom_abline(intercept = best$par[1], slope = best$par[2])
    
    sim1_mod <- lm(y ~ x, data = sim1)
    coef(sim1_mod)
    
    ggplot(sim1, aes(x, y)) + 
      geom_point(size = 2, color = "grey30") + 
      geom_abline(intercept = best$par[1], slope = best$par[2],
                  size = 3, alpha = 1 / 4) +
      geom_abline(intercept = coef(sim1_mod)[1], 
                  slope = coef(sim1_mod)[2],
                  color = "red")
    
    # bookmark ----------------------------------------------------------------
    
    grid <- sim1 %>%
      data_grid(x)
    
    grid
    
    grid <- grid %>%
      add_predictions(sim1_mod)
    
    grid
    
    ggplot(sim1, aes(x)) + 
      geom_point(aes(y = y)) + 
      geom_line(aes(y = pred), data = grid, color = "red", size = 1)
    
    sim1 <- sim1 %>%
      add_residuals(sim1_mod)
    sim1
    
    ggplot(sim1, aes(resid)) + 
      geom_freqpoly(binwidth = .5)
    
    ggplot(sim1, aes(x, resid)) + 
      geom_ref_line(h = 0) + 
      geom_point()
    
    df <- tibble(
      y = c(4, 5), 
      x1 = c(2, 1), 
      x2 = c(5, 6)
    )
    
    model_matrix(df, y ~ x1)
    
    ggplot(sim2) + 
      geom_point(aes(x, y))
    
    mod2 <- lm(y ~ x, data = sim2)
    grid <- sim2 %>%
      data_grid(x) %>%
      add_predictions(mod2)
    grid  
    
    ggplot(sim2, aes(x)) + 
      geom_point(aes(y = y)) +
      geom_point(data = grid, aes(y = pred), color = "red", size = 4)
    
    ggplot(sim3, aes(x1, y)) +
      geom_point(aes(color = x2))
    
    mod1 <- lm(y ~ x1 + x2, data = sim3)
    mod2 <- lm(y ~ x1 * x2, data = sim3)
    
    grid <- sim3 %>%
      data_grid(x1, x2) %>%
      gather_predictions(mod1, mod2)
    grid
    
    ggplot(sim3, aes(x1, y, color = x2)) + 
      geom_point() + 
      geom_line(data = grid, aes(y = pred)) +
      facet_wrap(~ model)
    
    sim3 <- sim3 %>%  
      gather_residuals(mod1, mod2) 
    
    ggplot(sim3, aes(x1, resid, color = x2)) +
      geom_point() + 
      facet_grid(model ~ x2)
    
    mod1 <- lm(y ~ x1 + x2, data = sim4)
    mod2 <- lm(y ~ x1 * x2, data = sim4)
    
    grid <- sim4 %>%
      data_grid(
        x1 = seq_range(x1, 5),
        x2 = seq_range(x2, 5)
      ) %>%
      gather_predictions(mod1, mod2)
    grid
    
    ggplot(grid, aes(x1, x2)) + 
      geom_tile(aes(fill = pred)) + 
      facet_wrap(~ model)
    
    ggplot(grid, aes(x1, pred, color = x2, group = x2)) + 
      geom_line() + 
      facet_wrap(~ model)
    ggplot(grid, aes(x2, pred, color = x1, group = x1)) + 
      geom_line() + 
      facet_wrap(~ model)
    
    library(splines)
    
    sim5 <- tibble(
      x = seq(0, 3.5 * pi, length = 50), 
      y = 4 * sin(x) + rnorm(length(x))
    )
    
    ggplot(sim5, aes(x, y)) + 
      geom_point()
    
    mod1 <- lm(y ~ ns(x, 1), data = sim5)
    mod2 <- lm(y ~ ns(x, 2), data = sim5)
    mod3 <- lm(y ~ ns(x, 3), data = sim5)
    mod4 <- lm(y ~ ns(x, 4), data = sim5)
    mod5 <- lm(y ~ ns(x, 5), data = sim5)
    mod6 <- lm(y ~ ns(x, 6), data = sim5)
    mod7 <- lm(y ~ ns(x, 7), data = sim5)
    
    grid <- sim5 %>%
      data_grid(x = seq_range(x, n = 50, expand = .1)) %>%
      gather_predictions(mod1, mod2, mod3, mod4, mod5, mod6, mod7, .pred = "y")
    
    ggplot(sim5, aes(x, y)) + 
      geom_point() + 
      geom_line(data = grid, color = "red") + 
      facet_wrap(~ model)
    
    nobs(mod1)
    
    
    library(tidyverse)
    library(modelr)
    options(na.action = na.warn)
    library(nycflights13)
    library(lubridate)
    
    ggplot(diamonds, aes(cut, price)) + geom_boxplot()
    ggplot(diamonds, aes(color, price)) + geom_boxplot()
    ggplot(diamonds, aes(clarity, price)) + geom_boxplot()
    
    ggplot(diamonds, aes(carat, price)) + geom_hex(bins = 100)
    
    diamonds2 <- diamonds %>%
      filter(carat <= 2.5) %>%
      mutate(lprice = log2(price), lcarat = log2(carat))
    
    ggplot(diamonds2, aes(lcarat, lprice)) + 
      geom_hex(bins = 50)
    
    mod_diamond <- lm(lprice ~ lcarat, data = diamonds2)
    
    grid <- diamonds2 %>%
      data_grid(carat = seq_range(carat, 20)) %>%
      mutate(lcarat = log2(carat)) %>%
      add_predictions(mod_diamond, "lprice") %>%
      mutate(price = 2 ^ lprice)
    grid
    
    ggplot(diamonds2, aes(carat, price)) +
      geom_point(alpha = .1) + 
      geom_line(data = grid, color = "red", size = 1)
    
    diamonds2 <- diamonds2 %>%
      add_residuals(mod_diamond, "lresid")
    
    ggplot(diamonds2, aes(lcarat, lresid)) + 
      geom_hex(bins = 50)
    
    ggplot(diamonds2, aes(cut, lresid)) + geom_boxplot()
    ggplot(diamonds2, aes(color, lresid)) + geom_boxplot()
    ggplot(diamonds2, aes(clarity, lresid)) + geom_boxplot()
    
    mod_diamond2 <- lm(lprice ~ lcarat + color + cut + clarity, data = diamonds2)
    
    grid <- diamonds2 %>%
      data_grid(cut, .model = mod_diamond2) %>%
      add_predictions(mod_diamond2)
    grid
    
    ggplot(grid, aes(cut, pred)) + 
      geom_point()
    diamonds2 <- diamonds2 %>%
      add_residuals(mod_diamond2, "lresid2")
    ggplot(diamonds2, aes(lcarat, lresid2)) +
      geom_hex(bins = 50)
    
    diamonds2 %>%
      filter(abs(lresid2) > 1) %>%
      add_predictions(mod_diamond2) %>%
      mutate(pred = round(2 ^ pred)) %>%
      select(price, pred, carat:table, x:z) %>%
      arrange(price)
    daily <- flights %>%
      mutate(date = make_date(year, month, day)) %>%
      group_by(date) %>%
      summarise(n = n())
    daily
    
    ggplot(daily, aes(date, n)) + 
      geom_line()
    
    daily <- daily %>%
      mutate(wday = wday(date, label = T))
    ggplot(daily, aes(wday, n)) + 
      geom_boxplot()
    
    mod <- lm(n ~ wday, data = daily)  
    grid <- daily %>%
      data_grid(wday) %>%
      add_predictions(mod, "n")
    
    ggplot(daily, aes(wday, n)) + 
      geom_boxplot() + 
      geom_point(data = grid, color = "red", size = 4)
    
    daily <- daily %>%
      add_residuals(mod)
    daily %>%
      ggplot(aes(date, resid)) + 
      geom_ref_line(h = 0) + 
      geom_line()
    
    ggplot(daily, aes(date, resid, color = wday)) + 
      geom_ref_line(h = 0) + 
      geom_line()
    
    daily %>%
      filter(resid < -100)
    
    daily %>%
      ggplot(aes(date, resid)) + 
      geom_ref_line(h = 0) + 
      geom_line(color = "grey50") + 
      geom_smooth(se = F, span = .3)
    
    daily %>%
      filter(wday == "周六") %>% 
      ggplot(aes(date, n)) + 
      geom_point() + 
      geom_line() + 
      scale_x_date(NULL, date_breaks = "1 month", date_labels = "%b") + 
      theme_bw()
      
    term <- function(date) {
      cut(date,
          breaks = ymd(20130101, 20130605, 20130825, 20140101),
          labels = c("spring", "summer", "fall"))
    } 
    
    daily <- daily %>%
      mutate(term = term(date))
    
    daily %>%
      filter(wday == "周六") %>%
      ggplot(aes(date, n, color = term)) + 
      geom_point(alpha = 1/3) + 
      geom_line() + 
      scale_x_date(NULL, date_breaks = "1 month", date_labels = "%b")
    
    daily %>%
      ggplot(aes(wday, n, color = term)) + 
      geom_boxplot()
    
    mod1 <- lm(n ~ wday, data = daily)
    mod2 <- lm(n ~ wday * term, data = daily)
    
    daily %>%
      gather_residuals(without_term = mod1, with_term = mod2) %>%
      ggplot(aes(date, resid, color = model)) + 
      geom_line(alpha = .75)
    
    grid <- daily %>%
      data_grid(wday, term) %>%
      add_predictions(mod2, "n")
    
    ggplot(daily, aes(wday, n)) + 
      geom_boxplot() + 
      geom_point(data = grid, color = "red") + 
      facet_wrap(~ term)
    
    mod3 <- MASS::rlm(n ~ wday * term, data = daily)
    
    daily %>%
      add_residuals(mod3, "resid") %>%
      ggplot(aes(date, resid)) + 
      geom_hline(yintercept = 0, color = "grey75") + 
      geom_line()
    
    #chapter 25
    #chapter 27
    
    # R BASE ------------------------------------------------------------------
    
    test <- 1:3
    test[1] #ok
    test[[1]] #ok
    # object[index]  "index" can be a vector of numeric 
    # or a vector of character if nameed
    df <- data.frame(a = 1:5, b = 1:5, c = 1:5)
    df
    df$c(a, b) # not appliable
    df$c("a", "b") # not appliable
    df[c(a,b)] # not applyable
    
    df[c("a","b")] #  ok
    
    my_index <- "a"
    df[my_index]
    
    df[!names(df) %in% c("a", "b")] # ok
    
    x <- c("one", "two", "three")
    x[c(1, -1)] # it does not work, do not mix negative and positive sub_number
    x[rep(1, 10)]
    x[0]
    
    x <- c(10, 3, NA, 5, 8, 1, NA)
    x[x %% 2 == 0]
    x[x %% 2 == 1]
    
    attach(mtcars)
    summary(mpg)
    plot(mpg, disp)
    plot(mpg, wt)
    detach(mtcars)
    
    data$gender <- factor(data$gender,
                          levels = c(1, 2),
                          labels = c("male", "female"))
    
    data$age[data$age == 99] <- NA
    data <- within(data, {
      agecat <- NA
      agecat[age > 75] <- "Elder" 
      agecat[age <= 75] <- "Young"
    })
    
    data <- transform(data,
                      sumx = x1 + x2,
                      meanx = (x1 + x2) / 2)
    
    library(car)
    x <- 10:100
    df <- data.frame(x = x)
    df
    df$y <- recode(df$x, "lo:60 = 'C'; 61:80 = 'B'; 81:100 = 'A'")
    df$z <- recode(df$x, "lo:60 = 'C'; 60:80 = 'B'; 81:hi='A'; else='NULL'")
    df
    
    help(recode) 
    
    leadership <- iris
    names(leadership)[2] <- "something_you_like"
    names(leadership)
    names(leadership)[2:4] <- c("something", "you", "like")
    names(leadership)
    names(leadership)[[2]] <- "something_you_like"
    names(leadership)
    
    order(4:6)
    (3:1)[order(3:1)]
    leadership[order(leadership$Sepal.Length),]
    leadership[order(
      c(leadership$Sepal.Length, leadership$something_you_like)
      ),]
    # is.na() can be applied to a dataframe, this is awesome
    # is.infinite()  in.nan()
    # na.omit() drops all the obs(rows) that contain NA
    # something == NA do not work
    # any(c(T, T, F))   all(c(T, T, F))
    
    sum(c(1, 2, NA))
    
    # cbind(a, b) is a relatively safe function,
    # to make it work, dataframe a and b must have same variable,
    # but do not have to be arranged to same order
    
    # subset
    df[6:10, ]
    
    myvars <- c("q1", "q2", "q3", "q4", "q5")
    df[myvars]
    
    myvars <- paste("q", 1:5, sep = "")
    myvars
    df[myvars]
    
    myvars <- names(df) %in% c("q3", "q4")
    df[!myvars]
    
    head(df)
    df$Sepal.Length <- NULL
    df$Sepal.Width <- df$Petal.Length <- NULL # interesting
    
    leadership[leadership$male == 1, ]
    
    df[1, "Species"] 
    df[1, 2]
    
    subset(leadership, age >= 35 | age < 24,
           select = c(q1, q2, q3, q4, q5)) # select = q1:q5
    
    sample(1:10, 3, replace = F)
    
    mean(x, trim = .05, na.rm = T) # drop 10% obs and na
    
    ##strings
    x <- c("ab", "cde", "fghij")
    nchar(x)
    length(x)
    substr(x, 1, 1)
    substr(x, 1, 1) <- "something"
    x
    substr(x, 1, 2) <- "something"
    x
    toupper(c("aaa", "bb"))
    tolower(c("someTHING", "hErE"))
    seq(1, 5, 2)
    
    x <- 1:10
    cut(x, 3)
    cut(x, 3, ordered_result = T)
    cut(x, c(0, 3, 6, 9, 10), 
        labels = c("not_so_good", "good", "great", "awesome"),
        ordered_result = T)
    
    pretty(x, 3)
    
    c <- matrix(runif(12), nrow = 3)
    c
    mean(c)
    apply(c, 1, mean)
    apply(c, 2, mean)
    apply(c, 2, mean, trim = .2)
    
    apply(iris, 2, mean) # why does not work?
    apply(iris[-5], 2, mean) # it works
    apply(iris[-5], 1, mean) # it works
    
    ## function
    mystats <- function(x, parametric = T, print = F) {
      if (parametric) {
        center = mean(x)
        spread = sd(x)
      } else {
        center = median(x)
        spread = mad(x)
      } 
      
      if (print & parametric) {
        cat("Mean =", center, "\n", "SD =", spread, "\n")
      } else if (print & !parametric){
          cat("Median =", center, "\n", "SD =", spread, "\n")
      }
      
      results <- list(center = center, spread = spread)
      return(results)
    }
    
    set.seed(1234)
    x <- rnorm(500)
    mystats(x)
    y <- mystats(x, print = T)
    x <- 1:10
    x
    names(x) <- c(letters[1:10])
    x
    attributes(x)
    attr(x, "dim") <- c(2, 5)
    x
    attributes(x)
    row.names(x) <- c("a", "b")
    attributes(x)
    x
    attr(x, "dimnames")[[2]] <- c(LETTERS[1:5])
    x
    attr(x, "names") <- NULL
    x
    attributes(x)
    attr(x, "test") <- "this is a test"
    x
    attr(x, "dim") <- NULL
    x
    
    attributes(iris)
    iris <- unclass(iris)
    str(iris)
    attributes(iris)
    
    data(iris)
    set.seed(1234)
    fit <- kmeans(iris[1:4], 3)
    fit
    str(fit)
    names(fit)
    summary(fit)
    unclass(fit)
    fit[c(2, 7)]
    fit[2]
    fit[[2]]
    fit[["centers"]]
    fit$"centers"
    fit$centers
    fit[[2]][1, 1:3]
    fit[2][1][1][[1]][1, ]
    
    interactive()
    args(mean)
    
    #output to file
    sink("test.txt", append = T)
    sink()
    cat("hello")
    cat("hello", file = "mytxt.txt", append = T)
    #work with dirs
    getwd()
    list.dirs()
    list.files()
    dir()
    dir(path = "./check")
    file.info("./check.readme.txt")
    file.info("communication.html")
    file.exists("./check/readme.txt")
    file.exists("test.txt")
    file_test("-f", "check")
    file_test("-d", "check")
    dir.create("./something")
    file.exists("something")
    file.create("test.txt")
    file.info("test.txt")
    file.rename("test.txt", "something.txt")
    file.info("something.txt")
    file.remove("something.txt")
    file.exists(c("something.txt", "test.txt"))
    system("tree")
    file.rename("something", "something2")
    file.copy("diamonds.csv", "./something2/renamed.html")
    unlink("something2")
    
    readLines("./check/readme.txt", encoding = "UTF-8")
    readLines("temp.R")
    
    # SOME FUNCTION BY ME -----------------------------------------------------
    
    my_sample <- function(x, size = 30, repeats = 100) {
      out <- vector("double", length = repeats)
      for (i in seq_len(repeats)) {
        out[[i]] <- mean(sample(x, size))
      }
      out
    }
    
    sizes <- c(`20` = 20, `30` = 30, `50` = 50, `100` = 100)
    
    sizes %>%
      lapply(my_sample, x = 1:999, repeats = 100) %>%
      as.tibble() %>%
      gather() %>%
      qplot(x = value, data = ., 
            facets = factor(key, levels = sizes) ~ .)
    
    library(tidyverse)
    out %>%
      as.tibble() %>%
      gather() %>%
      ggplot(aes(value)) + 
      geom_histogram(aes(y = ..density..), fill = "blue", alpha = .5) +
      geom_density(color = "red", fill = "pink", alpha = .3) + 
      facet_grid(factor(key, levels = c("size_20", "size_30", "size_50", "size_100"))~.) + 
      geom_vline(aes(xintercept = 500), color = "orange", size = 1.2) + 
      theme_bw()
    
    map_dbl(out, mean)
    
    col_num <- function(df) {
      index <- vector("logical", ncol(df))
      for (i in seq_along(df)) {
        index[[i]] <- is.numeric(df[[i]])
      } 
      df[index]
    }
    
    col_mean <- function(df) {
      results <- vector("double", ncol(df))
      names(results) <- names(df)
      for (i in seq_along(df)) {
        results[[i]] <- mean(df[[i]])  
      }
      results
    }
    
    show_vec_mean <- function(x) {
      for (i in seq_along(x)) {
        cat(names(x)[[i]], ":", "\t", round(x[[i]], 3), "\n")
      }
    }
    
    show_mean <- function(df) {
      show_vec_mean(col_mean(col_num(df)))
    }
    
    data(iris)
    show_mean(iris)
    data(diamonds)
    show_mean(diamonds)
    
    
    chk_outlier <- function(df, min = T, number = 10) {
      out <- vector("list", length = length(df))
      names(out) <- names(df)
      if (min) {
        for (i in seq_along(df)) {
          out[[i]] <- head(sort(df[[i]]), number)
        }
      } else {
        for (i in seq_along(df)) {
          out[[i]] <- rev(tail(sort(df[[i]]), number))
        }
      }
      out
    }
    
    num <- col_num(iris)
    chk_outlier(num, min = F, number = 4)
    
    
    library(tidyverse)
    
    split_chr <- function(df, sep = "#") {
      vec <- df[[1]]
      strlist <- str_split(vec, sep)
      unique_chr <- unique(flatten_chr(strlist))
      col_name <- unique_chr[order(as.numeric(unique_chr))]
      list <- vector("list", length = length(col_name))
      names(list) <- col_name
      for (i in seq_along(col_name)) {
        list[[i]] <- map_lgl(strlist, ~col_name[[i]] %in% .)
      }
      list
    }
    
    list2df <- function(list) {
      as.data.frame(do.call(cbind, list))
    }
    
    lgl2chr <- function(df, chr = c("1", "")) {
      map_df(df, ~ifelse(., chr[1], chr[2]))
    }
    
    split_col <- function(df) {
      lgl2chr(list2df(split_chr(df)), c("bingo",""))
    }
    
    test <- tibble(a = c("1#3#2", "1#4#something", "2#3#@"))
    split_col(test)
    
    keep_na <- function(df, start_num, end_num) {
      #start_num, end_num for cols that must have their value keeped,
      #like ID, name....
      right_df <- df[-(start_num:end_num)]
      na_rows <- apply(right_df, 1, function(x) {any(is.na(x))})
      na_cols <- apply(right_df, 2, function(x) {any(is.na(x))})
      left_df <- df[start_num:end_num][na_rows, ]
      right_df_na_keeped <- right_df[na_rows, na_cols]
      logical <- is.na(right_df_na_keeped)
      chr_na_keeped <- ifelse(logical, "missing", "")
      cbind(left_df, chr_na_keeped)
    }
      
    data(iris)
    names(iris) <- c("a", "b", "c", "d", "e")
    head(iris)
    
    variable <- c("a", "b", "d")
    conds <- list(c(4.5, 7.5),
                  c(2.3, 4),
                  c(.2, 2.4))
    
    chk_rng <- function(vec, cond) {
      between(vec, cond[1], cond[2])
    }
    
    chk_rng(1:10, c(3,7))
    
    map2(iris[variable], conds, chk_rng)
    
    chk_range <- function(df, var, cond) {
      temp_list <- map2(df[var], cond, chk_rng) 
      df_lgl <- list2df(temp_list)
    }
    
    head(chk_range(iris, variable, conds))
    
    keep_FALSE <- function(df, id) {
      right_df <- df[-(id[1]:id[2])]
      na_rows <- apply(right_df, 1, function(x) {any(!x)})
      na_cols <- apply(right_df, 2, function(x) {any(!x)})
      left_df <- df[id[1]:id[2]][na_rows, ]
      right_df_na_keeped <- right_df[na_rows, na_cols]
      cbind(left_df, right_df_na_keeped)
    }
    
    test_data <- data.frame(a = 1:5,
                            b = c(T, F, T, T, T),
                            c = c(T, T, F, T, T))
    keep_FALSE(test_data, id = c(1, 1))
    
    chk_range_id <- function(df, var, cond, id) {
      right_df <- chk_range(df, var, cond)
      full_df_interested <- cbind(df[id[1]:id[2]], right_df)
      newid <- c(1, (id[2] - id[1]) + 1)
      keep_FALSE(full_df_interested, newid)
    }
    
    results <- chk_range_id(iris, variable, conds, c(5,5))
    results
    
    lgl2chr(results[-1])
    cbind(results[1], lgl2chr(results[-1]))
    
    chk_lgl_vecout <- function(df, vec, cond, skip) {
      temp_df <- df[skip]
      temp_df <- is.na(temp_df)
      
      logical_value <- df[[vec]] == cond
      
      out <- vector("logical", length(logical_value))
      for (i in seq_along(logical_value)) {
        if (logical_value[[i]]) {
          out[[i]] <- all(unlist(temp_df[i, ]))
        } else {
          out[[i]] <- all(!unlist(temp_df[i, ]))
        }
      }
      out
    }
    
    test_data <- tibble(id = 1:10, 
                        name = c(rep(letters[1:3], 3), "a"),
                        a1 = c(rep(1:3, each = 3), 1),
                        a2 = c("A", NA, NA, "A", NA, NA, "A", NA, NA, "A"),
                        a3 = c(1, 1, NA, 1, 1, NA, 1, 1, NA, 1))
    test_data 
    
    (sub <- chk_lgl_vecout(test_data, "a1", 3, c("a2", "a3")))
    test_data[!sub, ]
    
    test_data1 <- test_data %>%
      mutate(b1 = as.character(a1),
             b2 = a2,
             b3 = rep(1, 10))
    
    test_data1
    
    vec <- list("a1", "b1")
    cond <- list(3, "3")
    skip <- list(c("a2", "a3"), c("b2", "b3"))
    
    vec; cond; skip
    chk_lgl_vecout_with_df <- function(a, b, c) {
      chk_lgl_vecout(test_data1, a, b, c)
    }
    
    chk_lgl_vecout_with_df("b1", "3", c("b2", "b3"))
    pars <- list(vec, cond, skip)
    pmap(pars, chk_lgl_vecout_with_df)
    
    chk_lgl_id <- function(df, id, vec, cond, skip) {
      pars <- list(vec, cond, skip)
      right_df <- pmap(pars, function(a, b, c) chk_lgl_vecout(df, a, b, c))
      names(right_df) <- unlist(vec)
      left_df <- df[id]
      full_df <- cbind(left_df, right_df)
      keep_FALSE(full_df, id)
    }
    
    test_data1[9, "b3"] <- NA
    test_data1[2, "a2"] <- "B"
    test_data1[3, "b2"] <- "c"
    test_data1
    chk_lgl_id(test_data1, 1:2, vec, cond, skip)
    chk_lgl_id(test_data1, 1:2, vec, cond, skip) %>%
      select(-(1:2)) %>%
      lgl2chr(c("", "1"))
    
    test_data <- tibble(a = 1:5,
                        b = c("1", "2", "3", "4", "5"),
                        c = c("one", "2", "3", "4#5", "5"))
    
    test_data
    
    map_df(test_data, as.numeric)
    
    chk_illegal_type <- function(df, id) {
      right_df <- df[-id]
      right_df <- map_df(right_df, as.numeric)
      na_rows <- apply(right_df, 1, function(x) {any(is.na(x))})
      na_cols <- apply(right_df, 2, function(x) {any(is.na(x))})
      left_df <- df[id][na_rows, ]
      right_df_na_keeped <- right_df[na_rows, na_cols]
      logical <- is.na(right_df_na_keeped)
      chr_na_keeped <- ifelse(logical, "wrong_type_or_missing", "")
      cbind(left_df, chr_na_keeped)
    }
    
    test_data <- test_data %>%
      mutate(id = row_number(), 
             d = lag(c)) %>%
      select(id, everything())
    test_data  
    chk_illegal_type(test_data, 1)
    
    # A LITTLE BITTER OF VISUALITION ------------------------------------------
    
    data <- map_dbl(col_num(iris), mean)
    #par(mfrow = c(2, 2))
    par(mfrow = c(1, 1))
    barplot(data)
    pie(data)
    boxplot(iris$Sepal.Length)
    hist(iris$Sepal.Length, col = "gray", cex = 1.5)
    plot(iris$Sepal.Length, iris$Sepal.Width, cex = 1.5)
    plot(women$height, women$weight, type = "b")
    plot(density(iris$Sepal.Length))
    boxplot(mtcars$mpg ~ mtcars$cyl)
    mtcars <- mtcars[order(mtcars$mpg),]
    dotchart(mtcars$mpg, labels = row.names(mtcars), pch = 19, cex = 1.3)
    library(rgl)
    with(mtcars, {
      plot3d(wt, disp, mpg, col = "red", size = 5)
    })
    library("GGally")
    ggpairs(iris[iris$Species %in% c("versicolor", "virginica"), ], aes(color = Species)) 
    + theme_bw()
    ggpairs(mtcars[c("wt", "disp", "mpg")])
    
    
    # ADVANCED R --------------------------------------------------------------
    # SECTION_1 basic ---------------------------------------------------------
    
    a <- matrix(1:4, ncol = 2)
    is.vector(a) 
    is.atomic(a)
    is.list(a)
    typeof(a)
    class(a)
    attributes(a)
    x <- c(1, 2, 3)
    typeof(x)
    x <- c(1L, 2L, 3L)
    typeof(x)
    x
    
    structure(1:20, my_attribute = "this is a vecgtor")
    
    # name a vector
    x <- c(a = 1, b = 2, c = 3)
    x <- 1:3; names(x) <- c("a", "b", "c")
    x <- setNames(1:3, c("a", "b", "c"))
    # drop name
    unname(x)
    names(x) <- NULL
    
    # factor
    x <- factor(c("a", "b", "b", "a"))
    class(x)
    levels(x)
    x[2] <- "c"
    x
    c(factor("a"), factor("b")) # do not work
    sex_char <- c("m", "m", "m")
    sex_factor <- factor(sex_char, levels = c("m", "f"))
    table(sex_factor)
    
    f1 <- factor(letters)
    levels(f1) <- rev(levels(f1))
    f2 <- rev(factor(letters))
    f3 <- factor(letters, levels = rev(letters))
    f1; f2; f3
    
    l <- list(1:3, "a", T, 1.0)
    dim(l) <- c(2, 2)
    l
    dim(1:3)
    as.data.frame(l)
    as.data.frame(list(a = 1:4, b = 2:5))
    
    df <- data.frame(x = 1:3)
    df$y <- list(1:2, 1:3, 1:4)
    df
    data.frame(x = 1:3, y = list(1:2, 1:3, 1:4))
    data.frame(x = 1:3, y = I(list(1:2, 1:3, 1:4)))
    
    dfm <- data.frame(x = 1:3, y = I(matrix(1:9, nrow = 3)))
    dim(dfm)
    names(dfm)
    str(dfm)
    
    x <- c(2.1, 4.2, 3.3, 5,4)
    x[c(1, 1)]
    x[c(2.1, 2.9)]
    x[c(-1, 2)]
    
    x[c(T, F)]
    x[c(T, F, T, F)] #same as above one
    x[c(T, F, NA, F)]
    x[]
    x[0]
    
    y <- setNames(x, letters[1:5])
    y
    y[c("a", "a", "a")]
    y[c("a", "aa")] #matched precisely
    
    a <- matrix(1:9, nrow = 3)
    colnames(a) <- c("A", "B", "C")
    a[c(T, F, T), c("B", "B")]
    a[0, -2]
    
    vals <- outer(1:5, 1:5, FUN = "paste", sep = ",")
    vals
    vals[c(4, 15)]
    select <- matrix(ncol = 2, byrow = T, c(
      1, 1,
      3, 1,
      2, 4
    ))
    select
    vals[select]
    vals[1]
    z <- as.data.frame(vals)
    z <- setNames(z, letters[1:5]); z
    z[, "a"]
    z[, "a", drop = F]
    
    a <- list(a = 1, b = 2)
    a[[1]]
    a[["a"]]
    
    b <- list(a = list(b = list(c = list(d = 1))))
    b[[c("a", "b", "c", "d")]] #recursive, very useful
    b[["a"]][["b"]][["c"]][["d"]] #same as above
    
    x <- 1:5
    x[-1] <- 4:1; x #length(left_hand = right_hand)
    x[c(1, 1)] <- 2:3; x # legal
    x[c(1, NA)] <- c(1:2) # illegal
    x[c(T, F, NA)] <- 1 #legal, NA is treated as FALSE
    
    data(mtcars)
    mtcars <- lapply(mtcars, as.integer); mtcars
    data(mtcars)
    mtcars[] <- lapply(mtcars, as.integer); mtcars
    
    x <- list(a = 1, b = 2)
    x[["a"]] <- NULL; str(x)
    
    x <- list(a = 1)
    x["b"] <- list(NULL); str(x)
    
    x <- c("m", "f", "u", "f", "f", "m", "u")
    lookup <- c(m = "male", f = "female", u = NA)
    lookup[x]
    unname(lookup[x])
    c(m = "known", f = "known", u = "unkame")[x]
    
    grades <- c(1, 2, 2, 3, 1) 
    info <- data.frame( grade = 3:1, 
                        desc = c("Excellent", "Good", "Poor"), 
                        fail = c(F, F, T) )
    id <- match(grades, info$grade)
    info[id, ]
    
    rownames(info) <- info$grade
    info[as.character(grades), ]
    
    df <- data.frame(x = rep(1:3, each = 2), y = 6:1, z = letters[1:6])
    df[sample(nrow(df)), ]
    df[sample(nrow(df), 3), ]
    df[sample(nrow(df), 6, rep = T), ]
    
    df2 <- df[sample(nrow(df)), 3:1]
    df2[order(df2$x), ]
    df2[, order(names(df2))]
    
    df <- data.frame(x = c(2, 4, 1), y = c(9, 11, 6), n = c(3, 5, 1))
    df
    rep(1:nrow(df), df$n)
    df[rep(1:nrow(df), df$n), ]
    
    subset(mtcars, gear == 5)
    subset(mtcars, gear == 5 & cyl == 4)
    
    x <- sample(10) < 4
    which(x)
    unwhich <- function(x, n) {
      out <- rep_len(F, n)
      out[x] <- T
      out
    }
    unwhich(which(x), 10)
    
    (x1 <- 1:10 %% 2 == 0)
    (x2 <- which(x1))
    (y1 <- 1:10 %% 5 == 0)
    (y2 <- which(y1))
    x1 & y1
    intersect(x2, y2)
    x1 | y1
    union(x2, y2)
    x1 & !y1
    setdiff(x2, y2)
    xor(x1, y1)
    setdiff(union(x2, y2), intersect(x2, y2))
    
    
    # FUNCTION ----------------------------------------------------------------
    
    # install.packages("pryr")
    library("pryr")
    library("purrr")
    
    f <- function(x) x^2 # three parts
    f
    formals(f)
    body(f)
    environment(f)
    
    sum
    formals(sum)
    body(sum)
    environment(sum)
    
    objs <- mget(ls("package:base"), inherits = T)
    funs <- Filter(is.function, objs)
    map(map(funs, formals), length)
    
    x <- 1
    h <- function() {
      y <- 2
      i <- function() {
        z <- 3
        c(x, y, z)
      }
    i()
    }
    h()
    rm(x, h)
    
    j <- function(x) {
      y <- 2
      function() {
        c(x, y)
      }
    }
    
    k <- j(1)
    k()
    k
    rm(j, k)
    
    l <- function(x) x + 1
    m <- function() {
      l <- function(x) x * 2
           l(10)
    }
    m()
    rm(l, m)
    
    n <- function(x) x / 2
    o <- function() {
      n <- 10
      n(n)
    }
    o()
    rm(o, n)
    
    j <- function() {
      if(!exists("a")) {
        a <- 1
      } else {
        a <- a + 1
      }
      print(a)
    }
    j()
    
    f <- function() x
    x <- 15
    f()
    x <- 30
    f()
    #above function f should be avoid because of outer dependency
    
    f <- function() x + 1
    codetools::findGlobals(f)
    environment(f) <- emptyenv()
    f()
    
    c <- 10
    c(c = c)
    add <- function(x, y) x + y 
    sapply(1:10, add, 3)
    sapply(1:10, `+`, 3)
    sapply(1:10, "+", 3)
    
    x <- list(1:3, 4:9, 10:12)
    sapply(x, "[", 2)
    sapply(x, function(x) x[2])
    library(purrr)
    map(x, function(x) x[2])
    
    f <- function(abcdef, bcde1, bcde2) {
      list(a = abcdef, b1 = bcde1, b2 = bcde2)
    }
    
    str(f(1, 2, 3))
    str(f(2, 3, a = 1))
    str(f(1, 3, b = 1))
    
    args <- list(1:10, na.rm = T)
    do.call(mean, args) #divery the formals to the functions
    
    g <- function(a = 1, b = a * 2) {
      c(a, b)
    }
    g()
    g(109)
    
    f <- function(a = 1, b = d) {
      d <- (a + 1) ^ 2
      c(a, b)
    }
    f()
    f(10)
    
    i <- function(a, b) {
      c(missing(a), missing(b))
    }
    i()
    i(a = 1)
    i(b = 2)
    i(a = 1, b = 2)
    
    f <- function(x) {
      10
    }
    f(stop("this is an error!"))
    f <- function(x) {
      force(x)
      10
    }
    f(stop("this is an error"))
    
    add <- function(x) {
      function(y) x + y
    }
    adders <- lapply(1:10, add)
    adders[[1]](10)
    adders[[10]](10)
    
    add <- function(x) {
      force(x)
      function(y) x + y
    }
    adders2 <- lapply(1:10, add)
    adders2[[1]](10)
    adders2[[10]](10)
    
    f <- function(x = ls()) {
      a <- 1
      x
    }
    f()
    f(ls())
    
    `&&` <- function(x, y) {
      if (!x) return(FALSE)
      if (!y) return(FALSE)
      TRUE
    }
    a <- NULL
    !is.null(a) && a > 0
    a > 0 && !is.null(a)
    
    if(is.null(a)) stop("a is null")
    !is.null(a) || stop("a is null")
    
    barplot(1:5, col = "red", pch = 20)
    plot(1:5, bty = "u")
    plot(1:5, labels = F)
    
    f <- function(...) {
      names(list(...))
    }
    f(a = 1, b = 2)
    
    `%+%` <- function(a, b) paste(a, b, sep = "")
    "new" %+% "string"
    `second<-` <- function(x, value) {
      x[2] <- value
      x
    }
    x <- 1:10
    second(x) <- 5
    x
    
    library(pryr)
    x <- 1:10
    address(x)
    second(x) <- 6
    address(x)
    
    `modify<-` <- function(x, position, value) {
      x[position] <- value
      x
    }
    modify(x, 1) <- 10
    modify(get("x"), 1) <- 10 # do not work, 
    # get("x") <- `modify<-`(get("x"), 1, 10)
    
    replace_fun <- function(fun = F) {
      objs <- mget(ls("package:base"), inherits = T)
      fun_names <- names(Filter(is.function, objs))
      replace_fun_names <- fun_names[str_detect(fun_names, "<-")]
      if (fun) objs[replace_fun_names] else replace_fun_names
    }
    replace_fun(fun = T)
    
    f1 <- function() 1
    f1()
    f2 <- function() invisible(1)
    f2()
    f1() == 1
    f2() == 1
    (f2())
    a <- 2
    (a <- 2) # <- is one of the most common used function that
             # return invisible value, so, it can be used below
    a <- b <- c <- d <- 2
    
    in_dir <- function(dir, code) {
      old <- setwd(dir)
      on.exit(setwd(old))
      force(code)
    }
    getwd()
    in_dir("~", getwd())
    
    # OO GUIDE ----------------------------------------------------------------
    
    f <- function() {}
    typeof(f)
    is.function(f)
    
    typeof(sum)
    is.primitive(sum)
    
    df <- data.frame(x = 1:10, y = letters[1:10])
    otype(df)
    otype(df$x)
    otype(df$y)
    
    mean
    UseMethod("mean")
    ftype(mean)
    
    ftype(t.data.frame)
    ftype(t.test)
    methods("mean")
    
    methods(class = "list")
    
    
    foo <- structure(list(), class = "foo")
    foo
    foo <- list()
    class(foo) <- "foo"
    foo
    class(foo)
    inherits(foo, "foo")
    
    foo <- function(x) {
      if(!is.numeric(x)) stop("X must be numeric")
      structure(list(x), class = "foo")
    }
    
    mod <- lm(log(mpg) ~ log(disp), data = mtcars)
    class(mod)
    print(mod)
    class(mod) <- "data.frame"
    print(mod)
    mod$coefficients
    
    f <- function(x) UseMethod("f")
    f.a <- function(x) "class a"
    a <- structure(list(), class = "a")
    class(a)
    f(a)
    
    mean.a <- function(x) "a"
    mean(a)
    
    f <- function(x) UseMethod("f")
    f.a <- function(x) "class a"
    f.default <- function(x) "unknown class"
    f(structure(list(), class = "a"))
    f(structure(list(), class = "b"))
    f(structure(list(), class = c("b", "a")))
    
    library(methods)
    library(stats4)
    y <- c(26, 17, 13, 12,20, 5, 9, 8, 5, 4, 8)
    nLL <- function(lambda) -sum(dpois(y, lambda, log = T))
    fit <- mle(nLL, start = list(lambda = 5), nobs = length(y))
    isS4(fit)
    otype(fit)
    isS4(nobs)
    ftype(nobs)
    
    # s4 object ---------------------------------------------------------------
    
    mle_nobs <- method_from_call(nobs(fit))
    isS4(mle_nobs)
    ftype(mle_nobs)
    is(fit)
    class(fit)
    is(fit, "mle")
    getGenerics()
    getclass()
    
    setClass("Person", 
             slots = list(name = "character", age = "numeric")) 
    setClass("Employee", 
             slots = list(boss = "Person"), 
             contains = "Person")
    alice <- new("Person", name = "Alice", age = 40)
    john <- new("Employee", name = "John", age = 20, boss = alice)
    alice@age
    slot(john, "boss")
    
    setClass("RangedNumeric", 
             contains = "numeric",
             slots = list(min = "numeric", max = "numeric"))
    rn <- new("RangedNumeric", 1:10, min = 1, max = 10)
    rn@min
    rn@.Data
    
    setGeneric("union")
    setMethod("union",
              c(x = "data.frame", y = "data.frame"),
              function(x, y) {
                unique(rbind(x, y))
              })
    
    setGeneric("myGeneric", function(x) {
      standardGeneric("myGeneric")
    })
    
    
    # reference class ---------------------------------------------------------
    
    Account <- setRefClass("Account")
    Account$new()
    
    Account <- setRefClass("Account",
                           fields = list(balance = "numeric"))
    a <- Account$new(balance = 100)
    a$balance
    a$balance <- 200
    a$balance
    
    b <- a
    b$balance
    a$balance <- 0
    b$balance
    
    c <- a$copy()
    c$balance
    a$balance <- 100
    c$balance
    
    Account <- setRefClass("Account",
                           fields = list(balance = "numeric"),
                           methods = list(
                             withdraw = function(x) {
                               balance <<- balance - x
                             },
                             deposit = function(x) {
                               balance <<- balance + x
                             }
                           ))
    a <- Account$new(balance = 100)
    a$deposit(100)
    a$balance
    
    NoOverdraft <- setRefClass("NoOverdraft",
                               contains = "Account",
                               methods = list(
                                 withdraw = function(x) {
                                   if (balance < x) stop("Not enough money")
                                   balance <<- balance - x
                                 }
                               ))
    accountJohn <- NoOverdraft$new(balance = 100)
    accountJohn$deposit(50)
    accountJohn
    accountJohn$withdraw(1000)
    pryr::otype(accountJohn)
    
    # env ---------------------------------------------------------------------
    
    e <- new.env()
    e$a <- FALSE
    e$b <- "a"
    e$c <- 2.3
    e$d <- 1:3
    
    e$a <- e$d
    e$a <- 1:3
    
    search()
    as.environment("package:stats")
    ls(e)
    parent.env(e)
    
    e$.a <- 2
    ls(e)
    ls(e, all.names = T)
    
    str(e)
    ls.str(e)
    
    e$c <- 3
    e$c
    e[["c"]]
    get("c", envir = e)
    
    e <- new.env()
    e$a <- 1
    e$a <- NULL
    ls(e)
    e$a
    rm("a", envir = e)
    ls(e)
    
    x <- 10
    exists("x", envir = e)
    exists("x", envir = e, inherits = F)
    identical(globalenv(), environment())
    
    
    library(pryr)
    x <- 5
    where("x")
    where("mean")
    
    where <- function(name, env = parent.env()) {
      if(identical(env, emptyenv())) {
        stop("can not find", name, call. = FALSE)
      } else if (exists(name, envir = env, inherits = FALSE))
        env
    } else {
      where(name,parent.env(env))
    }
    
    
    y <- 1
    f <- function(x) x + y
    environment(f)
    
    e <- new.env()
    e$g <- function() 1
    environment(e$g)
    
    environment(sd)
    where("sd")
    
    g <- function(x) {
      if(!exists("a", inherits = F)) {
        message("define a")
        a <- 1
      } else {
        a <- a + 1
      }
      a
    }
    g(10)
    g(10)
    
    h <- function() {
      x <- 10
      function() {
        x
      }
    }
    i <- h()
    x <- 20
    i()
    
    #assign
    x <- 0
    f <- function() {
      x <<- 1
    }
    f()
    x
    
    library(pryr)
    system.time(b%<d-%{Sys.sleep(1); 1})
    system.time(b)
    
    x %<a-% runif(1)
    x
    x
    
    modify <- function(x) {
      x$a <- 2
      invisible()
    }
    x_1 <- list()
    x_1$a <- 1
    modify(x_1)
    x_1$a
    x_e <- new.env()
    x_e$a <- 1
    modify(x_e)
    x_e$a
    
    remove(x)
    x <- 1
    e1 <- new.env()
    get("x", envir = e1)
    get("x", envir = e1, inherits = F)
    e2 <- new.env(parent = emptyenv())
    get("x", envir = e2)
    
    
    # debug & defensive programing (skip some subcharpter) --------------------
    
    
    f <- function(a) g(a) 
    g <- function(b) h(b) 
    h <- function(c) i(c) 
    i <- function(d) "a" + d 
    f(10)
    
    f1 <- function(x) {
      log(x)
      10
    }
    f1("x")
    
    f2 <- function(x) {
      try(log(x)) 
      10
    }
    f2("x")
    
    try({
      a <- 1
      b <- "x"
      a + b
    })
    
    success <- try(1 + 2)
    failure <- try("a" + "b")
    class(success)
    class(failure)
    
    elements <- list(1:10, c(-1, 10), c(T, F), letters)
    rm("results")
    results <- lapply(elements, log)
    results
    results <- lapply(elements, function(x) try(log(x)))
    results
    
    is.error <- function(x) inherits(x, "try-error")
    succeeded <- !sapply(results, is.error)
    str(results[succeeded])
    str(elements[!succeeded])
    
    default <- NULL
    try(default <- read.csv("possibly-bad-input.csv"), silent = T)
    
    
    show_condition <- function(code) {
      tryCatch(code, 
               error = function(c) "error",
               warning = function(c) "warning",
               message = function(c) "message")
    }
    show_condition(stop("!"))
    show_condition(warning("?!"))
    show_condition(10)
    
    try2 <- function(code, silent = F) {
      tryCatch(code, error = function(c) { 
        msg <- conditionMessage(c)
        if(!silent) message(c)
        invisible(structure(msg, class = "try-error"))
      })
    }
    try2(1)
    try2(stop("hi"))
    try2(stop("hi"), silent = T)
    
    i <- 1
    while(i < 3) {
      tryCatch({
        Sys.sleep(.5)
        message("try to escape")
      }, interrupt = function(x) {
        message("try again!")
        i <<- i + 1
      })
    }
    
    #defensive programing
    
    col_means <- function(df) {
      numeric <- sapply(df, is.numeric)
      numeric_cols <- df[, numeric]
      data.frame(lapply(numeric_cols, mean))
    }
    
    if (T) {
      col_means <- function(df) {
        numeric <- lapply(df, is.numeric)
        numeric_cols <- df[, unlist(numeric), drop = FALSE]
        data.frame(lapply(numeric_cols, mean))
      }
    }
    
    col_means(mtcars)
    col_means(mtcars[, 0])
    col_means(mtcars[0, ])
    col_means(mtcars[, "mpg", drop = F])
    col_means(1:10)
    col_means(as.matrix(mtcars))
    col_means(as.list(mtcars))
    mtcars2 <- mtcars
    mtcars2[-1] <- lapply(mtcars2[-1], as.character)
    col_means(mtcars2)
    
    # SECTION_2 functional programing -----------------------------------------
    set.seed(1014) 
    df <- data.frame(replicate(6, sample(c(1:10, -99), 6, rep = TRUE))) 
    names(df) <- letters[1:6] 
    df
    
    fix_missing <- function(x) {
      x[x == -99] <- NA
      x
    }
    
    df$a <- fix_missing(df$a)
    df$b <- fix_missing(df$b)
    df$c <- fix_missing(df$c)
    df$d <- fix_missing(df$d)
    
    df[] <- lapply(df, fix_missing)
    df[1:5] <- lapply(df[1:5], fix_missing)
    
    fix_missing_999 <- function(x) {
      x[x == -999] <- NA
      x
    }
    
    fix_missing_9999 <- function(x) {
      x[x == -9999] <- NA
      x
    }
    
    missing_fixer <- function(na_value) {
      function(x) {
        x[x == na_value] <- NA
        x
      }
    }
    
    fix_missing_88 <- missing_fixer(88)
    fix_missing_88(c(88, 89))
    
    fix_missing <- function(x, na.value) {
      x[x == na.value] <- NA
      x
    }
    
    summary <- function(x) {
      c(mean(x, na.rm = T),
        median(x, na.rm = T), 
        sd(x, na.rm = T), 
        mad(x, na.rm = T), 
        IQR(x, na.rm = T))
    }
    
    lapply(df, summary)
    
    summary <- function(x) {
      funs <- c(mean, median, sd, mad, IQR)
      lapply(funs, function(f) f(x, na.rm = T))
    }
    
    lapply(df, summary)
    
    lapply(mtcars, function(x) length(unique(x)))
    Filter(function(x) !is.numeric(x), mtcars)
    integrate(function(x)  sin(x)^2, 0, pi)
    
    function(x) 3()
    (function(x) 3)()
    (function(x) x + 10)(3)
    
    power <- function(exponent) {
      function(x) {
        x ^ exponent
      }
    }
    
    square <- power(2)
    square(4)
    cube <- power(3)
    cube(2)
    
    square
    cube
    as.list(environment(square))
    as.list(environment(cube))
    library(pryr)
    unenclose(square)
    unenclose(cube)
    
    new_counter <- function() {
      i <- 0
      function() {
        i <<- i + 1
        i
      }
    }
    count_one <- new_counter()
    count_two <- new_counter()
    count_one()
    count_one()
    count_one()
    count_two()
    count_two()
    
    i <- 0
    new_counter2 <- function() {
      i <<- i + 1
      i
    }
    
    new_counter3 <- function() {
      i <- 0
      function() {
        i <- i + 1
        i
      }
    }
    new_counter2()
    new_counter2()
    
    new_count <- new_counter3()
    new_count()
    
    compute_mean <- list(
      base = function(x) mean(x),
      sum = function(x) sum(x) / length(x),
      manual = function(x) {
        total <- 0
        n <- length(x)
        for (i in seq_along(x)) {
          total <- total + x[1] / n
        }
        total
      }
    )
    
    x <- runif(1e5)
    system.time(compute_mean$base(x))
    system.time(compute_mean[[2]](x))
    system.time(compute_mean[["manual"]](x))
    
    lapply(compute_mean, function(f) f(x))
    call_fun <- function(f, ...) f(...)
    lapply(compute_mean, call_fun, x)
    lapply(compute_mean, function(x) system.time(f(x)))
    
    x <- 1:10
    funs <- list(
      sum = sum,
      mean = mean,
      median = median
    )
    lapply(funs, function(f) f(x))
    funs2 <- list(
      sum = function(x, ...) sum(x, ..., na.rm = TRUE),
      mean = function(x, ...) mean(x, ..., na.rm = TRUE),
      median = function(x, ...) median(x, ..., na.rm = TRUE)
    )
    lapply(funs2, function(f) f(x))
    lapply(funs, function(f) f(x, na.rm = T))
    
    simple_tag <- function(tag) {
      force(tag)
      function(...) {
        paste0("<", tag, ">", paste0(...), "</", tag, ">")
      }
    }
    tags <- c("p", "b", "i")
    html <- lapply(setNames(tags, tags), simple_tag)
    html$p("this is ", html$b("bold"))
    with(html, p("this is", b("bold"), "text."))
    
    attach(html)
    p("this is", b("bold"), "text.")
    detach()
    
    list2env(html, environment())
    p("This is ", b("bold"), " text.")
    rm(list = names(html), envir = environment())
    
    midpoint <- function(f, a, b) {
      (b - a) * f((a + b) / 2)
    }
    
    trapezoid <- function(f, a, b) {
      (b - a) / 2 * (f(a) + f(b))
    }
    
    midpoint(sin, 0, pi)
    trapezoid(sin, 0, pi)
    
    midpoint_composite <- function(f, a, b, n = 10) {
      points <- seq(a, b, length = n + 1)
      h <- (b - a) / n
      area <- 0
      for (i in seq_len(n)) {
        area <- area + h * f((points[i] + points[i + 1]) / 2)
      }
      area
    }
    
    trapezoid_composite <- function(f, a, b, n = 10) {
      points <- seq(a, b, length = n + 1)
      h <- (b - a) / n
      area <- 0
      for (i in seq_len(n)) {
        area <- area + h / 2 * (f(points[i]) + f(points[i + 1]))
      }
      area
    }
    
    midpoint_composite(sin, 0, pi, n = 10)
    midpoint_composite(sin, 0, pi, n = 100)
    trapezoid_composite(sin, 0, pi, n = 10)
    trapezoid_composite(sin, 0, pi, n = 100)
    
    composite <- function(f, a, b, n = 10, rule) {
      points <- seq(a, b, length = n + 1)
      area <- 0
      for (i in seq_len(n)) {
        area <- area + rule(f, points[i], points[i + 1])
      }
      area
    }
    
    composite(sin, 0, pi, n = 10, rule = midpoint)
    composite(sin, 0, pi, n = 10, rule = trapezoid)
    
    simpson <- function(f, a, b) {
      (b - a) / 6 * (f(a) + 4 * f((a + b) / 2) + f(b))
    }
    boole <- function(f, a, b) {
      pos <- function(i) a + i * (b - a) / 4
      fi <- function(i) f(pos(i))
      (b - a) / 90 *
        (7 * fi(0) + 32 * fi(1) + 12 * fi(2) + 32 * fi(3) + 7 * fi(4))
    }
    
    composite(sin, 0, pi, n = 10, rule = simpson)
    composite(sin, 0, pi, n = 10, rule = boole)
    
    newton_cotes <- function(coef, open = FALSE) {
      n <- length(coef) + open
      function(f, a, b) {
        pos <- function(i) a + i * (b - a) / n
        points <- pos(seq.int(0, length(coef) - 1))
        (b - a) / sum(coef) * sum(f(points) * coef)
      }
    }
    boole <- newton_cotes(c(7, 32, 12, 32, 7))
    milne <- newton_cotes(c(2, -1, 2), open = TRUE)
    composite(sin, 0, pi, n = 10, rule = milne)
    
    randomise <- function(f) f(runif(1e3))
    randomise(mean)
    randomise(mean)
    randomise(sum)
    
    # functional --------------------------------------------------------------
    
    lapply2 <- function(x, f, ...) {
      out <- vector("list", length(x))
      for (i in seq_along(x)) {
        out[[i]] <- f(x[[i]], ...)
      }
      out
    }
    
    l <- replicate(20, runif(sample(1:10, 1)), simplify = F)
    
    out <- vector("list", length(l))
    for (i in seq_along(l)) {
      out[[i]] <- length(l[[i]])
    }
    unlist(out)
    unlist(lapply(l, length))
    
    unlist(lapply(mtcars, class))
    mtcars[] <- lapply(mtcars, function(x) x / mean(x))
    
    trims <- c(0, .1, .2, .5)
    x <- rcauchy(1000)
    unlist(lapply(trims, function(trim) mean(x, trim = trim)))  # beautiful
    lapply(trims, mean, x = x)
    
    # 3 ways to for-loop 1: for (x in xs) 2: for (i in seq_along(xs))
    # 3:for(nm in names(xs))
    xs <- runif(1e3)
    res <- c()
    for (x in xs) res <- c(res, sqrt(x)) #it is slowl
    res
    
    # 3 ways to using lapply
    lapply(xs, function(x) {}) # most frequently used methods
    lapply(seq_along(xs), function(i) {})
    lapply(names(xs), function(nm) {})
    
    formulas <- list(
      mpg ~ disp,
      mpg ~ I(1 / disp),
      mpg ~ disp + wt,
      mpg ~ I(1 / disp) + wt
    )
    
    lapply(formulas, function(formula) lm(formula = formula, data = mtcars))
    
    bootstraps <- lapply(1:10, function(i) {
      rows <- sample(1:nrow(mtcars), rep = T)
      mtcars[rows, ]
    })
    
    out <- vector("list", 10)
    for (i in 1:10) {
      rows <- sample(1:nrow(mtcars), rep = T)
      out[[i]] <- mtcars[rows, ]
    }
    out
    
    sapply(mtcars, is.numeric)
    vapply(mtcars, is.numeric, logical(1))
    
    sapply(list(), is.numeric) # better in interactive
    vapply(list(), is.numeric, logical(1)) #better in program
    
    df <- data.frame(x = 1:10, y = letters[1:10])
    sapply(df, class)
    vapply(df, class, character(1))
    
    df2 <- data.frame(x = 1:10, y = Sys.time() + 1:10)
    sapply(df2, class)
    vapply(df2, class, character(1))
    
    xs <- replicate(5, runif(10), simplify = F)
    ws <- replicate(5, rpois(10, 5) + 1, simplify = F)
    unlist(lapply(xs, mean))
    unlist(lapply(seq_along(xs), function(i) {
      weighted.mean(xs[[i]], ws[[i]])
    }))
    unlist(Map(weighted.mean, xs, ws))
    
    mtmeans <- lapply(mtcars, mean)
    mtmeans[] <- Map(`/`, mtcars, mtmeans)
    Map(function(x, w) weighted.mean(x, w, na.rm = T), xs, ws)
    
    rollmean <- function(x, n) {
      out <- rep(NA, length(x))
      offset <- trunc(n / 2)
      for (i in (offset + 1):(length(x) - n + offset - 1)) {
        out[i] <- mean(x[(i - offset):(i + offset - 1)])
      }
      out
    }
    
    x <- seq(1, 3, length = 1e2) + runif(1e2)
    plot(x)
    lines(rollmean(x, 5), col = "blue", lwd = 2)
    lines(rollmean(x, 10), col = "red", lwd = 2)
    
    rollapply <- function(x, n, f, ...) {
      out <- rep(NA, length(x))
      offset <- trunc(n / 2)
      for (i in (offset = 1):(length(x) - n + offset + 1)) {
        out[i] <- f(x[(i - offset):(i + offset)], ...)
      }
      out
    }
    x <- seq(1, 3, length = 1e2) + rt(1e2, df = 2) / 3
    plot(x)
    lines(rollmean(x, 5), col = "red", lwd = 2)
    lines(rollapply(x, 5, median), col = "blue", lwd = 2)
    
    # which element be computed first is not important when you using apply
    lapply3 <- function(x, f, ...) {
      out <- vector("list", length(x))
      for (i in sample(seq_along(x))) { 
        out[[i]] <- f(x[[i]], ...)
      }
      out
    }
    
    unlist(lapply(1:10, sqrt))
    unlist(lapply3(1:10, sqrt))
    
    library(parallel)
    boot_df <- function(x) x[sample(nrow(x), rep = T), ]
    rsquared <- function(mod) summary(mod)$r.square
    boot_lm <- function(i) {
      rsquared(lm(mpg ~ wt + disp, data = boot_df(mtcars)))
    }
    system.time(lapply(1:500, boot_lm))
    system.time(mclapply(1:500, boot_lm, mc.cores = 2)) # unfortunately, it does not work on windows,try parLapply() instead.
    
    a <- matrix(1:20, nrow = 5)
    a1 <- apply(a, 1, identity)
    a1
    identical(a, a1)
    identical(a, t(a1))
    
    x <- matrix(rnorm(20, 0, 1), nrow = 4)
    x1 <- sweep(x, 1, apply(x, 1, min), `-`)
    x2 <- sweep(x1, 1, apply(x1, 1, max), `/`)
    
    outer(1:3, 1:10, `*`)
    
    pulse <- round(rnorm(22, 70, 10 / 3)) + rep(c(0, 5), c(10, 12))
    group <- rep(c("A", "B"), c(10, 12))
    tapply(pulse, group, length)
    tapply(pulse, group, mean)
    split(pulse, group)
    tapply2 <- function(x, group, f, ..., simplify = TRUE) {
      pieces <- split(x, group)
      sapply(pieces, f, simplify = simplify)
    }
    tapply2(pulse, group, mean)
    tapply2(pulse, group, length)
    
    Reduce2 <- function(f, x) {
      out <- x[[1]]
      for (i in seq(2, length(x))) {
        out <- f(out, x[[i]])
      }
      out
    }
    
    l <- replicate(5, sample(1:10, 15, replace = T), simplify = F)
    Reduce(intersect, l)
    
    where <- function(f, x) {
      vapply(x, f, logical(1))
    }
    
    df <- data.frame(x = 1:3, y = c("a", "b", "c"))
    where(is.factor, df)
    str(Filter(is.factor, df))
    Find(is.numeric, df)
    Position(is.factor, df)
    
    # integrate() uniroot() potimise() are skiped
    
    trans <- list(
      disp = function(x) x * 0.0163871,
      am = function(x) factor(x, levels = c("auto", "manual"))
    )
    for (var in names(trans)) {
      mtcars[[var]] <- trans[[var]](mtcars[[var]])
    }
    
    # recursively using for-loop is skiped
    
    #11.7
    
    add <- function(x, y) {
      stopifnot(length(x) == 1, length(y) == 1,
                is.numeric(x), is.numeric(y))
      x + y
    }
    rm_na <- function(x, y, identity) {
      if(is.na(x) && is.na(y)) {
        identity
      } else if (is.na(x)) {
        y
      } else {
        x
      }
    }
    rm_na(NA, 10, 0)
    rm_na(10, NA, 0)
    rm_na(NA, NA, NA)
    
    add <- function(x, y, na.rm = F) {
      if (na.rm && (is.na(x) || is.na(y))) rm_na(x, y, 0) else x + y
    }
    add(10, NA)
    add(10, NA, na.rm = T)
    add(NA, NA)
    add(NA, NA, na.rm = T)
    
    r_add <- function(xs, na.rm = T) {
      Reduce(function(x, y) add(x, y, na.rm = na.rm), xs)
    }
    r_add(1:10)
    r_add(NA, na.rm = T)
    r_add(numeric())
    r_add <- function(xs, na.rm = T) {
      Reduce(function(x, y) add(x, y, na.rm = na.rm), xs, init = 0)
    }
    r_add(1:10)
    r_add(NA, na.rm = T)
    r_add(numeric())
    
    v_add1 <- function(x, y, na.rm = FALSE) {
      stopifnot(length(x) == length(y), is.numeric(x), is.numeric(y))
      if (length(x) == 0) return(numeric())
      simplify2array(Map(function(x, y) add(x, y, na.rm = na.rm), x, y))
    }
    v_add1(1:10, 1:10)
    v_add1(numeric(), numeric())
    v_add1(c(1, NA), c(1, NA))
    v_add1(c(1, NA), c(1, NA), na.rm = T)
    
    c_add <- function(xs, na.rm = FALSE) {
      Reduce(function(x, y) add(x, y, na.rm = na.rm), xs, accumulate = TRUE)
    }
    c_add(1:10)
    
    row_sum <- function(x, na.rm = FALSE) {
      apply(x, 1, add, na.rm = na.rm)
    }
    col_sum <- function(x, na.rm = FALSE) {
      apply(x, 2, add, na.rm = na.rm)
    }
    
    # 12 ----------------------------------------------------------------------
    
    chatty <- function(f) {
      function(x, ...) {
        res <- f(x, ...)
        cat("processing", x, "\n", sep = "")
        res
      }
    }
    f <- function(x) x ^ 2
    s <- 3:1
    chatty(f)(1)
    vapply(s, chatty(f), numeric(1))
    
    delay_by <- function(delay, f) {
      function(...) {
        Sys.sleep(delay)
        f(...)
      }
    }
    system.time(runif(100))
    system.time(delay_by(.1, runif)(100))
    
    dot_every <- function(n, f) {
      i <- 1
      function(...) {
        if (i %% n == 0) cat(".")
        i <<- i + 1
        f(...)
      }
    }
    x <- lapply(1:100, runif)
    x <- lapply(1:100, dot_every(10, runif)) 
    
    library(memoise)
    slow_func <- function(x) {
      Sys.sleep(1)
      10
    }
    system.time(slow_func())
    system.time(slow_func())
    
    fast_func <- memoise(slow_func)
    system.time(fast_func())
    system.time(fast_func())
    
    fib <- function(n) {
      if(n < 2) return(1)
      fib(n -2) + fib(n - 1)
    }
    system.time(fib(25))
    system.time(fib(26))
    
    fib2 <- memoise(function(n) {
      if (n < 2) return(1)
      fib2(n - 2) + fib2(n - 1)
    })
    system.time(fib2(23))
    system.time(fib2(24))
    
    runif <- memoise(runif)
    runif(5)
    runif(5)
    
    # a lot of contents skiped
    
    
    # some interesting FUNCs by me --------------------------------------------
    library(tidyverse)
    
    get_coefs <- function(model) {
      summary(model)[["coefficients"]]
    }
    
    line_two <- function(matrix, n = 2) {
     matrix[n, , drop = F]
    }
    
    vstack <- function(list) {
      do.call(rbind, list)
    }
    
    do_model <- function(response, data, model = lm, ...) {
      formula <- as.formula(paste(response, "~", "."))
      model(formula, data = data, ...)
    }
    
    extend_scaler <- function(x) {
        c(x, unlist(lapply(0:99, function(i) str_c(x, i))))
    }
    
    extend_chr <- function(chr) {
      unlist(lapply(chr, extend_scaler))
    }
    
    keep_row <- function(matrix, vras, extend = T) {
      if (extend) vars = extend_chr(vars)
      vars = intersect(rownames(matrix), vars)
      matrix[vars, , drop = F]  
    }
    
    univar <- function(response, vars_interested, controls = NA, data, model = lm, ...) {
      entry_name <- c(response, vars_interested, controls)
      entry_name <- entry_name[!is.na(entry_name)]
      wrong_name <- setdiff(entry_name, names(data))
      if (length(wrong_name) != 0) stop("wrong variable name in you entry")
      
      out <- vector("list", length = length(vars_interested))
      for (i in seq_along(vars_interested)) {
        new_data <- data[c(response, vars_interested[[i]], controls[!is.na(controls)])]
        fit <- do_model(response, new_data, model, ...)
        out[[i]] <- get_coefs(fit)
      }
      out
    }
    
    
    data(mtcars); names(mtcars)
    vars <- c("cyl", "disp", "hp")
    univar("am", vars, c("qsec", "gear"), mtcars)
    univar("am", vars, c("qsec", "gear"), mtcars) %>% lapply(line_two)
    univar("am", vars, c("qsec", "gear"), mtcars) %>% lapply(line_two) %>% vstack
    
    mtcars$cyl <- as.factor(mtcars$cyl)
    univar("vs", vars, "gear", data = mtcars, model = glm, family = binomial())
    univar("vs", vars, "gear", data = mtcars, model = glm, family = binomial()) %>%
      vstack %>% keep_row(vars)
    
    library(reshape)
    library(tidyverse)
    library(tableone)
    q_plot <- function(data, ...) qplot(data = data, ...)
    
    to_draw <- function(data, group_var) {
      nms <- names(data)
      nms2 <- setdiff(nms, group_var)
      melt(data, id = group_var)
    }
    
    keep_num <- function(data, group) {
      out <- keep(data, is.numeric)
      if (is.numeric(data[[group]])) out else cbind(out, data[group])
    }
    
    data(iris); names(iris) <- letters[1:5]; iris$group <- c(1, 0); str(iris); head(iris)
    
    iris %>% keep_num("group") %>% to_draw("group") %>% 
      q_plot(x = value, facets = variable ~ group, geom = "density")
    
    CreateTableOne(c("a", "b", "c", "d", "e"), "group", data = iris) %>% 
      print(nonnormal = c("c", "d"))
    
    

    相关文章

      网友评论

          本文标题:Notes of R for data science_07Ju

          本文链接:https://www.haomeiwen.com/subject/mzdaqktx.html