dplyr

作者: 茶苯海 | 来源:发表于2017-02-22 17:21 被阅读13次

    1.syntax-Helpful conventions for wrangling

    • .tbl_df
      converts data to tbl class.tbls arre easier to examine than data frames.R displays only the data that fits onscreen
    library(dplyr)
    tbl_df(iris)
    
    • .glimpse
      Information dense summary of tbl data.
    glimpse(iris)
    
    • .View{utils}
      View data set in spreadsheet-like display(note capital V)
    View(iris)
    
    • .%>%
      Passes object on lefthand side as first argument(or.argument) of function on righthand side
      x %>% f(y) is the same as f(x,y)
      y %>% f(x,.,z) is the same as f(x,y,z)

    "Piping" with %>% makes code more readable,e.g.

    iris %>% 
      group_by(Species) %>% 
      summarise(avg=mean(Sepal.Width)) %>% 
      arrange(avg)
    

    Tidy Data-A foundation for wrangling in R
    2.Reshaping Data -Change the layout of a data set
    -. Gather columns into rows
    tidyr::gather(cases,"year","n",2:4)
    -. Spread rows into columns
    tidyr::spread(pollution,size,amount)
    -. Separate one column into several.
    tidyr::separate(storms,date,c("y","m","d"))
    -. Unite several columns into one
    tidyr::unite(data,col,...,sep)

    library(dplyr)
    # Combine vectors into data frame(optimized)
    data_frame(a=1:3,b=4:6)
    # Order rows by values  od a column(low to high)
    arrange(mtcars,mpg)
    #Order rows by values of a column(high to low)
    arrange(mtcars,desc(mpg))
    #Rename the columns of a data frame
    rename(tb,y=year)
    

    3.Subset Observations(Rows)

    library(dplyr)
    # Extract rows that meet logical criteria
    filter(iris,Sepal.Length>7)
    # Remove duplicate rows
    distinct(iris)
    # Randomly select fraction of rows fraction(分数)
    sample_frac(iris,0.5,replace = TRUE)
    # Randomly select n rows
    sample_n(iris,10,replace = TRUE)
    #Select rows by position.
    slice(iris,10:15)
    #Select and order top n entries(by group if grouped data)
    #top_n(storms,2,date)
    

    Logic in R - ?Comparison, ?base::logic
    4.Subset Variables(Volumns)

    library(dplyr)
    #Select columns by name or helper function.
    select(iris,Sepal.Width,Petal.Length,Species)
    ## Helper functions for select -  ?select
    # Select columns whose name contains a character string.
    select(iris,contains("."))
    #  Select columns whose name ends with a character string.
    select(iris,ends_with("Length"))
    # Select every column.
    select(iris,everything())
    # Select columns whose name matches a regular expression
    select(iris,matches(".t."))
    # Select columns whose names are in a group of names
    select(iris,one_of(c("Species","Genus")))
    # Select columns named x1,x2,x3,x4,x5
    select(iris,num_range("x",1:5))
    #  Select columns whose name starts with a charactere
    select(iris,starts_with("Sepal"))
    # Select all columns between Sepal.Length and Petal.Width(inclusive)
    select(iris,Sepal.Length:Petal.Width)
    # select all columns except Species
    select(iris,-Species)
    

    5.Summarise Data

    library(dplyr)
    # Summarise data into single row of values
    summarise(iris,avg = mean(Sepal.Length))
    # Apply summary function to wach column
    summarise_each(iris,funs(mean))
    # Count number of rows with  each unique value of variable(with or without weights)
    count(iris,Species,wt = Sepal.Length)
    
    

    Summarise uses summary functions,functions that take a vector of values and return a single value,such as:
    ...
    6.Group Data

    # Group data into rows with the same value of Species.
    group_by(iris,Species)
    # Remove grouping information from data frame
    ungroup(iris)
    # Compute separate summary row for each group
    iris  %>% group_by(Species) %>% summarise(...)
    # Compute new variables by group
    iris %>% group_by(Species) %>% mutate(...)
    

    7.Make New Variables

    library(dplyr)
    # Compute and append one or more new columns.
    mutate(iris,sepal=Sepal.Length+Sepal.Width)
    #Apply window function to each column.
    mutate_each(iris, funs(min_rank))
    # Compute one or more new columns.Drop original columns
    transmute(iris,sepal=Sepal.Length + Sepal.Width)
    

    window function
    Mutate uses window functions,functions that take a vector of values and return another vector of values,such as:

    dplyr::lead         Copy with values shifted by 1.
           lag          Copy with values lagged by 1.
           dense_rank   Ranks with no gaps
           min_rank     Ranks.Ties get min rank
           percent_rank Ranks rescaled to [0,1]
           row_number   Ranks.Ties got to first value
           ntile        Bin vector into n buckets
           between      Are values between a and b?
           cume_dist    Cumulative distribution
           cumall       Cumulative all
           cumany       Cumulative any
           cummean      Cumulative mean
           cumsun
           cunmax
           cummin
           cumprod
           pmax         Element-wise max
           pmin         Element-wise min
    

    8.Combine Data Set

    • .Mutating Joins
    a <- data.frame(x1=c('A',"B",'C'),x2=c(1,2,3))
    b <- data.frame(x1=c('A',"B",'D'),x3=c('T','F','T'))
    #Join matching rows from b to a
    left_join(a,b,by='x1')
    # Join matching rows from a to  b
    right_join(a,b,by='x1')
    #Join data.Retain only rows in both  set
    inner_join(a,b,by='x1')
    #Join data.Retain all values,all rows.
    full_join(a,b,by='x1')
    
    • .Filtering Joins
    #Allrows in a that have a match in b
    semi_join(a,b,by='x1')
    # All rows in a that do not have a match in b
    anti_join(a,b,by='x1')
    
    • .Set Operations
    y <- data.frame(x1=c('A',"B",'C'),x2=c(1,2,3))
    z <- data.frame(x1=c("B",'C','D'),x2=c(2,3,4))
    #Rows that appear in both y and z.
    intersect(y, z)
    #Rows that appear in either or both y and z
    union(y,z)
    #Rows that appear in y but not z
    setdiff(y,z)
    

    -.Binding

    #append z to y as new rows
    bind_rows(y,z)
    # Append z to y as new columns.Caution:matches rows by position
    bind_cols(y,z)
    

    相关文章

      网友评论

        本文标题:dplyr

        本文链接:https://www.haomeiwen.com/subject/ffhmwttx.html