美文网首页R for Data Science
[R语言] forcats包 因子操作《R for data s

[R语言] forcats包 因子操作《R for data s

作者: 半为花间酒 | 来源:发表于2020-04-27 08:28 被阅读0次

    《R for Data Science》第十五章 factors 啃书知识点积累
    参考链接:R for Data Science

    Creating factors

    # Attach the tidyverse once up front: the snippets below rely on the pipe
    # (%>%), dplyr verbs, forcats helpers (fct_*), readr's parse_factor(),
    # purrr's keep(), stringr's str_c() and the forcats::gss_cat dataset.
    library(tidyverse)

    # Months stored as plain strings; used below to motivate factors.
    x1 <- c("Dec", "Apr", "Jan", "Mar")
    

    纯粹创建一个向量记录月份,有两个缺点:

    1. 没有很好的办法避免打字错误
    x2 <- c("Dec", "Apr", "Jam", "Mar")
    
    2. 排序只能按照字母表顺序,而不是月份的先后顺序
    sort(x1)
    #> [1] "Apr" "Dec" "Jan" "Mar"
    

    策略:创建factor
    首先创建levels

    month_levels <- c(
      "Jan", "Feb", "Mar", "Apr", "May", "Jun", 
      "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
    )
    

    然后创建因子

    y1 <- factor(x1, levels = month_levels)
    
    sort(y1)
    #> [1] Jan Mar Apr Dec
    #> Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
    
    • 如果向量中的值不存在于levels中会被静默转换为NA
      可以用readr::parse_factor捕获warning
    x2 <- c("Dec", "Apr", "Jam", "Mar")
    
    y2 <- factor(x2, levels = month_levels)
    y2
    #> [1] Dec  Apr  <NA> Mar 
    #> Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
    
    y2 <- parse_factor(x2, levels = month_levels)
    #> Warning: 1 parsing failure.
    #> row col           expected actual
    #>   3  -- value in level set    Jam
    
    • 如果不设定levels,会自动创建按照字母表顺序的levels
    factor(x1)
    #> [1] Dec Apr Jan Mar
    #> Levels: Apr Dec Jan Mar
    
    • 按照分类变量第一次出现的顺序设定levels

    方法一:创建时用unique

    f1 <- factor(x1, levels = unique(x1))
    f1
    #> [1] Dec Apr Jan Mar
    #> Levels: Dec Apr Jan Mar
    

    方法二:创建后用fct_inorder

    f2 <- x1 %>% factor() %>% fct_inorder()
    f2
    #> [1] Dec Apr Jan Mar
    #> Levels: Dec Apr Jan Mar
    
    • `levels()` 可以直接查询因子的水平(levels)
    levels(f2)
    #> [1] "Dec" "Apr" "Jan" "Mar"
    

    General Social Survey

    ??forcats::gss_cat
    
    • 分类变量映射ggplot2的x轴

    会自动转factor并且删除没有任何值的级别,可以用drop=FALSE强迫显示

    library(ggplot2)
    library(patchwork)
    
    p1 <- ggplot(gss_cat, aes(race)) +
      geom_bar() 
    
    p2 <- ggplot(gss_cat, aes(race)) +
      geom_bar() +
      scale_x_discrete(drop = FALSE)
    
    p1 + p2
    

    - Exercises

    # Bar chart of reported income, with the non-answer categories shown in a
    # different fill colour.
    gss_cat %>%
      # Drop the rows whose income level is "Not applicable".
      filter(!rincome %in% c("Not applicable")) %>%
      # Rename one level so its label reads naturally on the axis.
      mutate(rincome = fct_recode(rincome,
                                  "Less than $1000" = "Lt $1000"
      )) %>%
      # Flag the non-answer levels so they can be filled differently.
      mutate(rincome_na = rincome %in% c("Refused", "Don't know", "No answer")) %>%
      ggplot(aes(x = rincome, fill = rincome_na)) +
      geom_bar() +
      coord_flip() +
      scale_y_continuous("Number of Respondents", labels = scales::comma) +
      scale_x_discrete("Respondent's Income") +
      # Income brackets in black, non-answers in grey.
      scale_fill_manual(values = c("FALSE" = "black", "TRUE" = "gray")) +
      # Lowercase "none" is the documented value for hiding the legend; the
      # capitalised "None" is not one of the documented options.
      theme(legend.position = "none")
    

    Modifying factor order

    It’s often useful to change the order of the factor levels in a visualisation.

    - 依数值重排序 fct_reorder

    relig_summary <- gss_cat %>%
      group_by(relig) %>%
      summarise(
        age = mean(age, na.rm = TRUE),
        tvhours = mean(tvhours, na.rm = TRUE),
        n = n()
      )
    
    # Baseline: relig plotted in its original level order
    p1 <- ggplot(relig_summary, aes(tvhours, relig)) + 
      geom_point()
    
    # fct_reorder() sorts the levels in ascending order of tvhours by default
    # (pass .desc = TRUE for descending order)
    p2 <- ggplot(relig_summary, aes(tvhours, fct_reorder(relig, tvhours))) +
      geom_point()
    
    # Base R's reorder(), mentioned in the EDA chapter, can be used as well
    ggplot(relig_summary, aes(tvhours, reorder(relig, tvhours))) +
      geom_point()
    
    # Show the unordered and reordered plots side by side (patchwork)
    p1 + p2
    

    - 自定义重排序 fct_relevel

    It takes a factor, f, and then any number of levels that you want to move to the front of the line.

    rincome_summary <- gss_cat %>%
      group_by(rincome) %>%
      summarise(
        age = mean(age, na.rm = TRUE),
        tvhours = mean(tvhours, na.rm = TRUE),
        n = n()
      )
    
    p1 <- ggplot(rincome_summary, aes(age, rincome)) + 
      geom_point()
    
    p2 <- ggplot(rincome_summary, aes(age, fct_relevel(rincome, "Not applicable"))) +
      geom_point()
    
    p1 + p2
    

    - 调节图例顺序 fct_reorder2()

    fct_reorder2() reorders the factor by the y values associated with the largest x values. This makes the plot easier to read because the line colours line up with the legend.

    主要作用是调节图例顺序便于阅读

    by_age <- gss_cat %>%
      filter(!is.na(age)) %>%
      count(age, marital) %>%
      group_by(age) %>%
      mutate(prop = n / sum(n))
    
    ggplot(by_age, aes(age, prop, colour = marital)) +
      geom_line(na.rm = TRUE)
    
    ggplot(by_age, aes(age, prop, colour = fct_reorder2(marital, age, prop))) +
      geom_line() +
      labs(colour = "marital")
    
    • 另一个例子:各党派每年比例的变化
    # Collapse partyid into four groups and compute each group's share of
    # respondents per survey year. Built once and shared by both plots so the
    # two panels are guaranteed to show the same data.
    party_by_year <- gss_cat %>%
      mutate(
        partyid = fct_collapse(
          partyid,
          Others = c("No answer", "Don't know", "Other party"),
          Republican = c("Strong republican", "Not str republican"),
          Independent = c("Ind,near rep", "Independent", "Ind,near dem"),
          Democrat = c("Not str democrat", "Strong democrat")
        )
      ) %>%
      count(year, partyid) %>%
      group_by(year) %>%
      mutate(proportions = n / sum(n)) %>%
      # Drop the grouping so the stored data frame carries no hidden state.
      ungroup()

    # Left panel: colour mapped to partyid as-is, so the legend follows the
    # factor's existing level order.
    # NOTE: ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines; `size`
    # is kept here so the snippet also runs on older ggplot2 versions.
    p1 <- ggplot(party_by_year, aes(year, proportions, colour = partyid)) +
      geom_point() +
      geom_line(size = 1)

    # Right panel: fct_reorder2() reorders the levels by the proportion at the
    # largest year, so the legend order matches the right-hand ends of the lines.
    p2 <- ggplot(
      party_by_year,
      aes(year, proportions, colour = fct_reorder2(partyid, year, proportions))
    ) +
      geom_point() +
      geom_line(size = 1) +
      labs(colour = "Party ID")

    # Show both versions side by side (patchwork).
    p1 + p2
    

    - 柱形图的简易重排

    利用 fct_infreq() 和 fct_rev():fct_infreq() 按频数从高到低排列水平,fct_rev() 将水平顺序反转

    # fct_infreq() orders the levels by frequency, most frequent first,
    # so the bars run from tallest to shortest
    p1 <- gss_cat %>%
      mutate(marital = marital %>% fct_infreq()) %>%
      ggplot(aes(marital)) +
      geom_bar()
    
    # Adding fct_rev() reverses that order: bars run from shortest to tallest
    p2 <- gss_cat %>%
      mutate(marital = marital %>% fct_infreq() %>% fct_rev()) %>%
      ggplot(aes(marital)) +
      geom_bar()
    
    # Show both orderings side by side (patchwork)
    p1 + p2
    
    • 判断一个dataset哪些变量是factor
    str(gss_cat)
    # 或者有更简便的办法
    keep(gss_cat,is.factor) %>% 
      names(.)
    # [1] "marital" "race"    "rincome" "partyid" "relig"   "denom"  
    

    Modifying factor levels

    More powerful than changing the orders of the levels is changing their values.

    - 修改变量中的值 fct_recode()

    gss_cat %>% 
      count(partyid)
    #> # A tibble: 10 x 2
    #>   partyid                n
    #>   <fct>              <int>
    #> 1 No answer            154
    #> 2 Don't know             1
    #> 3 Other party          393
    #> 4 Strong republican   2314
    #> 5 Not str republican  3032
    #> 6 Ind,near rep        1791
    #> # … with 4 more rows
    
    gss_cat %>%
      mutate(partyid = fct_recode(
        partyid,
        "Republican, strong"    = "Strong republican",
        "Republican, weak"      = "Not str republican",
        "Independent, near rep" = "Ind,near rep",
        "Independent, near dem" = "Ind,near dem",
        "Democrat, weak"        = "Not str democrat",
        "Democrat, strong"      = "Strong democrat"
      )) %>%
      count(partyid)
    #> # A tibble: 10 x 2
    #>   partyid                   n
    #>   <fct>                 <int>
    #> 1 No answer               154
    #> 2 Don't know                1
    #> 3 Other party             393
    #> 4 Republican, strong     2314
    #> 5 Republican, weak       3032
    #> 6 Independent, near rep  1791
    #> # … with 4 more rows
    

    fct_recode() will leave levels that aren’t explicitly mentioned as is, and will warn you if you accidentally refer to a level that doesn’t exist.

    • 可以将多个不同值整合为同一种便于分组
    gss_cat %>%
      mutate(partyid = fct_recode(partyid,
        "Republican, strong"    = "Strong republican",
        "Republican, weak"      = "Not str republican",
        "Independent, near rep" = "Ind,near rep",
        "Independent, near dem" = "Ind,near dem",
        "Democrat, weak"        = "Not str democrat",
        "Democrat, strong"      = "Strong democrat",
        "Other"                 = "No answer",
        "Other"                 = "Don't know",
        "Other"                 = "Other party"
      )) %>%
      count(partyid)
    

    - 同时整合多个值 fct_collapse()

    gss_cat %>%
      mutate(partyid = fct_collapse(partyid,
        other = c("No answer", "Don't know", "Other party"),
        rep = c("Strong republican", "Not str republican"),
        ind = c("Ind,near rep", "Independent", "Ind,near dem"),
        dem = c("Not str democrat", "Strong democrat")
      )) %>%
      count(partyid)
    #> # A tibble: 4 x 2
    #>   partyid     n
    #>   <fct>   <int>
    #> 1 other     548
    #> 2 rep      5346
    #> 3 ind      8409
    #> 4 dem      7180
    
    • 放一个案例:整合收入数据可视化
    gss_cat %>%
      mutate(
        rincome =
          fct_collapse(
            rincome,
            `Unknown` = c("No answer", "Don't know", "Refused", "Not applicable"),
            `Less than $5000` = c("Lt $1000", str_c(
              "$", c("1000", "3000", "4000"),
              " to ", c("2999", "3999", "4999")
            )),
            `$5000 to 10000` = str_c(
              "$", c("5000", "6000", "7000", "8000"),
              " to ", c("5999", "6999", "7999", "9999")
            )
          )
      ) %>%
      ggplot(aes(x = rincome)) +
      geom_bar() +
      coord_flip()
    

    - 自动堆砌值,多值化少值 fct_lump()

    默认的整合方式是从频数最少的水平开始逐步向上合并,并保证合并得到的Other仍然是最小的一组

    一般用于无序数据的整合

    gss_cat %>%
      mutate(relig = fct_lump(relig)) %>%
      count(relig)
    #> # A tibble: 2 x 2
    #>   relig          n
    #>   <fct>      <int>
    #> 1 Protestant 10846
    #> 2 Other      10637
    
    • 可以用参数n指定保留的水平个数(按频数从高到低保留前n个),其余水平合并为Other
    gss_cat %>%
      mutate(relig = fct_lump(relig, n = 10)) %>%
      count(relig, sort = TRUE) %>%
      print(n = Inf)
    #> # A tibble: 10 x 2
    #>    relig                       n
    #>    <fct>                   <int>
    #>  1 Protestant              10846
    #>  2 Catholic                 5124
    #>  3 None                     3523
    #>  4 Christian                 689
    #>  5 Other                     458
    #>  6 Jewish                    388
    #>  7 Buddhism                  147
    #>  8 Inter-nondenominational   109
    #>  9 Moslem/islam              104
    #> 10 Orthodox-christian         95
    
    
    gss_cat %>%
      mutate(relig = fct_lump(relig, n = 5)) %>%
      count(relig, sort = TRUE)
    # # A tibble: 6 x 2
    #   relig          n
    #   <fct>      <int>
    # 1 Protestant 10846
    # 2 Catholic    5124
    # 3 None        3523
    # 4 Other        913
    # 5 Christian    689
    # 6 Jewish       388
    

    相关文章

      网友评论

        本文标题:[R语言] forcats包 因子操作《R for data s

        本文链接:https://www.haomeiwen.com/subject/emjrwhtx.html