美文网首页ggplot2绘图基因组数据绘图R
过去说complexheatmap太复杂,是我太菜了

过去说complexheatmap太复杂,是我太菜了

作者: 邵扬_Barnett | 来源:发表于2021-02-24 17:00 被阅读0次

    写在前面

    之前曾抱怨过complexheatmap太复杂了,现在我想说是我太菜了。折腾了一圈complexheatmap后,发现complexheatmap真的是极致全面的一个热图包了。

    TidyTuesday

    如果你想入门R或者python,但是没有数据给你折腾而发愁。那么我推荐你去看看TidyTueday。点这里

    TT

    每周二一个新的数据包,覆盖各种类型。比如今天我们折腾的数据就是来自TidyTuesday。想要获取数据非常简单,只需要安装TidyTuesday的包。

    install.packages("tidytuesdayR")
    

    首先读入需要的包。

    #Loading necessary packages
    library(tidytuesdayR)
    library(tidyverse)
    library(DataExplorer) # easy way for data exploring
    

    然后

    # Import data from tidytuesday
    tuesdata <- tidytuesdayR::tt_load('2020-09-01')
    # or
    tuesdata <- tidytuesdayR::tt_load(2020, week = 36)
    --- Compiling #TidyTuesday Information for 2020-09-01 ----
    --- There are 5 files available ---
    --- Starting Download ---
    
        Downloading file 1 of 5: `arable_land_pin.csv`
        Downloading file 2 of 5: `cereal_crop_yield_vs_fertilizer_application.csv`
        Downloading file 3 of 5: `cereal_yields_vs_tractor_inputs_in_agriculture.csv`
        Downloading file 4 of 5: `key_crop_yields.csv`
        Downloading file 5 of 5: `land_use_vs_yield_change_in_cereal_production.csv`
    
    --- Download complete ---
    

    当然,具体请考虑自己的网速。因为数据保存在github上……

    数据可视化

    key_crop_yields <- tuesdata$key_crop_yields
    
    key_crop_yields %>%
      filter(Code != ""& Entity != "World") %>% 
      select(-(2)) %>% 
      create_report(
        config = configure_report(
          add_plot_bar = FALSE
        )
      )
    

    这次使用DataExplorer自动生成报告,当然你可以通过函数查看更具体的内容。

    key_crop_yields %>%
      filter(
        Entity %in% c(
          "Japan",
          "China",
          "United States",
          "Brazil",
          "Indonesia",
          "India",
          "Pakistan",
          "Nigeria",
          "Bangladesh",
          "Russia",
          "Mexico"
        )
      ) %>%
      select(-(2)) %>%
      mutate(Year = as.factor(Year),
             Entity = as.factor(Entity)) %>% plot_boxplot(by = c("Year"))
      
    

    可可的产量很有意思啊。
    或者

    key_crop_yields %>%
      drop_na() %>% 
      filter(Code != "" & Entity != "World" & Year == 2018) %>%
      select(-(1:3)) %>% 
      plot_correlation(type = "c")
    

    当然,也能用ggplot2

    key_crop_yields %>%
      filter(
        Entity %in% c(
          "Japan",
          "China",
          "United States",
          "Brazil",
          "Indonesia",
          "India",
          "Pakistan",
          "Nigeria",
          "Bangladesh",
          "Russia",
          "Mexico"
        )
      ) %>%
      ggplot(., aes(x = Year, y = `Wheat (tonnes per hectare)`, color = Entity)) +
      geom_point(size = .1) +
      geom_smooth(method = 'lm') +
      theme_bw()
    

    尼日利亚的小麦产量居然是下降的,听说他们主粮是一种豆子。中国小麦公顷产增长超级快,那么中国小麦公顷产达到世界前列了么?一张热图告诉你。
    注:这里选择的是人口前11的国家。

    世界小麦公顷产量热图

    hmmm 看来不是…… 什么你问热图怎么做的?complexheatmap 啊!
    代码具体参考kevinblighe/E-MTAB-6141: Data from Lewis, Barnes, Blighe et al., Cell Rep. 2019 Aug 27; 28(9): 2455–2470.e5. (github.com)

    #载入包
    require(RColorBrewer)
    require(ComplexHeatmap)
    require(circlize)
    require(cluster)
    require(countrycode)
    

    然后整理一下数据

    set.seed(2021)
    yield_heatmap <- key_crop_yields %>%
      filter(!is.na(Code)) %>%
      select(Entity, Year, `Wheat (tonnes per hectare)`) %>%
      spread(Year, `Wheat (tonnes per hectare)`) %>% 
      filter(Entity != "World") %>% 
      column_to_rownames(var = "Entity") %>% 
      drop_na()
    
    key_crop_yields %>% 
      filter(str_detect(Entity,"Sudan")) %>% 
      select(Entity, Year, `Wheat (tonnes per hectare)`) %>% 
      drop_na() %>% 
      mutate(Entity = "Sudan") %>% 
      arrange(Year) %>% 
      spread(Year, `Wheat (tonnes per hectare)`) %>% 
      column_to_rownames(var = "Entity") %>% 
      bind_rows(yield_heatmap) -> heat
    

    上面做了两件事,一个是把数据里各种大洲和世界数据去掉了,另一个就是把苏丹数据跟前苏丹的数据合并了。
    注:这里指的是Entity不是Country,防杠还是去掉了一些有争议的地区。

    然后joint一下地理信息,后来发现起始直接通过Code就能获得准确信息了……以后说吧。

    rownames(heat) %>% data.frame(country = .) -> df
    df$Continent <-  countrycode(sourcevar = df[, "country"],
                                 origin = "country.name",
                                 destination = "continent")
    

    接下来是定义五大洲板块的色块注释。

    ann <- data.frame(Continent = df$Continent,
                      row.names = df$country,
                      stringsAsFactors =  FALSE)
    colours <- list(
      Continent = c('Africa' = '#2171B5', 'Asia' = '#6A51A3', 'Europe' = 'red2', 'Americas' = 'green3', 'Oceania' = 'orange')
    )
    
    colAnn <- HeatmapAnnotation(
      df = ann,
      which = 'row', # 'col' (samples) or 'row' (gene) annotation?
      col = colours,
      annotation_height = 0.6,
      annotation_width = unit(1, 'cm'),
      gap = unit(1, 'mm'),
      annotation_legend_param = list(
        Continent = list(
          nrow = 5,
          title = 'Continent',
          title_position = 'topcenter',
          legend_direction = 'vertical',
          title_gp = gpar(fontsize = 12, fontface = 'bold'),
          labels_gp = gpar(fontsize = 12, fontface = 'bold'))
      )
    )
    

    顶部的箱线图注释

    pick.col <- brewer.pal(5, 'Greens')
    boxplotCol <- HeatmapAnnotation(
        Average = anno_boxplot(
          heat,
          border = FALSE,
          gp = gpar(fill = colorRampPalette(pick.col)(length(1:58))),
          pch = '.',
          size = unit(2, 'mm'),
          axis = TRUE,
          axis_param = list(
            gp = gpar(fontsize = 12),
            side = 'left')),
          annotation_width = unit(c(2.0), 'cm'),
          which = 'col')
    

    然后是顶部线图的注释

    heat_anno <- HeatmapAnnotation(
        Average = anno_lines(
          apply(heat, 2, mean),
          border = TRUE,
          gp = gpar(col = "red2"),
          size = unit(2, 'mm'),
          axis = TRUE,
          axis_param = list(
            gp = gpar(fontsize = 10),
            side = 'left')),
        Distribution = anno_boxplot(
          heat,
          border = FALSE,
          gp = gpar(fill = colorRampPalette(pick.col)(length(1:58))),
          pch = '.',
          size = unit(2, 'mm'),
          axis = TRUE,
          axis_param = list(
            gp = gpar(fontsize = 12),
            side = 'left')),
          annotation_width = unit(c(2.0), 'cm'),
          which = 'col')
    

    当然你也可以换成柱状图

    barplotCol <- HeatmapAnnotation(
        Average = anno_barplot(
          apply(heat, 2, mean),
          border = TRUE,
          gp = gpar(fill = colorRampPalette(pick.col)(length(1:58))),
          size = unit(2, 'mm'),
          axis = TRUE,
          axis_param = list(
            gp = gpar(fontsize = 12),
            side = 'left')),
          annotation_width = unit(c(2.0), 'cm'),
          which = 'col')
    

    然后开始画图热图吧
    说个题外话,如果你对魔改complexheatmap有兴趣,强烈建议看看这儿
    ComplexHeatmap Complete Reference (jokergoo.github.io)
    虽然看的人想喊师傅别念了别念了……

    myCol <- colorRampPalette(c('dodgerblue', 'black', 'yellow'))(100) #三色
    myBreaks <- seq(1, 8, length.out = 100)  #设置色彩范围
    pamClusters <- cluster::pam(heat, k = 5) #设置聚类数
    
    Heatmap(
      heat,
      name = 'Wheat
      (tonnes per hectare)', 
      col = colorRamp2(myBreaks, myCol), #这个颜色设定值得学习,做出来很高级
      row_split = factor(pamClusters$clustering, levels = c(5,4,3,1,2)),  #聚类排序,
      cluster_row_slices = FALSE, #不解释,自己看manual
      heatmap_legend_param = list( #设置标签
        color_bar = 'continuous',
        legend_direction = 'vertical',
        legend_width = unit(8, 'cm'),
        legend_height = unit(5.0, 'cm'),
        title_position = 'topcenter',
        title_gp = gpar(fontsize = 12, fontface = 'bold'),
        labels_gp = gpar(fontsize = 12, fontface = 'bold')
      ),
      # row parameters
      cluster_rows = TRUE, #聚类
      show_row_dend = TRUE, #显示聚类
      row_title = "Cluster",
      row_title_gp = gpar(fontsize = 10,  fontface = 'bold'), #字号,粗体
      row_title_rot = 90, 
      show_row_names = TRUE,
      row_names_gp = gpar(fontsize = 10, fontface = 'bold'),
      row_names_side = 'right',
      row_dend_width = unit(25, 'mm'),
      
      # column parameters
      cluster_columns = FALSE,
      column_title = 'Year',
      column_title_side = 'bottom',
      column_title_gp = gpar(fontsize = 10, fontface = 'bold'),
      column_title_rot = 0,
      show_column_names = TRUE,
      column_names_gp = gpar(fontsize = 10, fontface = 'bold'),
      column_names_max_height = unit(10, 'cm'),
      column_dend_height = unit(25, 'mm'),
      # cluster methods for rows and columns
      clustering_distance_rows = function(m) dist(m),
      row_dend_reorder = T,
      clustering_method_rows = 'ward.D2',
      top_annotation = heat_anno,
      right_annotation = colAnn
    ) -> heat_map
    heat_map
    

    是不是很简单呢!做完我都不记得那些参数是干嘛的了……
    那么有没有简单的办法呢!有个低配版,效果嘛……

    yield_mean <- apply(heat, 2, mean) %>% data.frame(average = .) %>% mutate(year = rownames(.))
    
    heat %>%
      mutate(country = rownames(.)) %>%
      left_join(., df, by = "country") %>%
      gather(year, yield, -country, -Continent ) %>%
      left_join(., yield_mean, by = "year") %>%
      as.tibble() %>%
      tidyHeatmap::heatmap(country, year, yield,
                           cluster_columns = FALSE,
                           .scale = "none",
                           transform = NULL) %>%
      tidyHeatmap::add_tile(Continent) %>%
      tidyHeatmap::add_bar(average)
    

    怎么说呢,又不是不能用!


    相关文章

      网友评论

        本文标题:过去说complexheatmap太复杂,是我太菜了

        本文链接:https://www.haomeiwen.com/subject/mojdfltx.html