美文网首页R语言
ggplot2-数据变形和映射(2)

ggplot2-数据变形和映射(2)

作者: Zhigang_Han | 来源:发表于2019-05-22 13:43 被阅读1次

    关于 ggplot2 的使用,我总结了一下,主要包括:数据变形、映射、几何对象、统计变换,以及后期的图片调整与美化。

    1、数据导入和变形

    (1)数据导入及格式
    # Set the working directory so the relative file names below resolve.
    # NOTE(review): this is a machine-specific absolute path; adjust it locally.
    setwd("/media/han/b/rosalind/ggplot2")
    # Import the three tab-separated tables as data frames
    # (entered in the RStudio script pane).
    # TRUE/FALSE are spelled out: T and F are ordinary variables that can be
    # reassigned, whereas TRUE and FALSE are reserved words.
    # Expression table: a Gene column followed by one column per sample.
    gene_exp <- read.table(file = "gene_exp.txt",
                           sep = "\t",
                           header = TRUE,
                           stringsAsFactors = FALSE)
    # Gene length table: Gene, Length.
    gene_len <- read.table(file = "gene_len.txt",
                           sep = "\t",
                           header = TRUE,
                           stringsAsFactors = FALSE)
    # Sample-to-group table: Sample, Group.
    group <- read.table(file = "group.txt",
                        sep = "\t",
                        header = TRUE,
                        stringsAsFactors = FALSE)
    #数据格式
    > head(gene_exp)
      Gene        S1        S2        S3        S4
    1   G1  844.9510 1301.7828 1207.7967 1153.3719
    2   G2 1246.8492  785.4974 1182.4283 1193.2796
    3   G3 1496.4822 1206.2611 1060.4760 1480.8871
    4   G4 1392.3307 1100.6337  687.7282  781.1865
    5   G5 1170.3425  857.2048  916.0348 1092.2339
    6   G6  721.2031 1477.6733 1543.1986  824.7960
             S5        S6        S7        S8
    1  899.1235 1150.6556  957.9256 1728.3804
    2 1922.2217  731.0976  631.0565 1318.9178
    3 1276.8723 1174.2532 1037.7988  842.4523
    4 1586.0368 1176.4862 1082.6887  896.8659
    5  353.4801  808.1170 1246.9023 1066.0504
    6  655.0933  442.7497 1089.1102  859.6822
             S9
    1  356.0858
    2 1399.4487
    3 1419.1017
    4 1295.0609
    5  671.8151
    6 1223.9446
    > head(gene_len)
      Gene Length
    1   G1   1712
    2   G2   1884
    3   G3   2514
    4   G4   1559
    5   G5   1952
    6   G6   2295
    > head(group)
      Sample  Group
    1     S1 group1
    2     S2 group1
    3     S3 group1
    4     S4 group2
    5     S5 group2
    6     S6 group2
    
    (2)数据变形
    library(tidyr)
    library(dplyr)
    # Reshape from wide to long with gather(): every column except Gene is
    # stacked into a Sample (former column name) / Expression (value) pair.
    dexp <- gene_exp %>%
      gather(key = Sample, value = Expression, -Gene)
    # Inspect the reshaped data.
    ## In tidy format every column is a variable and every row is an
    ## observation, so the columns can be referred to directly by name.
    >head(dexp)
      Gene Sample Expression
    1   G1     S1   844.9510
    2   G2     S1  1246.8492
    3   G3     S1  1496.4822
    4   G4     S1  1392.3307
    5   G5     S1  1170.3425
    6   G6     S1   721.2031
    
    (3)合并表格
    # The pipe %>% passes its left-hand side as the first argument of the next
    # call, i.e. into the `a` position of left_join(a, b, by = "Gene").
    # The columns of `a` come first in the result, followed by the columns
    # added from `b`; swap the two tables (left_join(b, a, by = "Gene")) to put
    # b's columns first. Choosing left_join vs right_join changes which table's
    # rows are all kept, not that column order.
    dexp <- gene_exp %>%
      gather(key = Sample, value = Expression, -Gene) %>%
      left_join(gene_len, by = "Gene") %>%
      left_join(group, by = "Sample") %>%
      # select() decides which columns are kept and in what order
      select(Gene, Sample, Group, Expression, Length) %>%
      arrange(Gene)
    > head(dexp)
      Gene Sample  Group Expression Length
    1   G1     S1 group1   844.9510   1712
    2   G1     S2 group1  1301.7828   1712
    3   G1     S3 group1  1207.7967   1712
    4   G1     S4 group2  1153.3719   1712
    5   G1     S5 group2   899.1235   1712
    6   G1     S6 group2  1150.6556   1712
    
    (4)映射
    # Load packages
    library(ggplot2)
    library(dplyr)
    # Reduce the data: keep only group1 rows for genes G1..G10, then drop the
    # Group and Length columns. paste0() builds the gene names without the
    # explicit sep = "" that paste() needs.
    dexp_small <- filter(
      dexp,
      Group == "group1",
      Gene %in% paste0("G", 1:10)
    ) %>%
      select(-Group, -Length)
    >head(dexp_small)
      Gene Sample Expression
    1   G1     S1    844.951
    2   G1     S2   1301.783
    3   G1     S3   1207.797
    4  G10     S1   1407.990
    5  G10     S2    473.370
    6  G10     S3   1134.640
    # ggplot building blocks: mappings, geometric objects, layers.
    # Step one: ggplot() sets up the main layer.
    # aes() is the mapping function: x is the sample name, y is the expression
    # value, colour is the gene; the geometric object is a scatter plot.
    ggplot(
      data = dexp_small,
      mapping = aes(x = Sample, y = Expression, colour = Gene)
    ) +
      geom_point()
    
    ## Mapping types

    # Colour:   color (colour or outline colour), fill (fill colour),
    #           alpha (transparency)
    # Shape:    linetype (line type), size (point size or line width),
    #           shape (point shape)
    # Position: x, y, xmin, xmax, ymin, ymax, xend, yend
    # Special:  group and order; string mappings
    ## Main layer: x and y are shared by every layer added to p.
    p <- ggplot(data = dexp, mapping = aes(x = Sample, y = Expression)) +
      ## Individual geom: one point per row, coloured by gene, sized by gene
      ## length, shaped by group.
      geom_point(
        mapping = aes(colour = Gene, size = Length, shape = Group),
        # transparency: a constant, so it is set outside aes()
        alpha = 8 / 10
      )
    
    ## Grouping

    # Collective geoms:
    ## one box per sample
    p + geom_boxplot(mapping = aes(group = Sample))
    ## one box per experimental group
    p + geom_boxplot(mapping = aes(group = Group))
    # Expression trend of each gene across samples (one line per gene), plus a
    # single smoothed curve: group = 1 puts all rows into one group.
    p +
      geom_line(mapping = aes(group = Gene, colour = Gene)) +
      geom_smooth(mapping = aes(group = 1))
    
    ## Faceting
    # Split one plot into several small panels.
    # facet_wrap() facets by a single variable.
    # facet_wrap(facets, nrow = NULL, ncol = NULL, scales = "fixed",
    #            shrink = TRUE, labeller = "label_value", as.table = TRUE,
    #            switch = NULL, drop = TRUE, dir = "h", strip.position = "top")
    # Key arguments:
    # facets: faceting specification, e.g. ~Group splits the data by Group
    # nrow:   number of panel rows
    # ncol:   number of panel columns; usually only one of nrow/ncol is set
    # scales: "fixed"  - all panels share the same axes;
    #         "free"   - each panel's axes follow its own data range;
    #         "free_x" - only the x axis range is free;
    #         "free_y" - only the y axis range is free.
    p <- ggplot(data = dexp, mapping = aes(x = Sample, y = Expression))
    p +
      geom_point(mapping = aes(colour = Gene, size = Length)) +
      # One panel per Group, laid out in a single row; the x axis of each
      # panel is free to show only that panel's samples.
      facet_wrap(facets = ~Group, nrow = 1, scales = "free_x")
    
    # facet_grid(): facets by two variables.
    # facet_grid(facets, margins = FALSE, scales = "fixed", space = "fixed",
    #            shrink = TRUE, labeller = "label_value", as.table = TRUE,
    #            switch = NULL, drop = TRUE)
    # Key arguments that differ from facet_wrap:
    # facets:  two faceting variables, e.g. Gene ~ Group (rows ~ columns)
    # margins: TRUE adds margin panels that contain all of the data
    # space:   panel size; takes the same kind of values as scales
    #          ("fixed", "free", ...). With "free", panel size follows the
    #          extent of its axis, like auto-fitting to content.
    # Show only the first nine genes (G1..G9). paste0() builds the names
    # without the explicit sep = "" that paste() needs.
    dexp_small <- filter(dexp, Gene %in% paste0("G", 1:9))
    # Main layer
    ps <- ggplot(data = dexp_small, aes(x = Sample, y = Expression))
    # Points coloured by gene length, faceted by gene (rows) and group (columns)
    ps + geom_point(aes(color = Length)) +
      facet_grid(Gene ~ Group, scales = "free", margins = TRUE, space = "free")
    

    相关文章

      网友评论

        本文标题:ggplot2-数据变形和映射(2)

        本文链接:https://www.haomeiwen.com/subject/hkwszqtx.html