美文网首页R语言作业
R_basics_Biotrainee

R_basics_Biotrainee

作者: 明莉雅 | 来源:发表于2019-05-07 19:17 被阅读46次

    1.R脚本

    rm(list = ls())
    #gc()
    options(stringsAsFactors = F)
    library(data.table)
    library(tidyr)
    library(dplyr)
    library(pwr)
    library(ggplot2)
    library(ggpubr)
    #get the working directory
    getwd()
    #新建6个向量,基于不同的“原子类型”
    v1 <- letters[1:6];class(v1)
    v2 <- c(1.2,3.4,5,6.7);class(v2)
    v3 <- 1L:6L;class(v3)
    v4 <- c(T,F,T,T,F,F);class(v4)
    v5 <- 2i + 1;class(v5)
    v6 <- charToRaw("Biotrainee");class(v6) 
    
    #新建一些数据结构,数据框data1,矩阵data2,三维数组data3,列表data4
    data1 <- data.frame(c1 = paste0("gene",1:4),
                        c2 = 1:4,
                        c3 = 2:5,
                        c4 = 3:6,
                        row.names = paste0("r",1:4));data1
    class(colnames(data1))
    data2 <- matrix(seq(24,2,-2),nrow = 4,dimnames = list(paste0("r",seq(4)),paste0("c",seq(3))));data2
    data3 <- array(1:12,c(2,3,2));data3
    data4 <- list(data1,data2,data3);data4
    
    #get the subset of data frame & matrix
    (sub1_data1 <- data1[1:2,])
    (sub2_data1 <- data1[,2:3])
    (sub1_data2 <- data2[1:2,])
    (sub2_data2 <- data2[,2:3])
    
    #加载内置数据集`rivers`并加以描述
    d <- rivers;head(d,20)
    class(d)
    length(d)
    
    #统计描述,极值、中位数、四分位数等信息
    (summary(d))
    
    #GSE111229
    name_folder <- "RunInfoTable"
    name_fold <- "SraRunTable.txt"
    runinfo <- data.frame(fread(paste(name_folder,name_fold,sep = "/")))
    dim(runinfo);colnames(runinfo)
    
    #The class of each column in `runinfo`
    sapply(runinfo,class)
    
    #Read `sample`
    sample <- data.frame(fread("sample.csv"))
    dim(sample);colnames(sample)
    
    #The class of each column in `sample`
    sapply(sample,class)
    
    #merge
    merge_GSE111229 <- merge(runinfo,sample,by.x = "Sample_Name",by.y = "Accession")
    
    
    #########################################
    ##############Visualization##############
    #########################################
    mbs <- runinfo$MBases
    #Boxplot of MBases
    boxplot(mbs)
    #fivenum of MBases
    fivenum(mbs)
    #Hist of MBases
    hist(mbs,col = "orange",ylab = "freq of MBases",
         main = "MBases of RunInfoTable--GSE111229")
    #Density of MBases
    density(mbs)
    plot(density(mbs))
    
    ######################################
    ###############分组统计################
    ######################################
    #merge_GSE111229内拆分
    sep_merge <- separate(merge_GSE111229,Title,c("sep1","sep2","plate","sep3"),sep = "_",remove = F)
    select_merge <- select(sep_merge,c("MBases","plate"))#简化数据框,只取两列
    colnames(select_merge)
    #查看分组
    table(sep_merge$plate)##根据plate,分为两组,数量相同,各占一半
    
    #显著性水平为0.05,以plate分组,检验MBases是否有显著差异
    t.test(select_merge$MBases~select_merge$plate)
    #结论:p=0.02<0.05,拒绝原假设,即MBases两组数据均值有显著差异
    
    ###############分组作图################
    boxplot(select_merge$MBases~select_merge$plate)
    #ggplot2
    ggplot(select_merge,aes(x=plate,y=MBases))+geom_boxplot()
    #ggpubr
    ggboxplot(select_merge, x = "plate", y = "MBases", color = "plate", palette = "jco", add = "jitter")
    
    #随机取384条
    samp_merge <- sample_frac(select_merge,0.5)
    #与前面plate合并成新数据框
    comb_sample <- rbind(samp_merge,select_merge);nrow(comb_sample)
    

    2.运行结果及解析

    运行结果-part1

    > rm(list = ls())
    > #gc()
    > options(stringsAsFactors = F)
    > library(data.table)
    > library(tidyr)
    > library(dplyr)
    > library(pwr)
    > library(ggplot2)
    > library(ggpubr)
    > #get the working directory
    > getwd()
    [1] "E:/mingliya/Biotrainee/R_basics"
    > #新建6个向量,基于不同的“原子类型”
    > v1 <- letters[1:6];class(v1)
    [1] "character"
    > v2 <- c(1.2,3.4,5,6.7);class(v2)
    [1] "numeric"
    > v3 <- 1L:6L;class(v3)
    [1] "integer"
    > v4 <- c(T,F,T,T,F,F);class(v4)
    [1] "logical"
    > v5 <- 2i + 1;class(v5)
    [1] "complex"
    > v6 <- charToRaw("Biotrainee");class(v6) 
    [1] "raw"
    > #新建一些数据结构,数据框data1,矩阵data2,三维数组data3,列表data4
    > data1 <- data.frame(c1 = paste0("gene",1:4),
    +                     c2 = 1:4,
    +                     c3 = 2:5,
    +                     c4 = 3:6,
    +                     row.names = paste0("r",1:4));data1
          c1 c2 c3 c4
    r1 gene1  1  2  3
    r2 gene2  2  3  4
    r3 gene3  3  4  5
    r4 gene4  4  5  6
    > class(colnames(data1))
    [1] "character"
    > data2 <- matrix(seq(24,2,-2),nrow = 4,dimnames = list(paste0("r",seq(4)),paste0("c",seq(3))));data2
       c1 c2 c3
    r1 24 16  8
    r2 22 14  6
    r3 20 12  4
    r4 18 10  2
    > data3 <- array(1:12,c(2,3,2));data3
    , , 1
    
         [,1] [,2] [,3]
    [1,]    1    3    5
    [2,]    2    4    6
    
    , , 2
    
         [,1] [,2] [,3]
    [1,]    7    9   11
    [2,]    8   10   12
    
    > data4 <- list(data1,data2,data3);data4
    [[1]]
          c1 c2 c3 c4
    r1 gene1  1  2  3
    r2 gene2  2  3  4
    r3 gene3  3  4  5
    r4 gene4  4  5  6
    
    [[2]]
       c1 c2 c3
    r1 24 16  8
    r2 22 14  6
    r3 20 12  4
    r4 18 10  2
    
    [[3]]
    , , 1
    
         [,1] [,2] [,3]
    [1,]    1    3    5
    [2,]    2    4    6
    
    , , 2
    
         [,1] [,2] [,3]
    [1,]    7    9   11
    [2,]    8   10   12
    
    > #get the subset of data frame & matrix
    > (sub1_data1 <- data1[1:2,])
          c1 c2 c3 c4
    r1 gene1  1  2  3
    r2 gene2  2  3  4
    > (sub2_data1 <- data1[,2:3])
       c2 c3
    r1  1  2
    r2  2  3
    r3  3  4
    r4  4  5
    > (sub1_data2 <- data2[1:2,])
       c1 c2 c3
    r1 24 16  8
    r2 22 14  6
    > (sub2_data2 <- data2[,2:3])
       c2 c3
    r1 16  8
    r2 14  6
    r3 12  4
    r4 10  2
    

    运行结果--part2

    #加载内置数据集`rivers`并加以描述
    > d <- rivers;head(d,20)
     [1]  735  320  325  392  524  450 1459  135  465  600  330  336  280  315  870  906  202  329  290 1000
    > class(d)
    [1] "numeric"
    > is.vector(d)
    [1] TRUE
    > length(d)
    [1] 141
    > #统计描述,极值、中位数、四分位数等信息
    > (summary(d))
       Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      135.0   310.0   425.0   591.2   680.0  3710.0 
    > #GSE111229
    > name_folder <- "RunInfoTable"
    > name_fold <- "SraRunTable.txt"
    > runinfo <- data.frame(fread(paste(name_folder,name_fold,sep = "/")))
    > dim(runinfo)
    [1] 768  31       
    > #The class of each column in `runinfo`
    > sapply(runinfo,class)
             BioSample         Experiment             MBases             MBytes                Run 
           "character"        "character"          "integer"          "integer"        "character" 
            SRA_Sample        Sample_Name         Assay_Type       AssemblyName         AvgSpotLen 
           "character"        "character"        "character"        "character"          "integer" 
            BioProject        Center_Name            Consent DATASTORE_filetype DATASTORE_provider 
           "character"        "character"        "character"        "character"        "character" 
            InsertSize         Instrument      LibraryLayout   LibrarySelection      LibrarySource 
             "integer"        "character"        "character"        "character"        "character" 
              LoadDate           Organism           Platform        ReleaseDate          SRA_Study 
           "character"        "character"        "character"        "character"        "character" 
                   age          cell_type       marker_genes        source_name             strain 
           "character"        "character"        "character"        "character"        "character" 
                tissue 
           "character" 
    
    > #Read `sample`
    > sample <- data.frame(fread("sample.csv"))
    > dim(sample)
    [1] 768  12
    > #The class of each column in `sample`
    > sapply(sample,class)
              Accession               Title         Sample.Type            Taxonomy            Channels 
            "character"         "character"         "character"         "character"           "integer" 
               Platform              Series Supplementary.Types Supplementary.Links       SRA.Accession 
            "character"         "character"         "character"         "character"         "character" 
                Contact        Release.Date 
            "character"         "character" 
    
    > #######merge#######
    > merge_GSE111229 <- merge(runinfo,sample,by.x = "Sample_Name",by.y = "Accession")
    

    运行结果--part3

    > ###############Visualization############
    > mbs <- runinfo$MBases
    > #Boxplot of MBases
    > boxplot(mbs)
    > #fivenum of MBases
    > fivenum(mbs)
    [1]  0  8 12 16 74
    > #Hist of MBases
    > hist(mbs,col = "orange",ylab = "freq of MBases",
    +      main = "MBases of RunInfoTable--GSE111229")
    > #Density of MBases
    > density(mbs)
    
    Call:
        density.default(x = mbs)
    
    Data: mbs (768 obs.);   Bandwidth 'bw' = 1.423
    
           x                y            
     Min.   :-4.269   Min.   :0.0000000  
     1st Qu.:16.366   1st Qu.:0.0000353  
     Median :37.000   Median :0.0003001  
     Mean   :37.000   Mean   :0.0121039  
     3rd Qu.:57.634   3rd Qu.:0.0142453  
     Max.   :78.269   Max.   :0.0665647  
    > plot(density(mbs))
    
    > ###############分组统计################
    > #merge_GSE111229内拆分
    > sep_merge <- separate(merge_GSE111229,Title,c("sep1","sep2","plate","sep3"),sep = "_",remove = F)
    > select_merge <- select(sep_merge,c("MBases","plate"))#简化数据框,只取两列
    > colnames(select_merge)
    [1] "MBases" "plate" 
    > #查看分组
    > table(sep_merge$plate)##根据plate,分为两组,数量相同,各占一半
    
    0048 0049 
     384  384 
    
    > #显著性水平为0.05,以plate分组,检验MBases是否有显著差异
    > t.test(select_merge$MBases~select_merge$plate)
    
        Welch Two Sample t-test
    
    data:  select_merge$MBases by select_merge$plate
    t = 2.3019, df = 728.18, p-value = 0.02162
    alternative hypothesis: true difference in means is not equal to 0
    95 percent confidence interval:
     0.1574805 1.9831445
    sample estimates:
    mean in group 0048 mean in group 0049 
              13.08854           12.01823 
    
    > #结论:p=0.02<0.05,拒绝原假设,即MBases两组数据均值有显著差异
    > ###############分组作图################
    > boxplot(select_merge$MBases~select_merge$plate)
    > #ggplot2
    > ggplot(select_merge,aes(x=plate,y=MBases))+geom_boxplot()
    > #ggpubr
    > ggboxplot(select_merge, x = "plate", y = "MBases", color = "plate", palette = "jco", add = "jitter")
    > #随机取384条
    > samp_merge <- sample_frac(select_merge,0.5)
    > #与前面plate合并成新数据框
    > comb_sample <- rbind(samp_merge,select_merge);nrow(comb_sample)
    [1] 1152
    
    p.s.如有误望大家不吝赐教,提出错误和建议。作者:Leah 如需转载,请注明出处。

    相关文章

      网友评论

        本文标题:R_basics_Biotrainee

        本文链接:https://www.haomeiwen.com/subject/pzgyoqtx.html