美文网首页R语言作业
生信人的20个R语言习题-高级

生信人的20个R语言习题-高级

作者: DrKu | 来源:发表于2020-03-01 17:44 被阅读0次
    1. 安装一些R包:
      数据包: ALL, CLL, pasilla, airway
      软件包:limma,DESeq2,clusterProfiler
      工具包:reshape2
      绘图包:ggplot2
      不同领域的R包使用频率不一样,在生物信息学领域,尤其需要掌握bioconductor系列包。
    if(!require(ALL)) BiocManager::install("ALL")
    if(!require(CLL)) BiocManager::install("CLL")
    if(!require(pasilla)) BiocManager::install("pasilla")
    if(!require(airway)) BiocManager::install("airway")
    if(!require(limma))BiocManager::install("limma")
    if(!require(DESeq2)) BiocManager::install("DESeq2")
    if(!require(clusterProfiler)) BiocManager::install("clusterProfiler")
    if(!require(reshape2))install.packages("reshape2")
    if(!require(ggplot2))install.packages("ggplot2")
    
    1. 了解ExpressionSet对象,比如CLL包里面就有data(sCLLex) ,找到它包含的元素,提取其表达矩阵(使用exprs函数),查看其大小
      A. 参考:http://www.bio-info-trainee.com/bioconductor_China/software/limma.html
      B. 参考:https://github.com/bioconductor-china/basic/blob/master/ExpressionSet.md
    suppressPackageStartupMessages(library(CLL))
    data(sCLLex)
    str(sCLLex)
    exprSet <- exprs(sCLLex)
    dim(exprSet)
    # [1] 12625    22
    
    1. 了解 str,head,help函数,作用于 第二步提取到的表达矩阵
    str(exprSet)
    # num [1:12625, 1:22] 5.74 2.29 3.31 1.09 7.54 ...
    # - attr(*, "dimnames")=List of 2
    # ..$ : chr [1:12625] "1000_at" "1001_at" "1002_f_at" "1003_s_at" ...
    # ..$ : chr [1:22] "CLL11.CEL" "CLL12.CEL" "CLL13.CEL" "CLL14.CEL" ...
    
    head(exprSet)
    # CLL11.CEL CLL12.CEL CLL13.CEL CLL14.CEL CLL15.CEL CLL16.CEL CLL17.CEL CLL18.CEL CLL19.CEL CLL20.CEL CLL21.CEL CLL22.CEL
    # 1000_at    5.743132  6.219412  5.523328  5.340477  5.229904  4.920686  5.325348  4.826131  5.212387  5.285830  5.581859  6.251678
    # 1001_at    2.285143  2.291229  2.287986  2.295313  2.662170  2.278040  2.350796  2.325163  2.432635  2.256547  2.348389  2.263849
    # 1002_f_at  3.309294  3.318466  3.354423  3.327130  3.365113  3.568353  3.502440  3.394410  3.617099  3.279726  3.391734  3.306811
    # 1003_s_at  1.085264  1.117288  1.084010  1.103217  1.074243  1.073097  1.091264  1.076470  1.132558  1.096870  1.138386  1.061184
    # 1004_at    7.544884  7.671801  7.474025  7.152482  6.902932  7.368660  6.456285  6.824862  7.304803  8.757756  6.695295  7.372185
    # 1005_at    5.083793  7.610593  7.631311  6.518594  5.059087  4.855161  5.176975  4.874563  4.097580  9.250585  7.657463  7.683677
    # CLL23.CEL CLL24.CEL CLL2.CEL CLL3.CEL CLL4.CEL CLL5.CEL CLL6.CEL CLL7.CEL CLL8.CEL CLL9.CEL
    # 1000_at    5.480752  5.216033 5.966942 5.397508 5.281720 5.414718 5.460626 5.897821 5.253883 5.214155
    # 1001_at    2.264434  2.344079 2.350073 2.406846 2.341961 2.372928 2.356978 2.222276 2.254772 2.358544
    # 1002_f_at  3.341444  3.798335 3.427736 3.453564 3.412944 3.411922 3.396466 3.247276 3.255148 3.365746
    # 1003_s_at  1.046188  1.082169 1.501367 1.191339 1.139510 1.153548 1.135671 1.074631 1.166247 1.151184
    # 1004_at    7.616749  6.839187 7.545673 8.802729 7.307752 7.491507 8.063202 7.014543 8.019108 7.432331
    # 1005_at    8.700667  5.546996 9.611601 5.732182 6.483191 6.072558 9.919125 5.149411 4.989604 5.339848
    help(str)
    help(head)
    
    1. 安装并了解 hgu95av2.db 包,看看 ls("package:hgu95av2.db") 后 显示的那些变量
    library(hgu95av2.db)
    ls("package:hgu95av2.db")
    # 
    # [1] "hgu95av2"              "hgu95av2.db"           "hgu95av2_dbconn"       "hgu95av2_dbfile"       "hgu95av2_dbInfo"      
    # [6] "hgu95av2_dbschema"     "hgu95av2ACCNUM"        "hgu95av2ALIAS2PROBE"   "hgu95av2CHR"           "hgu95av2CHRLENGTHS"   
    # [11] "hgu95av2CHRLOC"        "hgu95av2CHRLOCEND"     "hgu95av2ENSEMBL"       "hgu95av2ENSEMBL2PROBE" "hgu95av2ENTREZID"     
    # [16] "hgu95av2ENZYME"        "hgu95av2ENZYME2PROBE"  "hgu95av2GENENAME"      "hgu95av2GO"            "hgu95av2GO2ALLPROBES" 
    # [21] "hgu95av2GO2PROBE"      "hgu95av2MAP"           "hgu95av2MAPCOUNTS"     "hgu95av2OMIM"          "hgu95av2ORGANISM"     
    # [26] "hgu95av2ORGPKG"        "hgu95av2PATH"          "hgu95av2PATH2PROBE"    "hgu95av2PFAM"          "hgu95av2PMID"         
    # [31] "hgu95av2PMID2PROBE"    "hgu95av2PROSITE"       "hgu95av2REFSEQ"        "hgu95av2SYMBOL"        "hgu95av2UNIGENE"      
    # [36] "hgu95av2UNIPROT" 
    
    1. 理解 head(toTable(hgu95av2SYMBOL)) 的用法,找到 TP53 基因对应的探针ID
    head(toTable(hgu95av2SYMBOL))
    # probe_id  symbol
    # 1   1000_at   MAPK3
    # 2   1001_at    TIE1
    # 3 1002_f_at CYP2C19
    # 4 1003_s_at   CXCR5
    # 5   1004_at   CXCR5
    # 6   1005_at   DUSP1
    
    ids <- toTable(hgu95av2SYMBOL)
    
    ids[ids$symbol=="TP53",]
    
    # probe_id symbol
    # 966    1939_at   TP53
    # 997  1974_s_at   TP53
    # 1420  31618_at   TP53
    
    1. 理解探针与基因的对应关系,总共多少个基因,基因最多对应多少个探针,是哪些基因,是不是因为这些基因很长,所以在其上面设计多个探针呢?
    length(unique(ids$symbol))
    # [1] 8585
    
    tail(sort(table(ids$symbol)))
    # YME1L1  GAPDH INPP4A    MYB PTGER3  STAT1 
    # 7      8      8      8      8      8 
    
    1. 第二步提取到的表达矩阵是12625个探针在22个样本的表达量矩阵,找到那些不在 hgu95av2.db 包收录的对应着SYMBOL的探针。

    A. 提示:有1165个探针是没有对应基因名字的。

    ids_out = rownames(exprSet[(!rownames(exprSet) %in% ids$probe_id),])
    
    ids_out
    # [1] "1007_s_at" "1047_s_at" "1089_i_at" "108_g_at"  "1090_f_at" "1099_s_at" "1104_s_at" "1106_s_at"
    # [9] "1116_at"   "1122_f_at"
    
    1. 过滤表达矩阵,删除那1165个没有对应基因名字的探针。
    save(exprSet,ids,file = "scllex_expr_ids.Rdata")
    load(file = "scllex_expr_ids.Rdata")
    ids_out = rownames(exprSet[(!rownames(exprSet) %in% ids$probe_id),])
    dim(exprSet)
    # [1] 12625    22 过滤前
    
    exprSet=exprSet[!(rownames(exprSet) %in% ids_out),]
    dim(exprSet)
    # [1] 11460    22 过滤后
    
    1. 整合表达矩阵,多个探针对应一个基因的情况下,只保留在所有样本里面平均表达量最大的那个探针。
      A.提示,理解 tapply,by,aggregate,split 函数 , 首先对每个基因找到最大表达量的探针。
      B. 然后根据得到探针去过滤原始表达矩阵
    exprSet[1:5,1:5]
    # CLL11.CEL CLL12.CEL CLL13.CEL CLL14.CEL CLL15.CEL
    # 1000_at    5.743132  6.219412  5.523328  5.340477  5.229904
    # 1001_at    2.285143  2.291229  2.287986  2.295313  2.662170
    # 1002_f_at  3.309294  3.318466  3.354423  3.327130  3.365113
    # 1003_s_at  1.085264  1.117288  1.084010  1.103217  1.074243
    # 1004_at    7.544884  7.671801  7.474025  7.152482  6.902932
    ids[1:5,1:2]
    # probe_id  symbol
    # 1   1000_at   MAPK3
    # 2   1001_at    TIE1
    # 3 1002_f_at CYP2C19
    # 4 1003_s_at   CXCR5
    # 5   1004_at   CXCR5
    p <- identical(ids$probe_id,rownames(exprSet))
    if(!p) exprSet=exprSet[,match(ids$probe_id,rownames(exprSet))]
    ids$mean <- apply(exprSet,1,mean)
    ids= ids[order(ids$symbol,ids$mean,decreasing = T),]
    dim(ids)
    # [1] 11460     3
    ids=ids[!duplicated(ids$symbol),]
    dim(ids)
    # [1] 8585    3
    
    exprSet=exprSet[ids$probe_id,]
    dim(exprSet)
    # [1] 8585   22
    
    1. 把过滤后的表达矩阵更改行名为基因的symbol,因为这个时候探针和基因是一对一关系了。
    rownames(exprSet)=ids$symbol
    exprSet[1:5,1:5]
    
    # CLL11.CEL CLL12.CEL CLL13.CEL CLL14.CEL CLL15.CEL
    # ZZZ3   6.645791  7.350613  6.333290  6.603640  6.711462
    # ZZEF1  5.289264  6.677600  4.447104  7.008260  6.046429
    # ZYX    3.949769  5.423343  3.540189  5.234420  3.603839
    # ZWINT  4.316881  2.705329  3.131087  2.821306  2.963397
    # ZW10   4.382004  4.355469  4.336743  4.304551  4.482850
    
    1. 对第10步得到的表达矩阵进行探索,先画第一个样本的所有基因的表达量的boxplot,hist,density , 然后画所有样本的 这些图
      A. 参考:http://bio-info-trainee.com/tmp/basic_visualization_for_expression_matrix.html
      B. 理解ggplot2的绘图语法,数据和图形元素的映射关系
    par(mfrow = c(3,1))
    boxplot(exprSet[,1])
    hist(exprSet[,1])
    plot(density(exprSet[,1]))
    
    for (i in 1:ncol(exprSet)) {
      par(mfrow = c(1,3))
      boxplot(exprSet[,i],ylab=colnames(exprSet)[i])
      hist(exprSet[,i],xlab  =colnames(exprSet)[i],main =colnames(exprSet)[i])
      plot(density(exprSet[,i]),main =colnames(exprSet)[i])
    }
    
    
    dat=data.frame(exprSet)
    p = list()
    library(ggplot2)
    for (i in 1:ncol(exprSet)) {
      p[[i]] = ggplot(data = dat,mapping = aes_string(y=colnames(dat)[i]))+
      geom_boxplot()+
      theme_bw()
    }
    library(patchwork)
    wrap_plots(p,nrow = 3,guides = "collect")
    
    1. 理解统计学指标mean,median,max,min,sd,var,mad并计算出每个基因在所有样本的这些统计学指标,最后按照mad值排序,取top 50 mad值的基因,得到列表。
      A. 注意:这个题目出的并不合规,请仔细看。
    dat=data.frame(exprSet)
    dat$mean=apply(exprSet, 1, mean)
    dat$median=apply(exprSet, 1, median)
    dat$max=apply(exprSet, 1, max)
    dat$min=apply(exprSet, 1, min)
    dat$sd=apply(exprSet, 1, sd)
    dat$var=apply(exprSet, 1, var)
    dat$mad=apply(exprSet, 1, mad)
    
    dat=dat[order(dat$mad,decreasing = T),]
    top50_mad =rownames(dat)[1:50]
    top50_mad
    # [1] "FAM30A"   "IGF2BP3"  "DMD"      "TCF7"     "SLAMF1"   "FOS"      "LGALS1"   "IGLC1"    "ZAP70"    "FCN1"     "LHFPL2"   "HBB"     
    # [13] "S100A8"   "GUSBP11"  "COBLL1"   "VIPR1"    "PCDH9"    "IGH"      "ZNF804A"  "TRIB2"    "OAS1"     "CCL3"     "GNLY"     "CYBB"    
    # [25] "VAMP5"    "RNASE6"   "RGS2"     "PLXNC1"   "CAPG"     "RBM38"    "VCAN"     "APBB2"    "ARF6"     "TGFBI"    "NR4A2"    "S100A9"  
    # [37] "ZNF266"   "TSPYL2"   "CLEC2B"   "FLNA"     "H1FX"     "DUSP5"    "DUSP6"    "ANXA4"    "LPL"      "THEMIS2"  "P2RY14"   "ARHGAP44"
    # [49] "TNFSF9"   "PFN2"  
    
    1. 根据第12步骤得到top 50 mad值的基因列表来取表达矩阵的子集,并且热图可视化子表达矩阵。试试看其它5种热图的包的不同效果。
    dat13 <- exprSet[top50_mad,]
    pheatmap::pheatmap(dat13,scale = "row")
    
    1. 取不同统计学指标mean,median,max,min,sd,var,mad的各top50基因列表,使用UpSetR包来看他们之间的overlap情况。
    dat=dat[order(dat$mean,decreasing = T),]
    top50_mean =rownames(dat)[1:50]
    
    dat=dat[order(dat$median,decreasing = T),]
    top50_median =rownames(dat)[1:50]
    
    dat=dat[order(dat$max,decreasing = T),]
    top50_max =rownames(dat)[1:50]
    
    dat=dat[order(dat$min,decreasing = T),]
    top50_min =rownames(dat)[1:50]
    
    dat=dat[order(dat$sd,decreasing = T),]
    top50_sd =rownames(dat)[1:50]
    
    dat=dat[order(dat$var,decreasing = T),]
    top50_var =rownames(dat)[1:50]
    
    input <- c(
      "top50_mean" = 50,
      "top50_median"=50, 
      "top50_max"= 50, 
      "top50_min"= 50,
      "top50_sd"= 50,
      "top50_var"= 50,
      "top50_mad"= 50,
      "top50_mean&top50_median" = length(intersect(top50_mean,top50_median)),
      "top50_mean&top50_min" = length(intersect(top50_mean,top50_min)),
      "top50_mean&top50_max" = length(intersect(top50_mean,top50_max)),
      "top50_mean&top50_sd" = length(intersect(top50_mean,top50_sd)),
      "top50_mean&top50_var" = length(intersect(top50_mean,top50_var)),
      "top50_mean&top50_mad" = length(intersect(top50_mean,top50_mad)),
      "top50_median&top50_max" = length(intersect(top50_mean,top50_max)),
      "top50_median&top50_min" = length(intersect(top50_mean,top50_min)),
      "top50_median&top50_sd" = length(intersect(top50_mean,top50_sd))
      )
    ###列表中两两关系只做了一部分
    data <- UpSetR::fromExpression(input)
    upset(data,nsets = 7)
    
    1. 在第二步的基础上面提取CLL包里面的data(sCLLex) 数据对象的样本的表型数据。
    suppressPackageStartupMessages(library(CLL))
    data(sCLLex)
    pdata <- pData(sCLLex)
    identical(rownames(pdata),colnames(exprSet))
    
    1. 对所有样本的表达矩阵进行聚类并且绘图,然后添加样本的临床表型数据信息(更改样本名)
    Cor <- data.frame(cor(exprSet))
    pheatmap::pheatmap(Cor)
    
    group_list <- pdata$Disease
    annotation_col = data.frame(
      group = group_list
    )
    rownames(annotation_col) =colnames(exprSet)
    pheatmap::pheatmap(Cor,annotation_col = annotation_col)
    
    dat2 <- exprSet
    save(dat2,group_list,annotation_col,file = "16.Rdata")
    
    1. 对所有样本的表达矩阵进行PCA分析并且绘图,同样要添加表型信息。
    load(file = "16.Rdata")
    
    dat3=as.data.frame(t(dat2))
    library(FactoMineR)#画主成分分析图需要加载这两个包
    library(factoextra) 
    # pca的统一操作走起
    dat.pca <- PCA(dat3, graph = FALSE)
    pca_plot <- fviz_pca_ind(dat.pca,
                             geom.ind = "point", # show points only (nbut not "text")
                             col.ind = group_list, # color by groups
                             #palette = c("#00AFBB", "#E7B800"),
                             addEllipses = TRUE, # Concentration ellipses
                             legend.title = "Groups"
    )
    print(pca_plot)
    
    1. 根据表达矩阵及样本分组信息进行批量T检验,得到检验结果表格
    t.test(dat2[1,]~group_list)
    # t = -0.18376, df = 10.212, p-value = 0.8578
    pvals=apply(dat2, 1, function(x){
      t.test(as.numeric(x)~group_list)$p.value
    })
    p.adj=p.adjust(pvals,method = "BH")
    p.adj=data.frame(p.adj)
    head(p.adj)
    
    # p.adj
    # ZZZ3   0.9854230
    # ZZEF1  0.6284552
    # ZYX    0.9756691
    # ZWINT  0.9756691
    # ZW10   0.9848623
    # ZSWIM8 0.9756691
    
    1. 使用limma包对表达矩阵及样本分组信息进行差异分析,得到差异分析表格,重点看logFC和P值,画个火山图(就是logFC和-log10(P值)的散点图。)。
    library(limma)
    
    design <- model.matrix(~0+factor(group_list))
    colnames(design)=levels(factor(group_list))
    head(design)
    exprSet=dat2
    rownames(design)=colnames(exprSet)
    design
    contrast.matrix<-makeContrasts("progres.-stable",
                                   levels = design)
    contrast.matrix ##这个矩阵声明,我们要把 Tumor 组跟 Normal 进行差异分析比较
    
    deg = function(exprSet,design,contrast.matrix){
      ##step1
      fit <- lmFit(exprSet,design)
      ##step2
      fit2 <- contrasts.fit(fit, contrast.matrix) 
      ##这一步很重要,大家可以自行看看效果
      
      fit2 <- eBayes(fit2)  ## default no trend !!!
      ##eBayes() with trend=TRUE
      ##step3
      tempOutput = topTable(fit2, coef=1, n=Inf)
      nrDEG = na.omit(tempOutput) 
      #write.csv(nrDEG2,"limma_notrend.results.csv",quote = F)
      head(nrDEG)
      return(nrDEG)
    }
    
    deg = deg(exprSet,design,contrast.matrix)
    head(deg)
    # logFC AveExpr      t   P.Value adj.P.Val     B
    # TBC1D2B -1.0285   5.621 -5.837 8.241e-06   0.02237 3.352
    # CLIC1    0.9888   9.954  5.773 9.560e-06   0.02237 3.231
    # DLEU1    1.8302   6.951  5.741 1.029e-05   0.02237 3.171
    # SH3BP2  -1.3836   4.463 -5.735 1.042e-05   0.02237 3.160
    # GPM6A    2.5472   6.915  5.043 5.269e-05   0.08731 1.822
    # YTHDC2  -0.5187   7.602 -4.874 7.881e-05   0.08731 1.485
    
    plot(deg$logFC,-log10(deg$P.Value))
    
    
    
    #20. 对T检验结果的P值和limma包差异分析的P值画散点图,看看哪些基因相差很大。
    plot(pvals)
    pvals=apply(dat2, 1, function(x){
      t.test(as.numeric(x)~group_list)$p.value
    })
    pvals=data.frame(pvals)
    p <- identical(rownames(deg),rownames(pvals))
    if(!p) deg=deg[match(rownames(pvals),rownames(deg)),]
    
    
    plot(deg$P.Value,pvals$pvals)
    
    1. 对T检验结果的P值和limma包差异分析的P值画散点图,看看哪些基因相差很大。
    plot(pvals)
    pvals=apply(dat2, 1, function(x){
      t.test(as.numeric(x)~group_list)$p.value
    })
    pvals=data.frame(pvals)
    p <- identical(rownames(deg),rownames(pvals))
    if(!p) deg=deg[match(rownames(pvals),rownames(deg)),]
    
    
    plot(deg$P.Value,pvals$pvals)
    

    相关文章

      网友评论

        本文标题:生信人的20个R语言习题-高级

        本文链接:https://www.haomeiwen.com/subject/vsfhkhtx.html