美文网首页生信
生信中级10题

生信中级10题

作者: javen_spring | 来源:发表于2020-05-24 15:25 被阅读0次

    根据R包org.Hs.eg.db找到下面ensembl 基因ID 对应的基因名(symbol)

    genes <- c("ENSG00000000003.13","ENSG00000000005.5","ENSG00000000419.11","ENSG00000000457.12","ENSG00000000460.15","ENSG00000000938.11")
    
    library(org.Hs.eg.db)
    options(stringsAsFactors = F)
    ls("package:org.Hs.eg.db")
    ids <- toTable(org.Hs.egENSEMBL2EG)
    #等同于ids <- toTable(org.Hs.egENSEMBL)
    ids2 <- toTable(org.Hs.egSYMBOL)
    str(ids) #查看提取出的数据框变量的特征
    str(ids2)
    ids$gene_id <- as.numeric(ids$gene_id)  #避免合并后gene_id出现错误
    ids2$gene_id <- as.numeric(ids2$gene_id)#避免合并后gene_id出现错误
    ids_t <- merge(ids,ids2,by="gene_id")
    
    library(stringr)
    genes <- str_match(genes,"([:alnum:]*)[.][:digit:]")[,2]
    match_results <- ids_t[ids_t$ensembl_id%in%genes,]
    match_results
    
    rm(list=ls())  #一键清除
    
    match_results

    根据R包hgu133a.db找到下面探针对应的基因名(symbol)

    probe_ids <- c('1053_at', '117_at','121_at',"1255_g_at","1316_at","1320_at","1405_i_at","1431_at","1438_at","1487_at","1494_f_at","1598_g_at","160020_at","1729_at","177_at")
    
    library(hgu133a.db)
    ls("package:hgu133a.db")
    ids <- toTable(hgu133aSYMBOL)
    symbols_res <- ids[ids$probe_id%in%probe_ids,]
    symbols_res
    
    rm(list=ls())  #一键清除
    
    symbols_res

    找到R包CLL内置的数据集的表达矩阵里面的TP53基因的表达量,并且绘制在 progres.-stable分组的boxplot图

    • 方法一:
    library(CLL)
    ?CLL  #查看帮助文档
    data(sCLLex)
    class(sCLLex)  #查看sCLLex是什么对象,决定了怎么提取其中的数据
    eSet <- sCLLex
    rm(sCLLex)
    options(stringsAsFactors = F)
    exprSet <- exprs(eSet)
    pdata <- pData(eSet)
    str(pdata)  #查看数据框变量特征后知道pdata$Disease为因子,但stable不是对照水平
    group_list <- relevel(pdata$Disease,ref = "stable")  #用relevel()将stable设定为对照水平
    
    gpl <- eSet@annotation  #用@来取出eSet对象中的注释数据
    if (!require(hgu95av2.db)) BiocManager::install("hgu95av2.db")
    library(hgu95av2.db)
    ls("package:hgu95av2.db")
    ids <- toTable(hgu95av2SYMBOL)
    
    expr_df <- as.data.frame(exprSet) #将表达矩阵转为数据框
    expr_df$median <- apply(expr_df,1,median)  #将数据框添加median一列
    expr_df$probe_id <- rownames(expr_df) #添加probe_id一列
    expr_df <- merge(expr_df,ids,by="probe_id")  #将expr_df与ids按照probe_id进行合并
    
    expr_df <- expr_df[order(expr_df$symbol,expr_df$median,decreasing = T),]  #将symbol排序后,再将median从大到小排序
    expr_df <- expr_df[!duplicated(expr_df$symbol),]  #根据symbol去重复
    
    rownames(expr_df) <- expr_df$symbol
    expr_df <- expr_df[,-c(1,ncol(expr_df)-1,ncol(expr_df))]
    
    identical(colnames(expr_df),rownames(pdata))  #确定表达矩阵中的列名和pdata中的行名是否一致
    
    ex_TP53 <- expr_df[rownames(expr_df)== "TP53",]
    
    library(ggpubr)
    library(reshape2)
    ex_TP53 <- melt(ex_TP53,variable.name = "variable",value = "value")
    str(ex_TP53)
    df <- ex_TP53
    identical(df$variable,colnames(expr_df))
    df$group <- group_list
    names(df)[2] <- "expression"
    p <- ggboxplot(df, x = "group", y = "expression",
                   color = "group", palette =c("#00AFBB", "#E7B800", "#FC4E07"),
                   add = "jitter", shape = "group")+
      stat_compare_means(label.y = 4.0,method = "t.test")  
    p
    
    ggpubr结果
    • 方法二:
    suppressPackageStartupMessages(library(CLL))
    data(sCLLex)
    sCLLex  #直接查看数据集(S4对象)中的信息
    exprSet=exprs(sCLLex)   
    ##sCLLex是依赖于CLL这个package的一个对象
    samples=sampleNames(sCLLex)
    samples  #查看samples
    pdata=pData(sCLLex)
    pdata
    group_list=as.character(pdata[,2]) #pdata[,2]提取后为因子,需转为字符串
    dim(exprSet)  #查看基因表达矩阵的维度,即行数及列数(探针数及样本数)
    exprSet[1:5,1:5]  #当基因集较大时,查看部分数据可减少内存消耗
    
    #BiocManager::install('hgu95av2.db')  #安装注释包
    library(hgu95av2.db)
    ls("package:hgu95av2.db") #查看注释包的内容
    
    ids=toTable(hgu95av2SYMBOL)  #toTable可将Bimap对象转为数据框结构
    save(ids,exprSet,pdata,file = 'input.Rdata')
    length(unique(ids$symbol))  #查看共有多少个symbol,一般有多个探针对应一个symbol
    tail(sort(table(ids$symbol)))  ##查看基因对应最多多少个探针
    table(sort(table(ids$symbol)))  #显示总的symbol对应的探针数量并table
    
    plot(table(sort(table(ids$symbol))))
    
    table(rownames(exprSet) %in% ids$probe_id)  #注意两个数据集的先后顺序,%in%表示前者中的变量与后者一一比较,返回布尔值
    dim(exprSet)
    exprSet=exprSet[rownames(exprSet) %in% ids$probe_id,]  #取得与注释包交集后的表达矩阵
    dim(exprSet)
    
    ids=ids[match(rownames(exprSet),ids$probe_id),]  #match相当于%in%,可以匹配返回逻辑值,也可以将前者数据的顺序匹配给后者
    head(ids)
    exprSet[1:5,1:5]
    
    identical(ids$probe_id,rownames(exprSet))  #identical判断两个对象是否一致
    dat=exprSet
    ids$median=apply(dat,1,median) #ids新建median这一列,列名为median,同时对dat这个矩阵按行操作,取每一行的中位数,将结果给到median这一列的每一行
    ids=ids[order(ids$symbol,ids$median,decreasing = T),]#先对ids$symbol排序,在此基础上按照ids$median中位数从大到小排列的顺序排序,将对应的行赋值为一个新的ids
    ids=ids[!duplicated(ids$symbol),]#将symbol这一列取取出重复项,'!'为否,即取出不重复的项,去除重复的gene ,保留每个基因最大表达量结果
    dat=dat[ids$probe_id,] #新的ids取出probe_id这一列,将dat按照取出的这一列中的每一行组成一个新的dat
    rownames(dat)=ids$symbol#把ids的symbol这一列中的每一行给dat作为dat的行名
    dat[1:4,1:4]  #保留每个基因ID第一次出现的信息
    dim(dat)
    
    df <- dat[rownames(dat)== "TP53",]
    identical(colnames(dat),rownames(pdata))
    df <- data.frame(expression = df,group = group_list)
    
    library(ggpubr)
    p <- ggboxplot(df, x = "group", y = "expression",
                   color = "group", palette =c("#00AFBB", "#E7B800", "#FC4E07"),
                   add = "jitter", shape = "group")+
      stat_compare_means(label.y = 4.0,method = "t.test") +
      ylab("relative expression of TP53")
    p
    
    与第一种方法相比,第二种方法的组别应转为因子,并设定对照stable
    • 法三:使用dplyr包
    library(dplyr)
    data(sCLLex)
    class(sCLLex)  #查看sCLLex是什么对象,决定了怎么提取其中的数据
    eSet <- sCLLex
    rm(sCLLex)
    exprSet <- exprs(eSet)
    pdata <- pData(eSet)
    str(pdata)  #查看数据框变量特征后知道pdata$Disease为因子,但stable不是对照水平
    group_list <- relevel(pdata$Disease,ref = "stable")  #用relevel()将stable设定为对照水平
    
    gpl <- eSet@annotation  #用@来取出eSet对象中的注释数据
    if (!require(hgu95av2.db)) BiocManager::install("hgu95av2.db")
    library(hgu95av2.db)
    ls("package:hgu95av2.db")
    ids <- toTable(hgu95av2SYMBOL)
    
    dat <- exprSet%>%
      as.data.frame()%>%
      mutate(probe_id=rownames(exprSet),median = apply(exprSet,1,median))
    identical(dat$probe_id,rownames(exprSet))
    dat <- merge(dat,ids,by = "probe_id")
    dat <- dat[order(dat$symbol,dat$median,decreasing= "T"),]
    table(sort(table(dat$symbol)))
    dat <- dat[!duplicated(dat$symbol),]
    rownames(dat) <- dat$symbol
    dat <- dat[,-c(1,ncol(dat)-1,ncol(dat))]
    
    identical(colnames(dat),rownames(pdata))
    df <- as.numeric(dat[rownames(dat)== "TP53",])
    df <- data.frame(expression = df,group = group_list)
    
    library(ggpubr)
    p <- ggboxplot(df, x = "group", y = "expression",
                   color = "group", palette =c("#00AFBB", "#E7B800", "#FC4E07"),
                   add = "jitter", shape = "group")+
      stat_compare_means(label.y = 4.0,method = "t.test") +
      ylab("relative expression of TP53")
    p
    
    第三种方法ggpubr作图

    找到BRCA1基因在TCGA数据库的乳腺癌数据集(Breast Invasive Carcinoma (TCGA, PanCancer Atlas))的表达情况

    因网络问题,暂无解答。提示:使用http://www.cbioportal.org/index.do 定位数据集:http://www.cbioportal.org/datasets

    找到TP53基因在TCGA数据库的乳腺癌数据集的表达量分组看其是否影响生存

    提示使用:http://www.oncolnc.org/ 数据库

    下载数据集GSE17215的表达矩阵并且提取下面的基因画热图

    genes <- c("ACTR3B", "ANLN","BAG1","mBCL2","BIRC5", "BLVRA", "CCNB1", "CCNE1", "CDC20", "CDC6", "CDCA1", "CDH3", "CENPF", "CEP55", "CXXC5", "EGFR", "ERBB2", "ESR1", "EXO1", "FGFR4", "FOXA1", "FOXC1", "GPR160", "GRB7", "KIF2C", "KNTC2", "KRT14", "KRT17", "KRT5", "MAPT", "MDM2", "MELK", "MIA", "MKI67", "MLPH", "MMP11", "MYBL2", "MYC", "NAT1", "ORC6L", "PGR", "PHGDH", "PTTG1", "RRM2", "SFRP1", "SLC39A6", "TMEM45B", "TYMS", "UBE2C", "UBE2T")
    rm(list=ls())
    library(GEOquery)
    library(stringr)
    library(AnnoProbe)
    library(ggplot2)
    
    gse = "GSE17215"
    
    if(require(AnnoProbe)){
      if(!file.exists(paste0(gse,"_eSet.Rdata"))) geoChina(gse)
      load(paste0(gse,"_eSet.Rdata"))
      eSet <- gset
      rm(gset)
    }else{
      eSet <- getGEO(gse, 
                     destdir = '.', 
                     getGPL = F)
    }
    
    #(1)提取表达矩阵exp
    exp <- exprs(eSet[[1]])
    exp[1:4,1:4]
    exp = log2(exp+1)
    boxplot(exp,cex.axis = 0.75,las = 2,col = rainbow(6))
    #(2)提取临床信息
    pd <- pData(eSet[[1]])
    #(3)提取芯片平台编号
    gpl <- eSet[[1]]@annotation
    
    p = identical(rownames(pd),colnames(exp));p
    if(!p) exp = exp[,match(rownames(pd),colnames(exp))]
    
    #(4)分组信息:
    group_list <- ifelse(str_detect(pd$title,"paclitaxel"),"paclitaxel","salinomycin")
    
    group_list=factor(group_list,levels = c("paclitaxel","salinomycin"))
    
    #(5)下载注释信息:
    if (!require(hthgu133a.db)) BiocManager::install("hthgu133a.db")
    library(hthgu133a.db)
    ls("package:hthgu133a.db")
    ids <- toTable(hthgu133aSYMBOL)
    
    #(6)数据处理:
    table(rownames(exp)%in%ids$probe_id)
    exp <- exp[rownames(exp)%in%ids$probe_id,]
    identical(ids$probe_id,rownames(exp))
    
    exp <- as.data.frame(exp)
    exp$probe_id <- rownames(exp)
    ids$median <- apply(exp,1,median)
    ids <- ids[order(ids$symbol,ids$median,decreasing = T),]
    ids <- ids[!duplicated(ids$symbol),]
    
    exp <- merge(exp,ids,by="probe_id")
    exp <- exp[match(ids$probe_id,exp$probe_id),]
    identical(ids$probe_id,exp$probe_id)
    rownames(exp) <- exp$symbol
    exp <- exp[,-c(1,ncol(exp)-1,ncol(exp))]
    
    dat <- exp[rownames(exp)%in%genes,]
    dat <- as.matrix(dat)
    
    library(RColorBrewer)
    library(pheatmap)
    
    identical(rownames(pd),colnames(dat))
    annotation_col=data.frame(group=group_list)
    rownames(annotation_col)=colnames(dat) 
    
    pheatmap(dat,scale = "row",
             cluster_cols = T,
             cluster_rows = T,
             show_rownames = T,
             show_colnames = F,
             color = colorRampPalette(rev(brewer.pal(n = 7, name =
                                                       "RdYlBu")))(100),
             annotation_col = annotation_col,
             fontsize = 8, #调整图片中文本的大小
             legend = T)
    

    下载数据集GSE24673的表达矩阵计算样本的相关性并且绘制热图,需要标记上样本分组信息

    rm(list=ls())
    library(GEOquery)
    library(stringr)
    library(AnnoProbe)
    library(ggplot2)
    
    gse = "GSE24673"
    
    if(require(AnnoProbe)){
      if(!file.exists(paste0(gse,"_eSet.Rdata"))) geoChina(gse)
      load(paste0(gse,"_eSet.Rdata"))
      eSet <- gset
      rm(gset)
    }else{
      eSet <- getGEO(gse, 
                     destdir = '.', 
                     getGPL = F)
    }
    
    #(1)提取表达矩阵exp
    exp <- exprs(eSet[[1]])
    exp[1:4,1:4]
    #exp = log2(exp+1)
    ncol(exp)
    boxplot(exp,cex.axis = 0.75,las = 2,col = rainbow(11))
    #(2)提取临床信息
    pd <- pData(eSet[[1]])
    #(3)提取芯片平台编号
    gpl <- eSet[[1]]@annotation
    
    p = identical(rownames(pd),colnames(exp));p
    if(!p) exp = exp[,match(rownames(pd),colnames(exp))]
    
    #(4)分组信息:
    group_list <- c(rep(c("139","535","984"),each = 3),rep("normal",2))
    
    group_list=factor(group_list,levels = c("normal","139","535","984"))
    
    #(5)下载注释信息:
    if (!require(hugene10sttranscriptcluster.db)) BiocManager::install("hugene10sttranscriptcluster.db")
    library(hugene10sttranscriptcluster.db)
    ls("package:hugene10sttranscriptcluster.db")
    ids <- toTable(hugene10sttranscriptclusterSYMBOL)
    
    #(6)数据处理:
    table(rownames(exp)%in%ids$probe_id)
    exp <- exp[rownames(exp)%in%ids$probe_id,]
    ids <- ids[ids$probe_id%in%rownames(exp),]
    identical(ids$probe_id,rownames(exp))
    
    ids$median <- apply(exp,1,median)
    ids <- ids[order(ids$symbol,ids$median,decreasing = T),]
    ids <- ids[!duplicated(ids$symbol),]
    
    exp <- as.data.frame(exp)
    exp$probe_id <- rownames(exp)
    exp <- merge(exp,ids,by="probe_id")
    exp <- exp[match(ids$probe_id,exp$probe_id),]
    identical(ids$probe_id,exp$probe_id)
    rownames(exp) <- exp$symbol
    exp <- exp[,-c(1,ncol(exp)-1,ncol(exp))]
    
    #画PCA图
    dat=as.data.frame(t(exp))
    library(FactoMineR)#画主成分分析图需要加载这两个包
    library(factoextra) 
    
    #pca的统一操作走起
    dat.pca <- PCA(dat, graph = FALSE)
    pca_plot <- fviz_pca_ind(dat.pca,
                             geom.ind = "point", # show points only (nbut not "text")
                             col.ind = group_list, # color by groups
                             #palette = c("#00AFBB", "#E7B800"),
                             addEllipses = TRUE, # Concentration ellipses
                             legend.title = "Groups"
    )
    pca_plot  #浓度椭圆
    
    pca_plot <- fviz_pca_ind(dat.pca,
                             geom.ind = "point", # show points only (nbut not "text")
                             col.ind = group_list, # color by groups
                             #palette = c("#00AFBB", "#E7B800"),
                             addEllipses = TRUE, 
                             ellipse.type = "confidence",
                             legend.title = "Groups"
    )
    pca_plot  #置信椭圆
    #参见:http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/112-pca-principal-component-analysis-essentials/
    
    #表达矩阵热图:
    library(RColorBrewer)
    library(pheatmap)
    
    dat=as.data.frame(t(dat))
    identical(rownames(pd),colnames(dat))
    annotation_col=data.frame(group=group_list)
    rownames(annotation_col)=colnames(dat) 
    
    pheatmap(dat,scale = "row",
             cluster_cols = T,
             cluster_rows = T,
             show_rownames = F,
             show_colnames = F,
             color = colorRampPalette(rev(brewer.pal(n = 7, name =
                                                       "RdYlBu")))(100),
             annotation_col = annotation_col,
             fontsize = 8,#调整图片中文本的大小
             legend = T)
    
    #样本相关性热图:
    M <- cor(exp)
    pheatmap::pheatmap(M,annotation_col = annotation_col)  #(存疑)
    
    PCA 表达矩阵热图 样本相关性矩阵

    找到 GPL6244 platform of Affymetrix Human Gene 1.0 ST Array 对应的R的bioconductor注释包,并且安装它!

    GPL6244
    if (!require(hugene10sttranscriptcluster.db)) BiocManager::install("hugene10sttranscriptcluster.db")
    library(hugene10sttranscriptcluster.db)
    ls("package:hugene10sttranscriptcluster.db")
    ids <- toTable(hugene10sttranscriptclusterSYMBOL)
    

    下载数据集GSE42872的表达矩阵,并且分别挑选出 所有样本的(平均表达量/sd/mad/)最大的探针,并且找到它们对应的基因。

    (注释包:hugene10sttranscriptcluster.db)

    rm(list=ls())
    #数据下载
    options(stringsAsFactors = F)
    library(GEOquery)
    gse = "GSE42872"
    if(require(AnnoProbe)){
      if(!file.exists(paste0(gse,"_eSet.Rdata"))) geoChina(gse)
      load(paste0(gse,"_eSet.Rdata"))
      eSet <- gset
      rm(gset)
    }else{
      eSet <- getGEO(gse, 
                     destdir = '.', 
                     getGPL = F)
    }
    #(1)提取表达矩阵exp
    exp <- exprs(eSet[[1]])
    exp[1:4,1:4]
    #exp = log2(exp+1)
    #(2)提取临床信息
    pd <- pData(eSet[[1]])
    #(3)调整pd的行名顺序与exp列名完全一致
    p = identical(rownames(pd),colnames(exp));p
    if(!p) exp = exp[,match(rownames(pd),colnames(exp))]
    #(4)提取芯片平台编号
    gpl <- eSet[[1]]@annotation
    save(gse,pd,exp,gpl,file = paste0(gse,"step1output.Rdata"))
    
    #group_list(实验分组)和ids(芯片注释),每次都需要改
    rm(list = ls())  
    load(file = "GSE42872step1output.Rdata")
    group_list=ifelse(str_detect(pd$title,"Control"),"control","treat")
    #设置参考水平,对照在前,处理在后
    group_list = factor(group_list,
                        levels = c("control","treat"))
    #ids  BioconductorR包
    gpl 
    #http://www.bio-info-trainee.com/1399.html
    if(!require(hugene10sttranscriptcluster.db))BiocManager::install("hugene10sttranscriptcluster.db")
    library(hugene10sttranscriptcluster.db)
    ls("package:hugene10sttranscriptcluster.db")
    ids <- toTable(hugene10sttranscriptclusterSYMBOL)
    head(ids)
    save(exp,group_list,ids,file = paste0(gse,"step2output.Rdata"))
    
    #数据处理:
    table(rownames(exp)%in%ids$probe_id)
    exp <- exp[rownames(exp)%in%ids$probe_id,]
    ids <- ids[ids$probe_id%in%rownames(exp),]
    identical(ids$probe_id,rownames(exp))
    
    ids$median <- apply(exp,1,median) #中位数
    ids$sd <- apply(exp,1,sd) #标准差
    ids$mad <- apply(exp,1,mad) #中位方差
    ids_median <- ids[order(ids$median,decreasing = F),]
    median_max <- ids_median[1,]
    ids_sd <- ids[order(ids$sd,decreasing = T),]
    sd_max <- ids_sd[1,];sd_max
    ids_mad <- ids[order(ids$mad,decreasing = T),]
    mad_max <- ids_mad[1,];mad_max
    
    所有样本的(平均表达量/sd/mad/)最大的探针及基因名

    下载数据集GSE42872的表达矩阵,并且根据分组使用limma做差异分析,得到差异结果矩阵

    load(file = "GSE42872step1output.Rdata")
    load("GSE42872step2output.Rdata")
    
    table(rownames(exp)%in%ids$probe_id)
    exp <- exp[rownames(exp)%in%ids$probe_id,]
    identical(rownames(exp),ids$probe_id)
    exp <- as.data.frame(exp)
    exp$median <- apply(exp,1,median)
    exp$probe_id <- rownames(exp)
    exp_m <- merge(exp,ids,by="probe_id")
    
    exp_m <- exp_m[order(exp_m$symbol,exp_m$median,decreasing = T),]
    exp_m <- exp_m[!duplicated(exp_m$symbol),]
    rownames(exp_m) <- exp_m$symbol
    exp_m <- exp_m[,-c(1,ncol(exp_m)-1,ncol(exp_m))]
    
    library(limma)
    design=model.matrix(~group_list)
    fit=lmFit(exp_m,design)
    fit=eBayes(fit)
    deg=topTable(fit,coef=2,number = Inf)
    save(exp_m,deg,file="GSE42872limma.Rdata")
    

    相关文章

      网友评论

        本文标题:生信中级10题

        本文链接:https://www.haomeiwen.com/subject/grrkahtx.html