R_test

作者: nvzhang | 来源:发表于2020-08-05 21:55 被阅读0次

    题目:
    初级:http://www.bio-info-trainee.com/3793.html
    中级:http://www.bio-info-trainee.com/3750.html
    高级: http://www.bio-info-trainee.com/3415.html

    basic

    #1打开 `Rstudio` 告诉我它的工作目录。
    getwd()
    #2新建6个向量,基于不同的`原子类型`。(重点是字符串,数值,逻辑值)
    R中的原子型向量可以存储以下6种数据类型:双整型(double),整型(integer),字符型(character),逻辑型(logical),复数类型(complex),原始类型(raw)
    dou <- c(1,2,3,4,5,6)
    int <- c(1:5)
    text <- c("Hello", "World")
    logic <- c(TRUE, FALSE, TRUE)
    comp <- c(1+1i, 2+2i, 1+3i)
    raw(3)
    
    #3告诉我在你打开的rstudio里面 `getwd()` 代码运行后返回的是什么?
    
    #4新建一些数据结构,比如矩阵,数组,数据框,列表等重点是数据框,矩阵)
    矩阵:test <- matrix(c(1:9),nrow = 3)
    数组:test <- array(1:27,c(3,3,3))
    数据框:test <- data.frame(c(1,2,3),c(4,5,6),c(7,8,9))
    列表:list <- list(one=c(1,2,3),two=c(4,5,6),three=c(7,8,9))
    
    #5在你新建的数据框进行切片操作,比如首先取第1,3行, 然后取第4,6列
    取第一行:test[c(1),]
    取第二列:test[,c(2)]
    
    #6使用data函数来加载R内置数据集 `rivers` 描述它。并且可以查看更多的R语言内置的数据集:[https://mp.weixin.qq.com/s/dZPbCXccTzuj0KkOL7R31g]
    data('rivers')
    
    #7下载 [https://www.ncbi.nlm.nih.gov/sra?term=SRP133642](https://www.ncbi.nlm.nih.gov/sra?term=SRP133642) 里面的 `RunInfo Table` 文件读入到R里面,了解这个数据框,多少列,每一列都是什么属性的元素。
    SRT <- read.csv("D:/RStudio/working_path/SraRunTable.txt")
    dim(SRT)
    
    #8下载 [https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE111229](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE111229) 里面的`样本信息sample.csv`读入到R里面,了解这个数据框,多少列,每一列都是什么属性的元素。(参考 [https://mp.weixin.qq.com/s/fbHMNXOdwiQX5BAlci8brA]) 获取样本信息sample.csv))
    library(GEOquery)
    GSE_name = 'GSE111229'
    options( 'download.file.method.GEOquery' = 'libcurl' ) 
    gset <- getGEO( GSE_name, getGPL = F )
    save( gset, file = 'gset.Rdata' )
    load("gset.Rdata")
    Gset <- gset[[1]]
    GEO<-pData(Gset)
    
    #9把前面两个步骤的两个表(`RunInfo Table` 文件,样本信息sample.csv)关联起来,使用merge函数。
    identical(SRT$Sample.Name , GEO$geo_accession)  
    m=merge(SRT,GEO,by.x = 'Sample.Name',by.y = 'geo_accession')
    
    #10对前面读取的 RunInfo Table 文件在R里面探索其MBases列,包括 箱线图(boxplot)和五分位数(fivenum),还有频数图(hist),以及密度图(density) 。
    e=m[,c("Bases","title")]
    boxplot(e[[1]])
    fivenum(e[[1]])
    hist(e[[1]])
    plot(density(e[[1]]))
    
    #11把前面读取的样本信息表格的样本名字根据下划线分割看第3列元素的统计情况(第三列代表该样本所在的plate)
    plate=unlist(lapply(e[,2],function(x){
     x
     strsplit(x,'_')[[1]][3]
    }))
    table(plate)
    
    #12根据plate把关联到的 RunInfo Table 信息的MBases列分组检验是否有统计学显著的差异。
    t.test(e[,1]~plate)
    e$plate=plate
    #分组绘制箱线图(boxplot),频数图(hist),以及密度图(density) 
    boxplot(e[,1]~plate)
    
    #13使用ggplot2把上面的图进行重新绘制。
    library(ggplot2)
    colnames(e)
    ggplot(e,aes(x=plate,y=Bases))+geom_boxplot()
    
    #14使用ggpubr把上面的图进行重新绘制。
    library(ggpubr)
    p <- ggboxplot(e, x = "plate", y = "MBases",
     color = "plate", palette = "jco",
     add = "jitter")
    # Add p-value
    p + stat_compare_means(method = 't.test')
    

    medium

    1. 根据R包org.Hs.eg.db找到下面ensembl 基因ID 对应的基因名(symbol)

    library(org.Hs.eg.db)
    g2s=toTable(org.Hs.egSYMBOL)
    g2e=toTable(org.Hs.egENSEMBL)
    #taget.txt
    ensembl_id
    ENSG00000000003.13
    ENSG00000000005.5
    ENSG00000000419.11
    ENSG00000000457.12
    ENSG00000000460.15
    ENSG00000000938.11
    t <- read.csv("D:/RStudio/working_path/target.txt", sep="")
    n=unlist(lapply(t[,1],function(x){
     x
     strsplit(x,'[.]')[[1]][1]}))
    m=data.frame(ensembl_id=n)
    a=merge(g2e,m,by="ensembl_id")
    b=merge(a,g2s,by="gene_id")
    
    图片.png

    2. 根据R包hgu133a.db找到下面探针对应的基因名(symbol)

    3. 找到R包CLL内置的数据集的表达矩阵里面的TP53基因的表达量,并且绘制在 progres.-stable分组的boxplot图

    library(CLL)
    data(sCLLex)
    exprSet=exprs(sCLLex) 
    CLL=pData(sCLLex)
    library(hgu95av2.db)
    ids=toTable(hgu95av2SYMBOL)
    probe_tp53 <- c("1939_at","1974_s_at","31618_at")
    boxplot(exprSet[probe_tp53[i],] ~ CLL$Disease)
    
    图片.png

    4. 找到BRCA1基因在TCGA数据库的乳腺癌数据集([Breast Invasive Carcinoma (TCGA, PanCancer Atlas)]的表达情况

    (http://www.cbioportal.org/datasets)

    rm(list=ls())
    options(stringsAsFactors = F)
    f <- read.csv("plot.txt", sep = "\t")
    colnames(f) <- c("id", "subtype", "expression", "mut")
    library(ggstatsplot)
    ggbetweenstats(data = f, 
                   x = subtype,
                   y = expression)
    library(ggplot2)
    ggsave("e4-BRCA1-TCGA.png")
    
    图片.png
    图片.png

    5找到TP53基因在TCGA数据库的乳腺癌数据集的表达量分组看其是否影响生存

    (http://www.oncolnc.org/)

    rm(list=ls())
    options(stringsAsFactors = F)
    f <- read.csv('BRCA_7157_50_50.csv')
    cp <- f
    library(ggstatsplot)
    ggbetweenstats(data = cp,
                   x = Group,
                   y = Expression)
    library(ggplot2)
    library(survival)
    library(survminer)
    cp$Status <- ifelse(cp$Status == "Dead", 1, 0)
    survf <- survfit(Surv(Days,Status)~Group, data=cp)
    ggsurvplot(survf, conf.int = F, pval = T)
    # complex survplot
    ggsurvplot(survf,palette = c("orange", "navyblue"),
               risk.table = T, pval = T,
               conf.int = T, xlab = "Time of days",
               ggtheme = theme_light(),
               ncensor.plot = T)
    ggsave("survival_TP53_in_BRCA_taga.png")
    
    图片.png

    6. 下载数据集GSE17215的表达矩阵并且提取下面的基因画热图

    ACTR3B ANLN BAG1 BCL2 BIRC5 BLVRA CCNB1 CCNE1 CDC20 CDC6 CDCA1 CDH3 CENPF CEP55 CXXC5 EGFR ERBB2 ESR1 EXO1 FGFR4 FOXA1 FOXC1 GPR160 GRB7 KIF2C KNTC2 KRT14 KRT17 KRT5 MAPT MDM2 MELK MIA MKI67 MLPH MMP11 MYBL2 MYC NAT1 ORC6L PGR PHGDH PTTG1 RRM2 SFRP1 SLC39A6 TMEM45B TYMS UBE2C UBE2T

    rm(list=ls())
    options(stringsAsFactors = F)
    #下载样本
    library(GEOquery)
    GSE_name = 'GSE17215'
    options( 'download.file.method.GEOquery' = 'libcurl' ) 
    geo <- getGEO( GSE_name, getGPL = F )
    save( geo, file = paste0(GSE_name,'.set.Rdata'))
    load(paste0(GSE_name,'.set.Rdata'))
    expr <- exprs(geo[[1]])
    #提取目标基因
    library(hgu133a.db)
    p2s=toTable(hgu133aSYMBOL)
    #剔除没有注释的id
    #expr <- expr[p2s$probe_id,] 
    target <- "ACTR3B ANLN BAG1 BCL2 BIRC5 BLVRA CCNB1 CCNE1 CDC20 CDC6 CDCA1 CDH3 CENPF CEP55 CXXC5 EGFR ERBB2 ESR1 EXO1 FGFR4 FOXA1 FOXC1 GPR160 GRB7 KIF2C KNTC2 KRT14 KRT17 KRT5 MAPT MDM2 MELK MIA MKI67 MLPH MMP11 MYBL2 MYC NAT1 ORC6L PGR PHGDH PTTG1 RRM2 SFRP1 SLC39A6 TMEM45B TYMS UBE2C UBE2T"
    target <- strsplit(target, ' ')[[1]]
    tmp <- dplyr::filter(p2s, p2s$symbol %in% target)
    tmp2 <- tibble::rownames_to_column(data.frame(expr),"probe_id")
    tmp3 <- merge(tmp,tmp2,by="probe_id")
    #计算每个probe的表达均值并画热图
    ##apply函数经常用来计算矩阵中行或列的均值、和值的函数
    ##第一个参数是指要参与计算的矩阵;
    ##第二个参数是指按行计算还是按列计算,1——表示按行计算,2——按列计算;
    ##第三个参数是指具体的运算参数
    tmp3$median <- apply(tmp3[,3:ncol(tmp3)], 1, median)
    #只保留平均表达量最大的探针
    tmp4 <- tmp3[order(tmp3$symbol, tmp3$median, decreasing = F),]
    #去重复,只保留表达量最大的一列
    tmp5 <- tmp4[!duplicated(tmp4$symbol),]
    #将每一行以基因名命名
    rownames(tmp5) <- tmp5$symbol
    #除去“probe_id" "symbol" "median" 三列
    tmp6 <- tmp5[,-c(1,2,ncol(tmp5))]
    #tmp6 <- tmp5[,c(3:8)]
    log2 <- log2(tmp6)
    pheatmap::pheatmap(log2, scale = "row") 
    
    图片.png

    7. 下载数据集GSE24673的表达矩阵计算样本的相关性并且绘制热图,需要标记上样本分组信息

    rm(list=ls())
    options(stringsAsFactors = F)
    #下载样本
    library(GEOquery)
    GSE_name = 'GSE24673'
    options( 'download.file.method.GEOquery' = 'libcurl' ) 
    geo <- getGEO( GSE_name, getGPL = F )
    save( geo, file = paste0(GSE_name,'.set.Rdata'))
    load(paste0(GSE_name,'.set.Rdata'))
    expr <- exprs(geo[[1]])
    pdata <- pData(geo[[1]]) 
    # 根据pdata第八列做一个分组信息矩阵
    grp <- pdata[,8]
    grp_df <- data.frame(group=grp)
    rownames(grp_df) <- colnames(expr)
    cor <- cor(expr)
    pheatmap::pheatmap(cor, annotation_col = grp_df)
    
    图片.png

    8. 找到 GPL6244 platform of Affymetrix Human Gene 1.0 ST Array 对应的R的bioconductor注释包,并且安装它!

    通过GSE序列号下载数据→从得到的list中获取GPL序列号→找到对应的bioconda注释包
    https://mp.weixin.qq.com/s/sVSsV2fWZOQmNd72Vb3SmQ

    9. 下载数据集GSE42872的表达矩阵,并且分别挑选出 所有样本的(平均表达量/sd/mad/)最大的探针,并且找到它们对应的基因。

    SD:Standard Deviation
    MAD:Median absolute deviation,就是先求出给定数据的中位数(注意并非均值)然后原数列的每个值与这个中位数求出绝对差,然后新数列的中位值就是MAD。平均绝对误差可以避免误差相互抵消的问题,因而可以准确反映实际预测误差的大小。

    rm(list=ls())
    options(stringsAsFactors = F)
    #下载样本
    library(GEOquery)
    GSE_name = 'GSE42872'
    options( 'download.file.method.GEOquery' = 'libcurl' ) 
    geo <- getGEO( GSE_name, getGPL = F )
    save( geo, file = paste0(GSE_name,'.set.Rdata'))
    load(paste0(GSE_name,'.set.Rdata'))
    expr <- exprs(geo[[1]])
    library(hugene10sttranscriptcluster.db)
    p2s=toTable(hugene10sttranscriptclusterSYMBOL)
    #剔除没有注释的id
    expr <- expr[p2s$probe_id,] 
    mean <- sort(apply(expr,1,mean),decreasing = T)[1]
    p2s[which(p2s == names(mean)),]
    sd <- sort(apply(expr,1,sd),decreasing = T)[1]
    p2s[which(p2s == names(sd)),]
    mad <- sort(apply(expr,1,mad),decreasing = T)[1]
    p2s[which(p2s == names(mad)),]
    
    图片.png

    10. 下载数据集GSE42872的表达矩阵,并且根据分组使用limma做差异分析,得到差异结果矩阵

    rm(list=ls())
    options(stringsAsFactors = F)
    #下载样本
    library(GEOquery)
    GSE_name = 'GSE42872'
    options( 'download.file.method.GEOquery' = 'libcurl' ) 
    geo <- getGEO( GSE_name, getGPL = F )
    save( geo, file = paste0(GSE_name,'.set.Rdata'))
    load(paste0(GSE_name,'.set.Rdata'))
    expr <- exprs(geo[[1]])
    pdata <- pData(geo[[1]]) 
    
    grp <- unlist(lapply(pdata$title, function(x){
      strsplit(x, ' ')[[1]][4]
    }))
    
    library(limma)
    #limma needs:表达矩阵(expr:需要用log后的矩阵)、分组矩阵(design)、比较矩阵(contrast)
    #表达矩阵:expr
    logexpr <- log2(expr)
    #分组矩阵:design
    design <- model.matrix(~0+factor(grp))
    colnames(design) <- levels(factor(grp))
    rownames(design) <- colnames(expr)
    #比较矩阵:contrast
    contrast<-makeContrasts(paste0(unique(grp),collapse = "-"),levels = design)
    contrast
    #limma差异分析流程
    fit <- lmFit(logexpr,design)
    fit2 <- contrasts.fit(fit, contrast) 
    fit3 <- eBayes(fit2)  
    output = topTable(fit3, coef=1, n=Inf)
    DEG = na.omit(output) 
    # 火山图
    if(T){
      logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )
      
      DEG$change = as.factor(ifelse(DEG$P.Value < 0.05 & abs(DEG$logFC) > logFC_cutoff,
                                    ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT'))
      title <- paste0('log2FoldChange cutoff: ',round(logFC_cutoff,3),
                      '\nUp-regulated genes: ',nrow(DEG[DEG$change =='UP',]) ,
                      '\nDown-regulated genes: ',nrow(DEG[DEG$change =='DOWN',]))
    }
    library(ggplot2)
    plot = ggplot(data=DEG, aes(x=logFC, y=-log10(P.Value), color=change)) +
      geom_point(alpha=0.4, size=1.75) +
      theme_set(theme_set(theme_bw(base_size=20)))+
      xlab("log2FoldChange") + ylab("-log10 p-value") +
      ggtitle( title ) + theme(plot.title = element_text(size=10,hjust = 0.8))+
      scale_colour_manual(values = c('blue','black','red')) # according to the levels(DEG$change)
    print(plot)
    
    图片.png

    advanced

    library(CLL)
    data(sCLLex)
    exprSet=exprs(sCLLex)   
    samples=sampleNames(sCLLex)
    pdata=pData(sCLLex)
    group_list=as.character(pdata[,2])
    library(hgu95av2.db)
    ids=toTable(hgu95av2SYMBOL)
    #6.共有多少个gene
    length(unique(ids$symbol))
    #一个基因对应多少个probe
    table(sort(table(ids$symbol)))
    plot(table(sort(table(ids$symbol))))
    #7. expr中有多少个probe没在注释包中  
    table(rownames(exprSet) %in% ids$probe_id)
    #8. 剔除expr中注释包没有的探针
    exprSet=exprSet[rownames(exprSet) %in% ids$probe_id,]
    #没懂,再一次对expr和ids匹配?
    ids=ids[match(rownames(exprSet),ids$probe_id),]
    #9. 只保留expr中所有样品平均表达量最大的探针
    ##第一种方法
    ###by(data, INDICES, FUN)函数的典型用法: 是将data数据框或矩阵按照INDICES因子水平进行分组,然后对每组应用FUN函数
    tmp = by(exprSet,ids$symbol,
    function(x) rownames(x)[which.max(rowMeans(x))] )
    probes <-  as.character(tmp)
    exprSet_filtered=exprSet[rownames(exprSet) %in% probes ,]
    ###match返回值为匹配上的probe在ids中的行数
    rownames(exprSet_filtered)=ids[match(rownames(exprSet_filtered),ids$probe_id),2]
    exprSet <- exprSet_filtered
    ##第二种方法
    ###ids新建median这一列,列名为median,同时对dat这个矩阵按行操作,取每一行的中位数,将结果给到median这一列的每一行
    ids$median=apply(exprSet,1,median) 
    ###按symbol分组后
    ids=ids[order(ids$symbol,ids$median,decreasing = T),]
    ###将symbol这一列取取出重复项,'!'为否,即取出不重复的项,去除重复的gene ,保留每个基因最大表达量结果
    ids=ids[!duplicated(ids$symbol),]
    ###按照新probe_id,取出exprSet中的各样品表达量
    exprSet=exprSet[ids$probe_id,]
    rownames(exprSet)=ids$symbol
    #对表达量作图
    boxplot(exprSet[,1])
    hist(exprSet[,1])
    density(exprSet[,1])
    boxplot(exprSet['GAPDH',])
    
    boxplot
    hist
    library(reshape2)
    exprSet_L=melt(exprSet)
    colnames(exprSet_L)=c('probe','sample','value')
    exprSet_L$group=rep(group_list,each=nrow(exprSet))
    library(ggplot2) 
    #rep(x, time = , length = , each = ,)
    #each:代表的是对向量中的每个元素进行复制的次数
    p=ggplot(exprSet_L,aes(x=sample,y=value,fill=group))+geom_boxplot()
    print(p)
    p=ggplot(exprSet_L,aes(x=sample,y=value,fill=group))+geom_violin()
    print(p)
    p=ggplot(exprSet_L,aes(value,fill=group))+geom_histogram(bins = 200)+facet_wrap(~sample, nrow = 4)
    print(p)
    p=ggplot(exprSet_L,aes(value,col=group))+geom_density()+facet_wrap(~sample, nrow = 4)
    print(p)
    p=ggplot(exprSet_L,aes(value,col=group))+geom_density() 
    print(p)
    p=ggplot(exprSet_L,aes(x=sample,y=value,fill=group))+geom_boxplot()
    p=p+stat_summary(fun.y="mean",geom="point",shape=23,size=3,fill="red")
    p=p+theme_set(theme_set(theme_bw(base_size=20)))
    p=p+theme(text=element_text(face='bold'),axis.text.x=element_text(angle=30,hjust=1),axis.title=element_blank())
    print(p)
    
    boxplot
    violin
    histogram
    density
    density
    图片.png
    ##12 mean,median,max,min,sd,var,mad
    g_mean <- tail(sort(apply(exprSet,1,mean)),50)
    g_median <- tail(sort(apply(exprSet,1,median)),50)
    g_max <- tail(sort(apply(exprSet,1,max)),50)
    g_min <- tail(sort(apply(exprSet,1,min)),50)
    g_sd <- tail(sort(apply(exprSet,1,sd)),50)
    g_var <- tail(sort(apply(exprSet,1,var)),50)
    g_mad <- tail(sort(apply(exprSet,1,mad)),50)
    g_mad
    names(g_mad)
    ##13 heatmap 
    library(pheatmap)
    choose_gene=names(g_mad)
    choose_matrix=exprSet[choose_gene,]
    #scale()函数按列计算,所以先用t()进行矩阵倒置
    choose_matrix=t(scale(t(choose_matrix)))
    pheatmap(choose_matrix)
    
    图片.png
    ##14 使用UpSetR包来看他们之间的overlap情况。
    library(UpSetR)
    g_all <- unique(c(names(g_mean),names(g_median),names(g_max),names(g_min),
                      names(g_sd),names(g_var),names(g_mad) ))
    dat=data.frame(g_all=g_all,
                   g_mean=ifelse(g_all %in%  names(g_mean) ,1,0),
                   g_median=ifelse(g_all %in%  names(g_median) ,1,0),
                   g_max=ifelse(g_all %in%  names(g_max) ,1,0),
                   g_min=ifelse(g_all %in%  names(g_min) ,1,0),
                   g_sd=ifelse(g_all %in%  names(g_sd) ,1,0),
                   g_var=ifelse(g_all %in%  names(g_var) ,1,0),
                   g_mad=ifelse(g_all %in%  names(g_mad) ,1,0)
    )
    upset(dat,nsets = 7)
    
    overlap
    #16 对样品聚类并且绘图,然后添加样本的临床表型数据信息(更改样本名)
    colnames(exprSet)=paste(group_list,1:22,sep='')
    # Define nodePar
    nodePar <- list(lab.cex = 0.6, pch = c(NA, 19), 
                    cex = 0.7, col = "blue")
    hc=hclust(dist(t(exprSet)))
    #par()修改plot边距
    par(mar=c(5,5,5,10)) 
    plot(as.dendrogram(hc), nodePar = nodePar,  horiz = TRUE)
    
    cluster
    #17 PCA 
    library(ggfortify)
    df=as.data.frame(t(exprSet))
    df$group=group_list 
    autoplot(prcomp( df[,1:(ncol(df)-1)] ), data=df,colour = 'group')
    
    library("FactoMineR")#画主成分分析图需要加载这两个包
    library("factoextra") 
    #现在dat最后一列是group_list,需要重新赋值给一个dat.pca,这个矩阵是不含有分组信息的
    df=as.data.frame(t(exprSet))
    dat.pca <- PCA(df, graph = FALSE)
    fviz_pca_ind(dat.pca,
                 geom.ind = "point", # show points only (nbut not "text")
                 col.ind = group_list, # color by groups
                 # palette = c("#00AFBB", "#E7B800"),
                 addEllipses = TRUE, # Concentration ellipses
                 legend.title = "Groups"
    )
    
    PCA1
    PCA2
    #18 t.test  
    dat = exprSet
    group_list=as.factor(group_list)
    group1 = which(group_list == levels(group_list)[1])
    group2 = which(group_list == levels(group_list)[2])
    dat1 = dat[, group1]
    dat2 = dat[, group2]
    dat3 = cbind(dat1, dat2)
    pvals = apply(exprSet, 1, function(x){
      t.test(as.numeric(x)~group_list)$p.value
    })
    #p值矫正,如何选择method?
    p.adj = p.adjust(pvals, method = "BH")
    avg_1 = rowMeans(dat1)
    avg_2 = rowMeans(dat2)
    log2FC = avg_2-avg_1
    DEG_t.test = cbind(avg_1, avg_2, log2FC, pvals, p.adj)
    DEG_t.test=DEG_t.test[order(DEG_t.test[,4]),]
    #为什么转换成dataframe?
    DEG_t.test=as.data.frame(DEG_t.test)
    
    #19 DEG by limma 
    suppressMessages(library(limma)) 
    ##分组矩阵:design
    design <- model.matrix(~0+factor(group_list))
    colnames(design)=levels(factor(group_list))
    rownames(design)=colnames(exprSet)
    design
    ##比较矩阵:contrast(制定了谁比谁这个规则,这个矩阵声明,我们要把progres.组跟stable进行差异分析比较)
    contrast.matrix<-makeContrasts(paste0(unique(group_list),collapse = "-"),levels = design)
    contrast.matrix 
    ##limma DEG steps
    fit <- lmFit(exprSet,design)
    fit2 <- contrasts.fit(fit, contrast.matrix) ##这一步很重要,大家可以自行看看效果
    fit2 <- eBayes(fit2,trend = TRUE)  
    ## default no trend !!!eBayes() with trend=TRUE
    tempOutput = topTable(fit2, coef=1, n=Inf)
    nrDEG = na.omit(tempOutput) 
    head(nrDEG)
    ## volcano plot
    DEG=nrDEG
    logFC_cutoff <- with(DEG,mean(abs( logFC)) + 2*sd(abs( logFC)) )
    DEG$change = as.factor(ifelse(DEG$P.Value < 0.05 & abs(DEG$logFC) > logFC_cutoff,
                                  ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')
    )
    this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
                        '\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) ,
                        '\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',])
    )
    this_tile
    head(DEG)
    g = ggplot(data=DEG, aes(x=logFC, y=-log10(P.Value), color=change)) +
      geom_point(alpha=0.4, size=1.75) +
      theme_set(theme_set(theme_bw(base_size=20)))+
      xlab("log2 fold change") + ylab("-log10 p-value") +
      ggtitle( this_tile  ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
      scale_colour_manual(values = c('blue','black','red'))  ## corresponding to the levels(res$change)
    print(g)
    
    volcano
    #20 对T检验结果的P值和limma包差异分析的P值画散点图,看看哪些基因相差很大。
    DEG_t.test=DEG_t.test[rownames(nrDEG),]
    #logFC
    plot(DEG_t.test[,3],nrDEG[,1]) 
    #P.value
    plot(DEG_t.test[,4],nrDEG[,4]) 
    plot(-log10(DEG_t.test[,4]),-log10(nrDEG[,4]))
    
    图片.png
    图片.png
    图片.png
    library(ggplot2)
    library(ggpubr)
    my_comparisons <- list(c("stable", "progres."))
    dat=data.frame(group=group_list,
                   sampleID= names(exprSet['DLEU1',]),
                   values= as.numeric(exprSet['DLEU1',]))
    ggboxplot(
      dat, x = "group", y = "values",
      color = "group",
      add = "jitter"
    )+
      stat_compare_means(comparisons = my_comparisons, method = "t.test")
    
    图片.png
    ## 前25个基因的heatmap 
    library(pheatmap)
    choose_gene=head(rownames(nrDEG),25)
    choose_matrix=exprSet[choose_gene,]
    choose_matrix=t(scale(t(choose_matrix)))
    pheatmap(choose_matrix)
    
    图片.png

    相关文章

      网友评论

          本文标题:R_test

          本文链接:https://www.haomeiwen.com/subject/ixvyrktx.html