美文网首页
R语言中级题作业笔记

R语言中级题作业笔记

作者: Michael_fzy | 来源:发表于2019-04-09 19:51 被阅读0次

    题目链接,生信菜鸟团:http://www.bio-info-trainee.com/3750.html
    各位想要看题目的同学可以戳链接看,这里就不放题目了
    其实自己独立写代码还是有些困难,不过借助Jimmy老师的代码我还是磕磕碰碰地把题写完了。还是需要加油啊

    第一题

    rm(list = ls())
    options(stringsAsFactors = F)
    a = read.table(file = "data1.txt")#将基因ID整体读入
    library(stringr)
    b = str_split(a$V1,"[.]",simplify = T)#利用点号将ID分开,去掉没用的版本号,simplify=T以返回矩阵形式
    c = unlist(b)
    library(org.Hs.eg.db)
    g2s=toTable(org.Hs.egSYMBOL)
    g2e=toTable(org.Hs.egENSEMBL)
    colnames(c) = c("ensembl_id","useless")
    x1 = merge(c,g2e,by="ensembl_id",all.x = T)#根据ensembl_id列合并
    x2 = merge(x1,g2s,by="gene_id",all.x = T)#根据gene_id列合并
    x2 = x2[order(x2$gene_id),]
    
    数据结果1.JPG

    第二题

    rm(list = ls())#此题与之前的题思路雷同
    options(stringsAsFactors = F)
    library(hgu133plus2.db)
    ids=toTable(hgu133plus2SYMBOL)
    head(ids)
    a = read.table(file = "data2.txt")
    colnames(a)=c("probe_id")
    x1 = merge(a,ids,by="probe_id",all.x = T)
    

    第三题

    rm(list = ls())
    options(stringsAsFactors = F)
    suppressPackageStartupMessages(library(CLL))#获取表达矩阵
    data(sCLLex)
    sCLLex
    exprSet=exprs(sCLLex) 
    
    pd=pData(sCLLex)
    library(hgu95av2.db) 
    ids=toTable(hgu95av2SYMBOL)
    

    通过数据库来找到TP53的probe_id


    数据结果2.JPG
    boxplot(exprSet['1939_at',] ~ pd$Disease)
    boxplot(exprSet['1974_s_at',] ~ pd$Disease)
    boxplot(exprSet['201746_at',] ~ pd$Disease)
    

    第四题

    rm(list = ls())
    options(stringsAsFactors = F)
    a = read.table(file = "e4-plot.txt",header = T,
               fill = T,sep = "\t")
    colnames(a)=c('id','subtype','expression','mut')
    dat = a
    
    library(ggstatsplot)
    ggbetweenstats(data = dat,x = subtype,y=expression)
    library(ggplot2)
    ggsave('plot-again-BRCA1-TCGA-BRCA-cbioportal.png')
    
    数据结果3

    第五题

    rm(list = ls())
    options(stringsAsFactors = F)
    a = read.csv(file = "LAML_7490_50_50.csv")
    dat =a
    library(ggplot2)
    library(survival)
    library(survminer) 
    table(dat$Status)
    dat$Status=ifelse(dat$Status=='Dead',1,0)
    sfit <- survfit(Surv(Days, Status)~Group, data=dat)
    sfit
    summary(sfit)
    ggsurvplot(sfit, conf.int=F, pval=TRUE)
    

    第六题

    rm(list = ls())
    options(stringsAsFactors = F)
    load('GSE17215_eSet.Rdata')#这里使用了已经读好的数据,读数据的方法其实是一种套路,这里不详细描述
    a=gset[[1]]#获取表达矩阵
    dat=exprs(a)
    library(hgu133a.db)
    ids=toTable(hgu133aSYMBOL)
    head(ids)
    dat=dat[ids$probe_id,]
    ids$median = apply(dat,1,median)
    ids=ids[order(ids$symbol,ids$median,decreasing = T),]#这两行联动降序排列
    ids=ids[!duplicated(ids$symbol),]#删除重复值
    dat=dat[ids$probe_id,]
    rownames(dat)=ids$symbol
    dat[1:4,1:4]  
    dim(dat)
    
    ng='ACTR3B ANLN BAG1 BCL2 BIRC5 BLVRA CCNB1 CCNE1 CDC20 CDC6 CDCA1 CDH3 CENPF CEP55 CXXC5 EGFR ERBB2 ESR1 EXO1 FGFR4 FOXA1 FOXC1 GPR160 GRB7 KIF2C KNTC2 KRT14 KRT17 KRT5 MAPT MDM2 MELK MIA MKI67 MLPH MMP11 MYBL2 MYC NAT1 ORC6L PGR PHGDH PTTG1 RRM2 SFRP1 SLC39A6 TMEM45B TYMS UBE2C UBE2T'
    ng=strsplit(ng,' ')[[1]]#将数据分开
    table(ng %in%  rownames(dat))
    ng=ng[ng %in%  rownames(dat)]
    dat=dat[ng,]
    dat=log2(dat)
    pheatmap::pheatmap(dat,scale = 'row')
    

    第七题

    rm(list = ls())
    options(stringsAsFactors = F)
    # 注意查看下载文件的大小,检查数据 
    f='GSE24673_eSet.Rdata'
    
    library(GEOquery)
    # 这个包需要注意两个配置,一般来说自动化的配置是足够的。
    #Setting options('download.file.method.GEOquery'='auto')
    #Setting options('GEOquery.inmemory.gpl'=FALSE)
    if(!file.exists(f)){
      gset <- getGEO('GSE24673', destdir=".",
                     AnnotGPL = F,     ## 注释文件
                     getGPL = F)       ## 平台文件
      save(gset,file=f)   ## 保存到本地
    }
    load('GSE24673_eSet.Rdata')
    class(gset)
    length(gset)
    class(gset[[1]])
    # 因为这个GEO数据集只有一个GPL平台,所以下载到的是一个含有一个元素的list
    a=gset[[1]]
    dat = exprs(a)
    dim(dat)
    pd=pData(a)
    

    这以上的代码是常见的读入代码

    group_list=c('rbc','rbc','rbc',
                 'rbn','rbn','rbn',
                 'rbc','rbc','rbc',
                 'normal','normal')#添加分组信息
    dat[1:4,1:4]
    dat = dat[apply(dat,1,function(x) sum(x>1) > 5),]#这里试了一下视频里老师的方法,选取了差异大的数据
    dim(dat)
    dat = dat[names(sort(apply(dat,1,mad),decreasing = T)[1:500]),]
    dim(dat)
    M=cor(dat)
    
    pheatmap::pheatmap(M)
    tmp=data.frame(g=group_list)
    rownames(tmp)=colnames(M)
    pheatmap::pheatmap(M,annotation_col = tmp)
    
    Rplot1(筛选后).png
    Rplot03(未筛选).png

    第八题

    查找链接来源,码住
    https://mp.weixin.qq.com/s/sVSsV2fWZOQmNd72Vb3SmQ
    https://www.jianshu.com/p/40b560755cdf

    options()$repos
    options()$BioC_mirror 
    options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
    options("repos" = c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
    BiocManager::install("hugene10sttranscriptcluster.db",ask = F,update = F)
    options()$repos
    options()$BioC_mirror
    

    第九题

    rm(list=ls())
    options(stringsAsFactors = F)
    
    library(GEOquery)
    GSE <- "GSE42872"
    if(!file.exists(GSE)){
      geo <- getGEO(GSE, destdir = '.', getGPL = F, AnnotGPL = F)
      save(geo, file = paste0(GSE,'.eSet.Rdata'))
    }
    load(paste0(GSE,'.eSet.Rdata'))
    expr <- exprs(geo[[1]])
    
    dim(expr)
    expr[1:4,1:4]
    
    # 选出所有样本的(mean/sd/mad/)最大的探针
    
    sort(apply(expr,1,mean),decreasing = T)[1]
    sort(apply(expr,1,sd),decreasing = T)[1]
    sort(apply(expr,1,mad),decreasing = T)[1]
    

    第十题

    # 接第九题,得到表型信息,然后用limma做差异分析
    pdata <- pData(geo[[1]]) 
    grp <- unlist(lapply(pdata$title, function(x){
      strsplit(x, ' ')[[1]][4]
    }))
    
    suppressMessages(library(limma))
    #limma needs:表达矩阵(expr:需要用log后的矩阵)、分组矩阵(design)、比较矩阵(contrast)
    #先做一个分组矩阵~design,说明progres是哪几个样本,stable又是哪几个
    design <- model.matrix(~0+factor(grp))
    colnames(design) <- levels(factor(grp))
    rownames(design) <- colnames(expr)
    design
    #再做一个比较矩阵【一般是case比control】
    contrast<-makeContrasts(paste0(unique(grp),collapse = "-"),levels = design)
    contrast
    
    ##step1
    fit <- lmFit(expr,design)
    ##step2
    fit2 <- contrasts.fit(fit, contrast) 
    fit2 <- eBayes(fit2)  
    ##step3
    mtx = topTable(fit2, coef=1, n=Inf)
    DEG_mtx = na.omit(mtx) 
    View(DEG_mtx)
    
    # 火山图
    
    DEG=DEG_mtx
    
    if(T){
      logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )
      
      DEG$change = as.factor(ifelse(DEG$P.Value < 0.05 & abs(DEG$logFC) > logFC_cutoff,
                                    ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')
      )
      title <- paste0('log2FoldChange cutoff: ',round(logFC_cutoff,3),
                      '\nUp-regulated genes: ',nrow(DEG[DEG$change =='UP',]) ,
                      '\nDown-regulated genes: ',nrow(DEG[DEG$change =='DOWN',])
      )
    }
    library(ggplot2)
    vol1 = ggplot(data=DEG, aes(x=logFC, y=-log10(P.Value), color=change)) +
      geom_point(alpha=0.4, size=1.75) +
      theme_set(theme_set(theme_bw(base_size=20)))+
      xlab("log2FoldChange") + ylab("-log10 p-value") +
      ggtitle( title ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
      scale_colour_manual(values = c('blue','black','red')) # according to the levels(DEG$change)
    print(vol1)
    
    火山图

    相关文章

      网友评论

          本文标题:R语言中级题作业笔记

          本文链接:https://www.haomeiwen.com/subject/cjcnbqtx.html