美文网首页工作生活
2019-07-03R语言中级作业(修改)

2019-07-03R语言中级作业(修改)

作者: LiuYueRR | 来源:发表于2019-07-03 19:04 被阅读0次

    R语言作业(中级)题目链接:http://www.bio-info-trainee.com/3750.html

    作业1 根据R包org.Hs.eg.db找到下面ensembl 基因ID 对应的基因名(symbol)

    #1.加载这个注释包org.Hs.eg.db
    library(org.Hs.eg.db) 
    #了解一下这个包 ?org.Hs.eg.db 发现里面含有一个函数可以查看org.Hs.eg.db包的信息 columns()
    columns(org.Hs.eg.db)
    keytypes(org.Hs.eg.db)
    #了解一个函数toTable(),
    #toTable(x) turns Bimap object x into a data frame
    g2s=toTable(org.Hs.egSYMBOL)
    g2e=toTable(org.Hs.egENSEMBL)
    head(g2s)
    head(g2e)
    #根据R包org.Hs.eg.db找到下面ensembl基因ID对应的基因名
    ensembl_id <- c('ENSG00000000003.13','ENSG00000000005.5','ENSG00000000419.11','ENSG00000000457.12','ENSG00000000460.15','ENSG00000000938.11')
    #切割字符串,得到ensemble_id
    library(stringr)
    str_split(ensembl_id,pattern = '[.]',simplify = T)[,1]
    ensembl_id <- as.data.frame(str_split(ensembl_id,pattern = '[.]',simplify = T)[,1]
    )
    #合并之前,需要将列明进行改变,方便merge
    colnames(ensembl_id) <- c('ensembl_id')
    ensembl = merge(ensembl_id, g2e, by='ensembl_id')
    #根据geneid再次进行合并找到symbol
    work1 <- merge(ensembl,g2s,by='gene_id')
    
    

    作业2 根据R包hgu133a.db找到下面探针对应的基因名(symbol)

    options(BioC_mirror='https://mirrors.ustc.edu.cn/bioc')
    if(!require('hgu133a.db')) BiocManager::install('hgu133a.db',ask = F,update = F)
    library(hgu133a.db)
    columns(hgu133a.db)
    ids=toTable(hgu133aSYMBOL)
    head(ids)
    #找到下面探针对应的基因名(symbol)
    prb <- read.table('names.txt')
    head(prb)
    colnames(prb)='probe_id'
    head(prb)
    work2 <- merge(prb,ids,by='probe_id')
    
    

    作业3 找到R包CLL内置的数据集的表达矩阵里面的TP53基因的表达量,并且绘制在 progres.-stable分组的boxplot图

    #使用CLL包时,需要了解这个包 ?CLL  发现里面有一个data(sCLLex)
    library(CLL)
    data("sCLLex")
    exprSet=exprs(sCLLex)#提取表达矩阵
    dim(exprSet)#表达矩阵的维度
    pData(sCLLex)#sCLLex的分组情况
    sampleNames(sCLLex)#sCLLex的样本名称
    varMetadata(sCLLex)#sCLLex的标签描述
    varLabels(sCLLex)#sCLLex的观测变量
    #寻找TP53基因的表达量,并且绘制在 progres.-stable分组的boxplot图
    head(exprSet)
    #head一下发现exprSet里面是probe_id,需要进行id转换
    library(hgu95av2.db)
    columns(hgu95av2.db)
    g3s=toTable(hgu95av2SYMBOL)
    head(g3s)
    which(g3s[,2]=='TP53')
    g3s[c(966,997,1420),c(1,2)]
    #绘制在 progres.-stable分组的boxplot图
    pdata <- pData(sCLLex)
    boxplot(exprSet[966,] ~ pdata$Disease) 
    boxplot(exprSet[997,] ~ pdata$Disease)
    boxplot(exprSet[1420,] ~ pdata$Disease)
    #ggpubr美化图形
    mean(exprSet['1939_at',])
    mean(exprSet['1974_s_at',])
    mean(exprSet['31618_at',])
    TP53_exprSet <-cbind(as.data.frame(exprSet['1939_at',]),as.data.frame(pdata$Disease))
    colnames(TP53_exprSet)<-c("expression","Disease")
    library(ggpubr)
    p <- ggboxplot(TP53_exprSet, x = "Disease", y = "expression",
                   color = "Disease", palette =c("#00AFBB", "#E7B800"),
                   add = "jitter", shape = "Disease")
    my_comparisons <- list( c("progres.", "stable"))
    p <- p + stat_compare_means(comparisons = my_comparisons)
    p
    
    Rplot1.png
    Rplot2.png

    作业4 作业4 找到BRCA1基因在TCGA数据库的乳腺癌数据集(Breast Invasive Carcinoma (TCGA, PanCancer Atlas))的表达情况

    作业5 找到TP53基因在TCGA数据库的乳腺癌数据集的表达量分组看其是否影响生存

    作业6 下载数据集GSE17215的表达矩阵并且提取下面的基因画热图

    exprSet <- read.table('GSE17215_series_matrix.txt',header = T,comment.char = '!',stringsAsFactors = F)
    
    g6id <- c('ACTR3B','ANLN','BAG1','BCL2' ,'BIRC5' ,'BLVRA', 'CCNB1', 'CCNE1' ,'CDC20' ,'CDC6', 'CDCA1', 'CDH3', 'CENPF', 'CEP55', 'CXXC5', 'EGFR', 'ERBB2', 'ESR1', 'EXO1', 'FGFR4', 'FOXA1', 'FOXC1', 'GPR160', 'GRB7', 'KIF2C', 'KNTC2', 'KRT14', 'KRT17', 'KRT5', 'MAPT', 'MDM2', 'MELK', 'MIA', 'MKI67', 'MLPH', 'MMP11', 'MYBL2', 'MYC', 'NAT1','ORC6L' ,'PGR', 'PHGDH', 'PTTG1', 'RRM2', 'SFRP1', 'SLC39A6', 'TMEM45B', 'TYMS', 'UBE2C', 'UBE2T')
    g6id2 <- data.frame(g6id)
    colnames(g6id2) <- c('symbol')
    GSE17215_annotation <- data.table::fread('GPL3921.annot',header = T)
    g6s <- GSE17215_annotation[,c(1,3)]
    colnames(g6s) <- c('ID','symbol')
    g6id_merge <- merge(g6id2,g6s,by='symbol')
    colnames(exprSet)[1] <- 'REF_ID'
    colnames(g6id_merge)[2] <- 'REF_ID'
    g6_expression <- merge(g6id_merge,exprSet,by='REF_ID')
    g6_expression <- g6_expression[,-1]
    #绘制热图
    library(pheatmap)
    table(g6_expression[,1])
    length(unique(g6_expression[,1]))
    nrow(g6_expression)
    #发现探针ID对应了多个基因ID,因此要对基因ID进行删除重复
    library(dplyr)
    tmp <- g6_expression %>% select(symbol,1:(length(g6_expression))) %>% mutate(rowMean=rowMeans(.[grep('GSM',names(.))])) %>% arrange(desc(rowMean)) %>% distinct(symbol,.keep_all = T) %>% select(-rowMean)
    g6_exp <- g6_expression %>% select(symbol,1:(length(g6_expression))) %>% mutate(rowMean=rowMeans(.[grep('GSM',names(.))])) %>% arrange(desc(rowMean)) %>% distinct(symbol,.keep_all = T) %>% select(-rowMean)
    rownames(g6_exp) <- g6_exp[,1]
    g6_exp <- g6_exp[,-1]
    pheatmap(g6_exp,scale = 'row')
    
    Rplot5.png

    作业7 下载数据集GSE24673的表达矩阵计算样本的相关性并且绘制热图,需要标记上样本分组信息

    exprSet <- read.table('GSE24673_series_matrix.txt',header = T,comment.char = '!',row.names = 1)
    cor(exprSet)
    library(pheatmap)
    pheatmap(cor(exprSet))
    colnames(exprSet)
    col <- colnames(exprSet)
    #添加注释信息
    group_list=c(rep('RB_139_09_TR',3),rep('RB_535_09_TR',3),rep('RB_984_09_TR',3),rep('Human_Healthy_Retina',2))
    group <- data.frame(col,group_list)
    rownames(group) <- group[,1]
    group <- group[,-1]
    group <- as.character(group)
    group <- as.data.frame(group)
    rownames(group) <- colnames(exprSet)
    pheatmap(cor(exprSet),annotation_col = group,annotation_legend = T)
    
    Rplot3.png

    作业8 找到 GPL6244 platform of Affymetrix Human Gene 1.0 ST Array 对应的R的bioconductor注释包,并且安装它

    options()$repos
    options()$BioC_mirror 
    if(length(getOption('BioC_mirror'))==0) options(BioC_mirror='https://mirror.ustc.edu.cn/bioc/')
    if(!require('hugene10sttranscriptcluster.db')) BiocManager::install('hugene10sttranscriptcluster.db',ask = F,update = F)
    

    作业9 下载数据集GSE42872的表达矩阵,并且分别挑选出 所有样本的(平均表达量/sd/mad/)最大的探针,并且找到它们对应的基因。

    rm(list = ls())
    options(stringsAsFactors = F)
    exprSet <- read.table('GSE42872_series_matrix.txt',header = T,comment.char = '!',row.names = 1)
    tmp_mean <- apply(exprSet, 1, mean)
    tmp_mean_1 <- sort(tmp_mean,decreasing = T)[1]
    tmp_mean_1
    tmp_sd <- apply(exprSet, 1, sd)
    tmp_sd_1 <- sort(tmp_sd,decreasing = T)[1]
    tmp_sd_1
    tmp_mad <- apply(exprSet, 1, mad)
    tmp_mad_1 <- sort(tmp_mad,decreasing = T)[1]
    tmp_mad_1
    #寻找到探针对应的基因SYMBOL
    library(hugene10sttranscriptcluster.db)
    g9s <- toTable(hugene10sttranscriptclusterSYMBOL)
    which(g9s$probe_id=='7978905')
    which(g9s$probe_id=='8133876')
    g9s[16473,]
    

    作业10 下载数据集GSE42872的表达矩阵,并且根据分组使用limma做差异分析,得到差异结果矩阵

    library(hugene10sttranscriptcluster.db)
    g9s <- toTable(hugene10sttranscriptclusterSYMBOL)
    which(g9s$probe_id=='7978905')
    which(g9s$probe_id=='8133876')
    g9s[16473,]
    
    ####作业10下载数据集GSE42872的表达矩阵,并且根据分组使用limma做差异分析,得到差异结果矩阵
    rm(list=ls())
    exprSet <- read.table('GSE42872_series_matrix.txt',comment.char = '!',header = T)
    library(hugene10sttranscriptcluster.db)
    g10s <- toTable(hugene10sttranscriptclusterSYMBOL)
    colnames(exprSet)[1] <- 'probe_id'
    exprSet_merge <- merge(exprSet,g10s,by='probe_id')
    nrow(exprSet_merge)
    length(unique(exprSet_merge$symbol))
    table(exprSet_merge$symbol)
    ##探针转化以及去重,得到最终的表达矩阵
    library(dplyr)
    exprSet_expression <- exprSet_merge %>% select(-probe_id) %>%
      select(symbol,1:(length(exprSet)-1)) %>% mutate(rowMean=rowMeans(.[grep('GSM',names(.))]))%>% arrange(desc(rowMean)) %>% distinct(symbol,.keep_all = T) %>% select(-rowMean)
    rownames(exprSet_expression) <- exprSet_expression[,1]
    exprSet_expression <- exprSet_expression[,-1]
    #加载limma包进行差异表达分析
    library(limma)
    group <- c (rep('con',3),rep('treatment',3))
    design <- model.matrix(~factor(group))
    colnames(design) <- c('con','treatment')
    design
    fit <- lmFit(exprSet_expression,design)
    fit2 <- eBayes(fit)
    allDiff=topTable(fit2,adjust='fdr',coef=2,number = Inf)
    diffLab <- subset(allDiff,abs(logFC)>1 & adj.P.Val<0.05)
    save(diffLab,group,allDiff,file = 'diff.Rdata')
    #火山图的绘制
    library(ggplot2)
    allDiff[allDiff$adj.P.Val <0.05 & allDiff$logFC >1,ncol(allDiff)+1]="Up"
    allDiff[allDiff$adj.P.Val <0.05 & allDiff$logFC < -1,ncol(allDiff)]="Down"
    allDiff[allDiff$adj.P.Val>=0.05 | 1 > abs(allDiff$logFC),ncol(allDiff)]="Normal"
    colnames(allDiff)[ncol(allDiff)]="Regulate"
    allDiff$Regulate=factor(allDiff$Regulate,levels = c("Up","Down","Normal"),order=T)
    
    col=c("red","lightseagreen", "black")
    p_volcano=ggplot(allDiff,aes(x=logFC,y=-log10(adj.P.Val)))+
      geom_point(aes(color=allDiff$Regulate),alpha=0.5)+scale_color_manual(values =col)+
      geom_hline(yintercept=c(-log10(0.05)),linetype=4)+
      geom_vline(xintercept=c(-log2(2),log2(2)),linetype=4)+
      theme(
        legend.text = element_text(size = 10, face = "bold"),
        legend.position = 'right',
        legend.key.size=unit(0.5,'cm'),
        axis.text.x=element_text(size = 10,face = "bold", vjust = 0.5, hjust = 0.5),
        axis.text.y=element_text(size = 10,face = "bold", vjust = 0.5, hjust = 0.5),
        axis.title.x = element_text(size = 10,face = "bold", vjust = 0.5, hjust = 0.5),
        axis.title.y = element_text(size = 10,face = "bold", vjust = 0.5, hjust = 0.5),
        
        panel.background = element_rect(fill = "transparent",colour = "black"), 
        panel.grid.minor = element_line(color="lightgrey",size=0.1),
        panel.grid.major = element_line(color="lightgrey",size=0.1),
        plot.background = element_rect(fill = "transparent",colour = "black")
      )
    p_volcano
    
    Rplot4.png

    相关文章

      网友评论

        本文标题:2019-07-03R语言中级作业(修改)

        本文链接:https://www.haomeiwen.com/subject/dtachctx.html