美文网首页RNA-seq
TCGA数据下载系列之二:RTCGA

TCGA数据下载系列之二:RTCGA

作者: 白云梦_7 | 来源:发表于2018-07-10 16:30 被阅读82次

    library(RTCGA)

    library(RTCGA.clinical)

    library(RTCGA.rnaseq)

    library(RTCGA.mRNA)

    library(RTCGA.mutations)

    # Fetch the overview table returned by infoTCGA() and browse it as an
    # interactive DT table.
    all_TCGA_cancers <- infoTCGA()
    DT::datatable(all_TCGA_cancers)

    # Extract microarray (mRNA) expression of selected genes from several
    # cohorts.
    expr <- expressionsTCGA(
      BRCA.mRNA, OV.mRNA, LUSC.mRNA,
      extract.cols = c("GATA3", "PTEN", "XBP1", "ESR1", "MUC1")
    )

    # Simplify the cohort labels by dropping the ".mRNA" suffix from the
    # `dataset` column. fixed = TRUE treats the dot as a literal character
    # rather than a regex wildcard.
    expr$dataset <- gsub(".mRNA", "", expr$dataset, fixed = TRUE)

    # Replace the barcodes with short labels ("BRCA1", "BRCA2", ...).
    # The running index is computed within each cohort instead of from
    # hard-coded cohort sizes: paste0() silently recycles a shorter vector, so
    # a wrong size would yield wrong labels without any error.
    expr$bcr_patient_barcode <- paste0(
      expr$dataset,
      ave(seq_along(expr$dataset), expr$dataset, FUN = seq_along)
    )

    # Box plot of one gene's expression across the cohorts.
    library(ggpubr)

    # x axis: the `dataset` column of expr; y axis: GATA3 expression;
    # boxes coloured per dataset using the "jco" palette.
    ggboxplot(
      expr,
      x = "dataset",
      y = "GATA3",
      color = "dataset",
      palette = "jco",
      title = "GATA3",
      ylab = "Expression"
    )

    # 这里 palette 参数使用的是 ggsci 包的配色方案,可选值包括:"npg"、"aaas"、"lancet"、"jco"、"ucscgb"、"uchicago"、"simpsons" 和 "rickandmorty";这些配色是针对常见 SCI 期刊的需求开发的。

    # Add p-values for pairwise comparisons between cohorts.
    # Each list element names the two groups to compare.
    my_comparisons <- list(
      c("BRCA", "OV"),
      c("OV", "LUSC")
    )

    ggboxplot(
      expr,
      x = "dataset",
      y = "GATA3",
      color = "dataset",
      palette = "jco",
      title = "GATA3",
      ylab = "Expression"
    ) +
      stat_compare_means(comparisons = my_comparisons)

    # The same box plot can be drawn with ggplot2 directly.
    library(ggplot2)

    # Columns are referenced by bare name inside aes() so they are looked up
    # in the `data` argument; writing expr$col there bypasses that lookup and
    # is discouraged by ggplot2.
    p <- ggplot(expr, aes(x = dataset, y = GATA3))
    p <- p + geom_boxplot(aes(fill = dataset))

    # "1" - "4" are placeholders showing which label each function controls:
    # x-axis title, y-axis title, plot title and legend title.
    p +
      xlab("1") +
      ylab("2") +
      ggtitle("3") +
      guides(fill = guide_legend(title = "4"))

    # Extra tip: label only the points that satisfy a criterion.
    # The criterion is an expression string over `x` and `y`, handed to
    # ggboxplot() through its label.select argument.
    label.select.criteria <- list(
      criteria = "`y` > 3.9 & `x` %in% c('BRCA', 'OV')"
    )

    ggboxplot(
      expr,
      x = "dataset",
      y = c("GATA3", "PTEN", "XBP1"),
      combine = TRUE,
      ylab = "Expression",
      color = "dataset",
      palette = "jco",
      # column holding the point labels
      label = "bcr_patient_barcode",
      # which labels to display
      label.select = label.select.criteria,
      # label font
      font.label = list(size = 9, face = "italic"),
      # keep label text from overplotting
      repel = TRUE
    )

    # The common case: show several genes together in one combined figure.
    ggboxplot(
      expr,
      x = "dataset",
      y = c("GATA3", "PTEN", "XBP1"),
      combine = TRUE,
      color = "dataset",
      palette = "jco",
      ylab = "Expression"
    )

    # Extract sequencing (RNA-seq) expression of selected genes from several
    # cohorts. Columns must be given as "SYMBOL|EntrezID", i.e. the gene symbol
    # together with its Entrez ID.
    expr <- expressionsTCGA(
      BRCA.rnaseq, OV.rnaseq, LUSC.rnaseq,
      extract.cols = c(
        "GATA3|2625",
        "PTEN|5728",
        "XBP1|7494",
        "ESR1|2099",
        "MUC1|4582"
      )
    )

    # The column name contains "|", so it is wrapped in backticks inside the
    # string passed as `y`.
    ggboxplot(
      expr,
      x = "dataset",
      y = "`GATA3|2625`",
      color = "dataset",
      palette = "jco",
      title = "GATA3|2625",
      ylab = "Expression"
    )

    # Principal component analysis on the complete RNA-seq expression data.
    library(RTCGA.rnaseq)
    # dplyr provides the data-frame verbs (rename, filter) and the %>% pipe
    # used below.
    library(dplyr)

    # Combine the three cohorts, rename `dataset` to `cohort`, and keep only
    # samples whose barcode has "01" at characters 14-15
    # (e.g. "TCGA-GM-A2DA-01A-11R-A18M-07").
    BRCA.OV.LUSC.rnaseq.cancer <-
      expressionsTCGA(BRCA.rnaseq, OV.rnaseq, LUSC.rnaseq) %>%
      dplyr::rename(cohort = dataset) %>%
      filter(substr(bcr_patient_barcode, 14, 15) == "01")

    # Run the PCA grouped by cohort and draw the result.
    pca_plot <- pcaTCGA(BRCA.OV.LUSC.rnaseq.cancer, "cohort")
    plot(pca_plot)

    # Survival analysis using mutation data (TP53 status).
    library(RTCGA.mutations)
    library(survminer)
    library(dplyr)

    # TP53 mutation records from samples whose barcode has "01" at characters
    # 14-15; the barcode is then cut to its first 12 characters so it can be
    # joined to the clinical table on bcr_patient_barcode.
    # Only BRCA and OV are loaded because the clinical table below covers just
    # these two cohorts; rows from any other cohort would be discarded by the
    # left join anyway.
    # distinct() keeps one row per patient: a patient with several TP53
    # records would otherwise be duplicated by the join and counted more than
    # once in the survival curves.
    BRCA_OV.mutations <- mutationsTCGA(BRCA.mutations, OV.mutations) %>%
      filter(Hugo_Symbol == "TP53") %>%
      filter(substr(bcr_patient_barcode, 14, 15) == "01") %>%
      mutate(bcr_patient_barcode = substr(bcr_patient_barcode, 1, 12)) %>%
      distinct(bcr_patient_barcode, .keep_all = TRUE)

    # Survival information for the same cohorts, with the disease code carried
    # along and renamed to `disease`.
    BRCA_OV.clinical <- survivalTCGA(
      BRCA.clinical, OV.clinical,
      extract.cols = "admin.disease_code"
    ) %>%
      dplyr::rename(disease = admin.disease_code)

    # Attach the mutation records to the clinical rows by patient barcode and
    # add a TP53 column: "Mut" when a mutation record was matched
    # (Variant_Classification is not NA), otherwise "WILDorNOINFO".
    BRCA_OV.clinical_mutations <- BRCA_OV.clinical %>%
      left_join(BRCA_OV.mutations, by = "bcr_patient_barcode") %>%
      mutate(
        TP53 = ifelse(!is.na(Variant_Classification), "Mut", "WILDorNOINFO")
      )

    # Keep only the columns needed for the survival plot.
    BRCA_OV.2plot <- select(
      BRCA_OV.clinical_mutations,
      times, patient.vital_status, disease, TP53
    )

    # Survival curves split by TP53 status and disease, with a p-value
    # requested, x axis limited to 0-2000 and a tick every 400.
    km_plot <- kmTCGA(
      BRCA_OV.2plot,
      explanatory.names = c("TP53", "disease"),
      break.time.by = 400,
      xlim = c(0, 2000),
      pval = TRUE
    )

    print(km_plot)


    相关文章

      网友评论

        本文标题:TCGA数据下载系列之二:RTCGA

        本文链接:https://www.haomeiwen.com/subject/iikxpftx.html