美文网首页
R|TCGA|m6AlncRNA

R|TCGA|m6AlncRNA

作者: 高大石头 | 来源:发表于2021-03-01 23:33 被阅读0次

m6A相关的lncRNA怎么进行计算?下面我们就来探究下

1.表达矩阵下载

UCSC Xena是University of California开发的针对多个数据库的综合网站,上面有针对TCGA数据库整理好的RNA-seq表达矩阵。

网址:http://xena.ucsc.edu/

常用的数据是FPKM,可以转换为TPM。

image.png

注意:

  • 1.FPKM是log2(FPKM+1)

  • 2.基因注释版本是gencode.v22.annotation

  • 3.表达矩阵是ENSG格式

2.区分mRNA和lncRNA

打开GENCODE中gtf文件的biotype的说明,可以发现lncRNA包含9种类型。

对lncRNA的说明:

Generic long non-coding RNA biotype that replaced the following biotypes: 3prime_overlapping_ncRNA, antisense, bidirectional_promoter_lncRNA, lincRNA, macro_lncRNA, non_coding, processed_transcript, sense_intronic and sense_overlapping.

# 读取并探索gtf文件
gtf <- rtracklayer::import("D:/jianshu/R-TCGA-m6AlncRNA/gencode.v22.annotation.gtf")
gtf <- as.data.frame(gtf)
head(gtf,6)
##   seqnames start   end width strand source       type score phase
## 1     chr1 11869 14409  2541      + HAVANA       gene    NA    NA
## 2     chr1 11869 14409  2541      + HAVANA transcript    NA    NA
## 3     chr1 11869 12227   359      + HAVANA       exon    NA    NA
## 4     chr1 12613 12721   109      + HAVANA       exon    NA    NA
## 5     chr1 13221 14409  1189      + HAVANA       exon    NA    NA
## 6     chr1 12010 13670  1661      + HAVANA transcript    NA    NA
##             gene_id                          gene_type gene_status gene_name
## 1 ENSG00000223972.5 transcribed_unprocessed_pseudogene       KNOWN   DDX11L1
## 2 ENSG00000223972.5 transcribed_unprocessed_pseudogene       KNOWN   DDX11L1
## 3 ENSG00000223972.5 transcribed_unprocessed_pseudogene       KNOWN   DDX11L1
## 4 ENSG00000223972.5 transcribed_unprocessed_pseudogene       KNOWN   DDX11L1
## 5 ENSG00000223972.5 transcribed_unprocessed_pseudogene       KNOWN   DDX11L1
## 6 ENSG00000223972.5 transcribed_unprocessed_pseudogene       KNOWN   DDX11L1
##   level          havana_gene     transcript_id
## 1     2 OTTHUMG00000000961.2              <NA>
## 2     2 OTTHUMG00000000961.2 ENST00000456328.2
## 3     2 OTTHUMG00000000961.2 ENST00000456328.2
## 4     2 OTTHUMG00000000961.2 ENST00000456328.2
## 5     2 OTTHUMG00000000961.2 ENST00000456328.2
## 6     2 OTTHUMG00000000961.2 ENST00000450305.2
##                      transcript_type transcript_status transcript_name   tag
## 1                               <NA>              <NA>            <NA>  <NA>
## 2               processed_transcript             KNOWN     DDX11L1-002 basic
## 3               processed_transcript             KNOWN     DDX11L1-002 basic
## 4               processed_transcript             KNOWN     DDX11L1-002 basic
## 5               processed_transcript             KNOWN     DDX11L1-002 basic
## 6 transcribed_unprocessed_pseudogene             KNOWN     DDX11L1-001 basic
##   transcript_support_level    havana_transcript exon_number           exon_id
## 1                     <NA>                 <NA>        <NA>              <NA>
## 2                        1 OTTHUMT00000362751.1        <NA>              <NA>
## 3                        1 OTTHUMT00000362751.1           1 ENSE00002234944.1
## 4                        1 OTTHUMT00000362751.1           2 ENSE00003582793.1
## 5                        1 OTTHUMT00000362751.1           3 ENSE00002312635.1
## 6                       NA OTTHUMT00000002844.2        <NA>              <NA>
##           ont protein_id ccdsid
## 1        <NA>       <NA>   <NA>
## 2        <NA>       <NA>   <NA>
## 3        <NA>       <NA>   <NA>
## 4        <NA>       <NA>   <NA>
## 5        <NA>       <NA>   <NA>
## 6 PGO:0000019       <NA>   <NA>

需要的列有:

gene_id:与TCGA数据ENSG格式是一致的

gene_type:区分lncRNA

gene_name:即gene symbol

type:区分gene和其他类型,gene只有60483个

table(gtf$type)
## 
##           gene     transcript           exon            CDS    start_codon 
##          60483         198442        1172082         699443          82228 
##     stop_codon            UTR Selenocysteine 
##          74337         276542            114
if(!file.exists("D:/jianshu/R-TCGA-m6AlncRNA/gtf_gene.Rdata")){
library(tidyverse)
gtf_gene <- gtf[gtf$type=="gene",] %>% #提取type为gene的行
  select(gene_id,gene_name,gene_type)
gtf_mRNA <- gtf_gene[gtf_gene$gene_type=="protein_coding",] #提取mRNA

lnc = c("3prime_overlapping_ncRNA", "antisense", "bidirectional_promoter_lncRNA", "lincRNA", "macro_lncRNA", "non_coding", "processed_transcript", "sense_intronic" , "sense_overlapping")
gtf_lncRNA <- gtf_gene[gtf_gene$gene_type %in% lnc,]
save(gtf_mRNA,gtf_lncRNA,file = "D:/jianshu/R-TCGA-m6AlncRNA/gtf_gene.Rdata")
}

3. m6A和lncRNA求相关性

  • 分别提取m6A基因表达矩阵、lncRNA表达矩阵

  • 删除正常样品

  • 相关性检验: ** a.检测lncRNA sd是否大于0.1; ** b.检验lncRNA 在正常与肿瘤组间是否有差异; ** b.判断p.value是否小于0.05; ** c.判断相关系数(cor)是否大于阈值

rm(list = ls())
library(limma)
corFilter=0.4 #设置相关系数阈值
pvalueFilter=0.001 #设置p值过滤标准

# lncRNA数据处理
rt <- data.table::fread("D:/jianshu/R-TCGA-m6AlncRNA/lncRNA.txt",data.table = F)
rt <- as.matrix(rt) #变为matrix
rownames(rt) <- rt[,1]
exp <- rt[,2:ncol(rt)]
dimnames <- list(rownames(exp),colnames(exp))
data <- matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames = dimnames)
data <- avereps(data)
data <- data[rowMeans(data)>0.1,] #去掉数值比较小的lncRNA

library(tidyverse)
lncRNA <- data[,str_sub(colnames(data),14,15)<10]

# m6A数据处理
rt1 <- data.table::fread("D:/jianshu/R-TCGA-m6AlncRNA/m6aGeneExp.txt",data.table = F)
rt1 <- as.matrix(rt1) #变为matrix
rownames(rt1) <- rt1[,1]
exp1 <- rt1[,2:ncol(rt1)]
dimnames1 <- list(rownames(exp1),colnames(exp1))
m6A <- matrix(as.numeric(as.matrix(exp1)),nrow=nrow(exp1),dimnames = dimnames1)
m6A <- avereps(m6A)
m6A <- m6A[rowMeans(m6A)>0.1,] #去掉数值比较小的lncRNA

m6A <- m6A[,str_sub(colnames(m6A),14,15)<10]

# m6A和lncRNA相关性检验
sampleType <- c(rep(1,ncol(data[,str_sub(colnames(data),14,15)>10])),
                rep(2,ncol(data[,str_sub(colnames(data),14,15)<10])))
outTab=data.frame()
for(i in row.names(lncRNA)){
    if(sd(lncRNA[i,])>0.1){
        test=wilcox.test(data[i,] ~ sampleType)
        if(test$p.value<0.05){
            for(j in row.names(m6A)){
                x=as.numeric(lncRNA[i,])
                y=as.numeric(m6A[j,])
                corT=cor.test(x,y)
                cor=corT$estimate
                pvalue=corT$p.value
                if((cor>corFilter) & (pvalue<pvalueFilter)){
                    outTab=rbind(outTab,cbind(m6A=j,lncRNA=i,cor,pvalue,Regulation="postive"))
                }
                if((cor< -corFilter) & (pvalue<pvalueFilter)){
                    outTab=rbind(outTab,cbind(m6A=j,lncRNA=i,cor,pvalue,Regulation="negative"))
                }
            }
        }
    }
}
write.table(outTab,file = "D:/jianshu/R-TCGA-m6AlncRNA/m6AlncRNA-network.txt",sep = "\t",quote = F,col.names = F)

参考文献:
从TCGA表达矩阵中分别提取mRNA和lncRNA→生信星球(公众号)

相关文章

网友评论

      本文标题:R|TCGA|m6AlncRNA

      本文链接:https://www.haomeiwen.com/subject/wqwzfltx.html