美文网首页《生物软件及应用》课程笔记RNA-seq转录组
第二次RNA-seq实战总结(3)-用DESeq2进行基因表达差

第二次RNA-seq实战总结(3)-用DESeq2进行基因表达差

作者: 邱俊辉 | 来源:发表于2018-11-04 14:41 被阅读7次

    DESeq2是一个用于分析基因表达差异的R包,具体操作姚在R语言中运行
    1.R语言安装DESeq2

    >source("https://bioconductor.org/biocLite.R")
    >biocLite("DESeq2")
    

    2.载入基因表达量文件,添加列名

    > setwd("C:\\Users\\18019\\Desktop\\counts")
    > options(stringsAsFactors=FALSE)
    > control1<-read.table("SRR957677_counts.txt",sep = "\t",col.names = c("gene_id","control1"))
    > head(control1)
                   gene_id control1
    1 ENSG00000000003.14_2     1576
    2  ENSG00000000005.5_2        0
    3 ENSG00000000419.12_2      756
    4 ENSG00000000457.13_3      301
    5 ENSG00000000460.16_5      764
    6 ENSG00000000938.12_2        0
    > control2<-read.table("SRR957678_counts.txt",sep = "\t",col.names = c("gene_id","control2"))
    > treat1<-read.table("SRR957679_counts.txt",sep = "\t",col.names = c("gene_id","treat1"))
    >treat2<-read.table("SRR957680_counts.txt",sep = "\t",col.names = c("gene_id","treat2"))
    

    3.数据整合

    > raw_count <- merge(merge(control1, control2, by="gene_id"), merge(treat1, treat2, by="gene_id"))
    > head(raw_count)
                     gene_id control1 control2  treat1  treat2
    1 __alignment_not_unique  7440131  2973831 7861484 8676884
    2            __ambiguous   976485   412543 1014239 1179051
    3           __no_feature  1860117   768637 1289737 1812056
    4          __not_aligned  1198545   572588 1256232 1348068
    5        __too_low_aQual        0        0       0       0
    6   ENSG00000000003.14_2     1576      713    1589    1969
    #删除前五行
    >raw_count_filt <- raw_count[-1:-5,]
    #因为我们无法在EBI数据库上直接搜索找到ENSMUSG00000024045.5这样的基因,只能是ENSMUSG00000024045的整数,没有小数点,所以需要进一步替换为整数的形式。
    #将_后面的数字替换为空赋值给a
    >a<- gsub("\\_\\d*", "", raw_count_filt$gene_id)
    #将.后面的数字替换为空赋值给ENSEMBL
    >ENSEMBL <- gsub("\\.\\d*", "", a)
    # 将ENSEMBL重新添加到raw_count_filt1矩阵
    >row.names(raw_count_filt) <- ENSEMBL
    > raw_count_filt <- cbind(ENSEMBL,raw_count_filt)
    > colnames(raw_count_filt)[1] <- c("ensembl_gene_id")
    >head(raraw_count_filt )
                    ensembl_gene_id              gene_id control1 control2 treat1 treat2
    ENSG00000000003 ENSG00000000003 ENSG00000000003.14_2     1576      713   1589   1969
    ENSG00000000005 ENSG00000000005  ENSG00000000005.5_2        0        0      0      1
    ENSG00000000419 ENSG00000000419 ENSG00000000419.12_2      756      384    806    984
    ENSG00000000457 ENSG00000000457 ENSG00000000457.13_3      301      151    217    324
    ENSG00000000460 ENSG00000000460 ENSG00000000460.16_5      764      312    564    784
    ENSG00000000938 ENSG00000000938 ENSG00000000938.12_2        0        0      0      0
    
    

    4.对基因进行注释-获取gene_symbol
    用bioMart对ensembl_id转换成gene_symbol

    > library("biomaRt")
    > library("curl")
    > mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
    > my_ensembl_gene_id <- row.names(raw_count_filt)
    >  options(timeout = 4000000)
    > hg_symbols<- getBM(attributes=c('ensembl_gene_id','hgnc_symbol',"chromosome_name", "start_position","end_position", "band"), filters= 'ensembl_gene_id', values = my_ensembl_gene_id, mart = mart)
    #将合并后的表达数据框raw_count_filt和注释得到的hg_symbols整合为一
    readcount <- merge(raw_count_filt, hg_symbols, by="ensembl_gene_id")
    > head(readcount)
      ensembl_gene_id              gene_id control1 control2 treat1 treat2 hgnc_symbol chromosome_name start_position
    1 ENSG00000000003 ENSG00000000003.14_2     1576      713   1589   1969      TSPAN6               X      100627109
    2 ENSG00000000005  ENSG00000000005.5_2        0        0      0      1        TNMD               X      100584802
    3 ENSG00000000419 ENSG00000000419.12_2      756      384    806    984        DPM1              20       50934867
    4 ENSG00000000457 ENSG00000000457.13_3      301      151    217    324       SCYL3               1      169849631
    5 ENSG00000000460 ENSG00000000460.16_5      764      312    564    784    C1orf112               1      169662007
    6 ENSG00000000938 ENSG00000000938.12_2        0        0      0      0         FGR               1       27612064
      end_position   band
    1    100639991  q22.1
    2    100599885  q22.1
    3     50958555 q13.13
    4    169894267  q24.2
    5    169854080  q24.2
    6     27635277  p35.3
    #输出count表达矩阵
    > write.csv(readcount, file='readcount_all,csv')
    > readcount<-raw_count_filt[ ,-1:-2]
    > write.csv(readcount, file='readcount.csv')
    > head(readcount)
                    control1 control2 treat1 treat2
    ENSG00000000003     1576      713   1589   1969
    ENSG00000000005        0        0      0      1
    ENSG00000000419      756      384    806    984
    ENSG00000000457      301      151    217    324
    ENSG00000000460      764      312    564    784
    ENSG00000000938        0        0      0      0
    

    5.DEseq2筛选差异表达基因并注释(bioMart)

    #载入数据(countData和colData)
    > mycounts<-readcount
    > head(mycounts)
                    control1 control2 treat1 treat2
    ENSG00000000003     1576      713   1589   1969
    ENSG00000000005        0        0      0      1
    ENSG00000000419      756      384    806    984
    ENSG00000000457      301      151    217    324
    ENSG00000000460      764      312    564    784
    ENSG00000000938        0        0      0      0
    > condition <- factor(c(rep("control",2),rep("treat",2)), levels = c("control","treat"))
    > condition
    [1] control control treat   treat  
    Levels: control treat
    > colData <- data.frame(row.names=colnames(mycounts), condition)
    > colData
             condition
    control1   control
    control2   control
    treat1       treat
    treat2       treat
    

    构建dds对象,开始DESeq流程

    >library("DESeq2")
    > dds <- DESeqDataSetFromMatrix(mycounts, colData, design= ~ condition)
    > dds <- DESeq(dds)
    estimating size factors
    estimating dispersions
    gene-wise dispersion estimates
    mean-dispersion relationship
    final dispersion estimates
    fitting model and testing
    > dds
    class: DESeqDataSet 
    dim: 60880 4 
    metadata(1): version
    assays(4): counts mu H cooks
    rownames(60880): ENSG00000000003 ENSG00000000005 ... ENSG00000285993 ENSG00000285994
    rowData names(22): baseMean baseVar ... deviance maxCooks
    colnames(4): control1 control2 treat1 treat2
    colData names(2): condition sizeFactor
    #查看总体结果
    > res = results(dds, contrast=c("condition", "control", "treat"))
    > res = res[order(res$pvalue),]
    > head(res)
    log2 fold change (MLE): condition control vs treat 
    Wald test p-value: condition control vs treat 
    DataFrame with 6 rows and 6 columns
                            baseMean   log2FoldChange             lfcSE             stat               pvalue
                           <numeric>        <numeric>         <numeric>        <numeric>            <numeric>
    ENSG00000178691 1025.66218695436 2.83012875791025 0.225513526042636 12.5497073615672 3.98981786210676e-36
    ENSG00000135535 2415.77359618136 1.22406336488047 0.183431131037356 6.67314952460929  2.5037106203736e-11
    ENSG00000164172 531.425786834548 1.30449018960413 0.207785830749451 6.27805170785243 3.42841906697453e-10
    ENSG00000172239 483.998634607265 1.31701332235233 0.215453141699223 6.11275988813803 9.79226522597759e-10
    ENSG00000237296 53.0114998109978 2.70139282483841 0.480033904207378 5.62750422660019 1.82835684560772e-08
    ENSG00000196504 3592.67315807893 1.09372324353448 0.200308218929736 5.46020153031335 4.75594407815571e-08
                                    padj
                               <numeric>
    ENSG00000178691 3.90682965057494e-32
    ENSG00000135535 1.22581671973492e-07
    ENSG00000164172 1.11903598346049e-06
    ENSG00000172239 2.39714652731931e-06
    ENSG00000237296                   NA
    ENSG00000196504 9.31404088266014e-05
    > summary(res)
    
    out of 33100 with nonzero total read count
    adjusted p-value < 0.1
    LFC > 0 (up)       : 78, 0.24%
    LFC < 0 (down)     : 15, 0.045%
    outliers [1]       : 0, 0%
    low counts [2]     : 23308, 70%
    (mean count < 135)
    [1] see 'cooksCutoff' argument of ?results
    [2] see 'independentFiltering' argument of ?results
    #这里可以看到有78个基因上调,15个基因下调
    #将分析结果输出
    > write.csv(res,file="All_results.csv")
    

    提取差异表达基因
    这里我用的方法是倍差法
    获取padj(p值经过多重校验校正后的值)小于0.05,表达倍数取以2为对数后大于1或者小于-1的差异表达基因

    > diff_gene_deseq2 <-subset(res, padj < 0.05 & abs(log2FoldChange) > 1)
    > dim(diff_gene_deseq2)
    [1] 21  6
    > head(diff_gene_deseq2)
    log2 fold change (MLE): condition control vs treat 
    Wald test p-value: condition control vs treat 
    DataFrame with 6 rows and 6 columns
                            baseMean   log2FoldChange             lfcSE             stat               pvalue
                           <numeric>        <numeric>         <numeric>        <numeric>            <numeric>
    ENSG00000178691 1025.66218695436 2.83012875791025 0.225513526042636 12.5497073615672 3.98981786210676e-36
    ENSG00000135535 2415.77359618136 1.22406336488047 0.183431131037356 6.67314952460929  2.5037106203736e-11
    ENSG00000164172 531.425786834548 1.30449018960413 0.207785830749451 6.27805170785243 3.42841906697453e-10
    ENSG00000172239 483.998634607265 1.31701332235233 0.215453141699223 6.11275988813803 9.79226522597759e-10
    ENSG00000196504 3592.67315807893 1.09372324353448 0.200308218929736 5.46020153031335 4.75594407815571e-08
    ENSG00000163848 633.066990185649 1.15489622775117 0.219655131372136 5.25777030810433 1.45812478575117e-07
                                    padj
                               <numeric>
    ENSG00000178691 3.90682965057494e-32
    ENSG00000135535 1.22581671973492e-07
    ENSG00000164172 1.11903598346049e-06
    ENSG00000172239 2.39714652731931e-06
    ENSG00000196504 9.31404088266014e-05
    ENSG00000163848 0.000230253090268928
    #输出差异基因
    > write.csv(diff_gene_deseq2,file= "DEG_treat_vs_control.csv")
    #用bioMart对差异表达基因进行注释
    > library("biomaRt")
    > library("curl")
    > hg_symbols<- getBM(attributes=c('ensembl_gene_id','external_gene_name',"description"),filters = 'ensembl_gene_id', values = my_ensembl_gene_id, mart = mart)
    > head(hg_symbols)
      ensembl_gene_id external_gene_name
    1 ENSG00000011405            PIK3C2A
    2 ENSG00000100731              PCNX1
    3 ENSG00000128512              DOCK4
    4 ENSG00000135535              CD164
    5 ENSG00000140526              ABHD2
    6 ENSG00000144228              SPOPL
                                                                                                      description
    1 phosphatidylinositol-4-phosphate 3-kinase catalytic subunit type 2 alpha [Source:HGNC Symbol;Acc:HGNC:8971]
    2                                                               pecanex 1 [Source:HGNC Symbol;Acc:HGNC:19740]
    3                                              dedicator of cytokinesis 4 [Source:HGNC Symbol;Acc:HGNC:19192]
    4                                                           CD164 molecule [Source:HGNC Symbol;Acc:HGNC:1632]
    5                                         abhydrolase domain containing 2 [Source:HGNC Symbol;Acc:HGNC:18717]
    6                                       speckle type BTB/POZ protein like [Source:HGNC Symbol;Acc:HGNC:27934]
    #合并数据:res结果hg_symbols合并成一个文件
    > ensembl_gene_id<-rownames(diff_gene_deseq2)
    > diff_gene_deseq2<-cbind(ensembl_gene_id,diff_gene_deseq2)
    > colnames(diff_gene_deseq2)[1]<-c("ensembl_gene_id")
    > diff_name<-merge(diff_gene_deseq2,hg_symbols,by="ensembl_gene_id")
    > head(diff_name)
    DataFrame with 6 rows and 9 columns
      ensembl_gene_id         baseMean   log2FoldChange             lfcSE             stat               pvalue
          <character>        <numeric>        <numeric>         <numeric>        <numeric>            <numeric>
    1 ENSG00000011405 1600.01408863821 1.07722909393382  0.24714564887963 4.35868120202462 1.30848557424083e-05
    2 ENSG00000100731 1162.93822827396  1.0006257630015 0.214393389946423 4.66724166846545 3.05270197242525e-06
    3 ENSG00000128512 368.442571635954 1.19657846347522 0.262780839813213  4.5535224878867 5.27550292947225e-06
    4 ENSG00000135535 2415.77359618136 1.22406336488047 0.183431131037356 6.67314952460929  2.5037106203736e-11
    5 ENSG00000140526 796.447227235737 1.05296203760187  0.23492350092969 4.48214858639031 7.38952622958053e-06
    6 ENSG00000144228 293.746859588111 1.10903132755747 0.283181091639851 3.91633255290906  8.9906210067011e-05
                      padj external_gene_name
                 <numeric>        <character>
    1  0.00533862114290257            PIK3C2A
    2    0.002491004809499              PCNX1
    3  0.00319558231320097              DOCK4
    4 1.22581671973492e-07              CD164
    5  0.00364483675888146              ABHD2
    6   0.0214722343652725              SPOPL
                                                                                                      description
                                                                                                      <character>
    1 phosphatidylinositol-4-phosphate 3-kinase catalytic subunit type 2 alpha [Source:HGNC Symbol;Acc:HGNC:8971]
    2                                                               pecanex 1 [Source:HGNC Symbol;Acc:HGNC:19740]
    3                                              dedicator of cytokinesis 4 [Source:HGNC Symbol;Acc:HGNC:19192]
    4                                                           CD164 molecule [Source:HGNC Symbol;Acc:HGNC:1632]
    5                                         abhydrolase domain containing 2 [Source:HGNC Symbol;Acc:HGNC:18717]
    6                                       speckle type BTB/POZ protein like [Source:HGNC Symbol;Acc:HGNC:27934]
    #输出含注释的差异基因文件
    write.csv(diff_name,file= "diff_gene.csv")
    

    到此为止就完成了RNA-seq的数据处理流程,下一步就是用pheatmap绘制热图了

    相关文章

      网友评论

        本文标题:第二次RNA-seq实战总结(3)-用DESeq2进行基因表达差

        本文链接:https://www.haomeiwen.com/subject/vgglxqtx.html