原始数据准备

学习生信我认为首先要具有全局观，首先要弄清楚做什么分析，需要准备什么文件，然后把这些文件变成什么样子。当准备工作做好后，直接调用R包进行分析和作图。为了方便看，下面先将原始数据及需要整理好的文件先列出来，然后再进行差异分析及作图。

从TCGA下载数据的样式：

image.png
基因注释文件：

image.png
整理后的数据样式：
整理成列为样本名，行为基因名
在此期间需要对基因进行的处理：只保留编码蛋白的基因，删除重复基因（在这过程中，需要使用基因注释文件）。

基因表达矩阵：

image.png
样本信息
样本注释信息：data.fram形式，样品名称，组别信息或处理信息。
当然，上面的表述过于简单，具体操作还需要看代码，后面我会列出来，且我运行不会报错的代码。
image.png
到这儿数据准备就结束了，下面我将从前到后整体走一遍。

原始数据整理

counts <- read.table("counts.txt",sep = "\t",row.names = 1,check.names = F,stringsAsFactors = F,header = T)
#加载基因注释文件
Ginfo_0 <- read.table("gene_length_Table.txt",sep = "\t",check.names = F,stringsAsFactors = F,header = T,row.names = 1)
#只要编码RNA
Ginfo <- Ginfo_0[which(Ginfo_0$genetype == "protein_coding"),] 
#取行名交集
comgene <- intersect(rownames(counts),rownames(Ginfo))
counts <- counts[comgene,]
Ginfo <- Ginfo[comgene,] #为了保证提取的行名和顺序一致
#判断是否一致
a <- rownames(counts)
b <- rownames(Ginfo)
identical(a,b) #返回TRUE代表行名一致
counts$Gene <- as.character(Ginfo$genename) #新增Gene Symbol
counts <- counts[!duplicated(counts$Gene),]   #去重复
rownames(counts) <- counts$Gene   #将行名变为Gene Symbol
counts <- counts[,-ncol(counts)]   #去除最后一列
#数据整理完毕，保存文件。
write.table(counts, file = "LIHC_counts_mRNA_all.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
#保存癌症患者的counts
tumor <- colnames(counts)[substr(colnames(counts),14,16) == "01A"]
counts_01A <- counts[,tumor]
write.table(counts_01A, file = "LIHC_counts_mRNA_01A.txt",sep = "\t",row.names = T,col.names = NA,quote = F)

#对样本信息进行注释
conditions=data.frame(sample=colnames(counts),           group=factor(ifelse(substr(colnames(counts),14,16) == "01A","T","N"),levels = c("N","T"))) %>% 
  column_to_rownames("sample")

DESeq2差异分析

library(tidyverse)
library(DESeq2)

dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = conditions,
  design = ~ group)
dds <- DESeq(dds)
resultsNames(dds)
res <- results(dds)
save(res,file = "LIHC_DEG.rda")#一定要保存！
res_deseq2 <- as.data.frame(res)%>% 
  arrange(padj) %>% 
  dplyr::filter(abs(log2FoldChange) > 2, padj<0.05)#根据自己需要
#保存差异基因
write.table(res_deseq2, file = "res_deseq2.txt",sep = "\t",row.names = T,col.names = NA,quote = F)

结果输出：

image.png

差异分析的可视化

TCGA差异分析热图

library(pheatmap)
#根据差异表达水平进行分组
load("LIHC_DEG.rda")
logFC_cutoff <- 2
DEG <- as.data.frame(res)%>% 
  arrange(padj) %>% 
  dplyr::filter(abs(log2FoldChange) > 2, padj < 0.05)

type1 = (DEG$padj < 0.05)&(DEG$log2FoldChange < -logFC_cutoff)
type2 = (DEG$padj < 0.05)&(DEG$log2FoldChange > logFC_cutoff)
DEG$change = ifelse(type1,"DOWN",ifelse(type2,"UP","NOT"))
table(DEG$change)
cg = rownames(DEG)[DEG$change !="NOT"] #提取上调或下调显著的基因列表
counts <-read.table("LIHC_counts_mRNA_all.txt",sep = "\t",row.names = 1,check.names = F,stringsAsFactors = F,header = T) 
exp_diff <- counts[cg,]
group_list=factor(ifelse(substr(colnames(counts),14,16) == "01A","T","N"),levels = c("N","T"))
annotation_col=data.frame(group=group_list)
rownames(annotation_col)=colnames(exp_diff) #对样本名标注类别信息
pheatmap(exp_diff,
         annotation_col=annotation_col,
         scale = "row",
         show_rownames = F,
         show_colnames =F,
         color = colorRampPalette(c("navy", "white", "red"))(50),
         cluster_cols =F,
         fontsize = 10,
         fontsize_row=3,
         fontsize_col=3)

输出的结果（样本太多，基因太多，图太丑了。不过真正的项目分析，还需要精简和打磨，这个可以参考其他优秀的作者，很多人分享相关的代码）

000003.png

TCGA差异分析热图

library(tidyverse)
library(ggpubr)
library(ggthemes)

load("LIHC_DEG.rda")

DEG <- as.data.frame(res)%>% 
  arrange(padj) %>% 
  dplyr::filter(abs(log2FoldChange) > 1, padj < 0.05)

logFC_cutoff <- 1
type1 = (DEG$padj < 0.05)&(DEG$log2FoldChange < -logFC_cutoff)
type2 = (DEG$padj < 0.05)&(DEG$log2FoldChange > logFC_cutoff)
DEG$change = ifelse(type1,"DOWN",ifelse(type2,"UP","NOT"))
table(DEG$change)

DEG$logP <- -log10(DEG$padj)
#添加基因标签信息
DEG$Label = ""   #新加一列label
DEG <- DEG[order(DEG$padj), ]   #对差异基因的p值进行从小到大的排序
DEG$Gene <- rownames(DEG)
#高表达的基因中，选择fdr值最小的5个(注意排序是按照-log10(padj)排的，不是Foldchange,这部分酌情考虑更改)
up.genes <- head(DEG$Gene[which(DEG$change == "UP")], 5)
#低表达的基因中，选择fdr值最小的5个
down.genes <- head(DEG$Gene[which(DEG$change == "DOWN")], 5)
#将up.genes和down.genes合并，并加入到Label中
DEG.top5.genes <- c(as.character(up.genes), as.character(down.genes))
DEG$Label[match(DEG.top5.genes, DEG$Gene)] <- DEG.top5.genes
#作图
ggscatter(DEG, x = "log2FoldChange", y = "logP",
          color = "change",
          palette = c("blue", "red", "black"),
          size = 1,
          label = DEG$Label,
          font.label = 8,
          repel = T,
          xlab = "log2FoldChange",
          ylab = "-log10(Adjust P-value)") +
  theme_base() +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed")

结果输出：