1、质量控制:
fastqc -o ./ -t 1 /public/home/zyhu/hfd/${i}_R1_001.fastq.gz /public/home/zyhu/hfd/${i}_R2_001.fastq.gz
合并质控结果
multiqc ./
Trimmomatic修剪去掉前端低质量碱基
java -jar \$EBROOTTRIMMOMATIC/trimmomatic-0.32.jar PE -phred33 ${i}_R1_001.fastq.gz ${i}_R2_001.fastq.gz \
${i}_R1_001_paired.fastq.gz ${i}_R1_001_unpaired.fastq.gz ${i}_R2_001_paired.fastq.gz ${i}_R2_001_unpaired.fastq.gz \
LEADING:3 TRAILING:3 HEADCROP:10 SLIDINGWINDOW:4:15 MINLEN:36
2、比对到参考基因组(hisat2)
hisat2 -p 4 --dta -x /public/home/zyhu/Mouse/reference/GRCm39 -1 /public/home/zyhu/Mouse_RNA-seq/02.trim/${i}_R1_001_paired.fastq.gz -2 /public/home/zyhu/Mouse_RNA-seq/02.trim/${i}_R2_001_paired.fastq.gz -S ${i}.sam
3、featureCounts计数(meta层面)
featureCounts -T 4 -a /public/home/zyhu/Mouse/annotation/Mus_musculus.GRCm39.103.gtf -o /public/home/zyhu/Mouse_RNA-seq/06.featureCounts/meta-featureCounts/featureCounts.txt -p -B -C -t exon -g gene_id *.bam
提取基因名和reads count
cut -f 1,7-41 featureCounts.txt|grep -v '#' > A_B_C_M.matrix.txt
4、差异分析
rm(list=ls())
options(stringsAsFactors = F)
library('rio')
library('ggplot2')
library('DESeq2')
library('pheatmap')
library('apeglm')
CountMatrix <- read.table("C:/Users/Cool-N/OneDrive/桌面/A_B_C_M.matrix.txt",header = T,row.names= 1)
colnames(CountMatrix) <- c("A_10_S37","A_1_S30","A_2_S31","A_3_S32","A_4_S33","A_6_S34","A_7_S35","A_9_S36","B_1_S38","B_2_S39","B_3_S40","B_4_S41","B_5_S42","B_6_S43","B_7_S44","B_9_S45","C_2_S46","C_3_S47","C_4_S48","C_5_S49","C_6_S50","C_7_S51","C_8_S52","C_9_S53","M_2_S54","M_3_S55","M_4_S56","M_5_S57","M_6_S58","M_7_S59","M_8_S60","M_9_S61")
countdata = CountMatrix[,c(1:8,17:24)]
#保留非全0的基因
countdata <- countdata[rowSums(countdata) > 0,]
condition = factor(c(rep("A", 8), rep("C", 8)), levels = c("A","C"))
coldata = data.frame(row.names = colnames(countdata), condition)
dds = DESeqDataSetFromMatrix(countdata, colData = coldata, design = ~ condition)
#PCA分析--其他包
# library(ggord)
# library(FactoMineR)
# dat <- as.data.frame(t(countdata))
# dat.pca <- PCA(dat, graph = FALSE)
# ggord(dat.pca,group_list,coord_fix = FALSE,
# arrow = 0,vec_ext = 0,txt = NULL)
# ggsave("PCA.pdf")
#PCA--使用DESeq2的plotPCA包;样本数<30:rlog; >30:vst(归一化)
rld<-rlog(dds,blind=FALSE)
assay(rld) <- limma::removeBatchEffect(assay(rld), rld$condition)#去除批次效应,可选项
pdf("PCA_plot.pdf")
plotPCA(rld,intgroup = "condition")
dev.off()
#差异表达分析
dds = DESeq(dds)
#sizeFactors(dds)
res <- results(dds)
res <- res[order(res$padj),]
#table(res$padj<0.01)
#将DEG转换为数据框格式,并去掉含NA的行
DEG <- as.data.frame(res)
DEG <- na.omit(DEG)
#################火山图##############
library(ggrepel)
#确定差异表达倍数,设置动态阈值,也可以设置静态阈值,例如log2FoldChange=1或2
logFC_t <- with(DEG,mean(abs(log2FoldChange)) + 2*sd(abs(log2FoldChange)))
#取前两位小数
logFC_t <- round(logFC_t, 2)
#确定上下调基因
DEG$change = as.factor(ifelse(DEG$padj < 0.05 & abs(DEG$log2FoldChange) > logFC_t,
ifelse(DEG$log2FoldChange > logFC_t ,'UP','DOWN'),'STABLE'))
#table(DEG$change)
DEG$symbol <- rownames(DEG)
#设置阈值显示超过阈值的基因名
DEG$label <- ifelse(DEG$padj < 0.00001 & abs(DEG$log2FoldChange) >= 3,DEG$symbol,"")
#画火山图
pdf("volcano_plot.pdf")
ggplot(DEG, aes(x=log2FoldChange, y=-log10(padj),color=change)) +
geom_point(alpha=0.4, size=2) +
theme_classic() +
xlab("log2 fold change") +
ylab("-log10 padj") +
theme(plot.title = element_text(size=15,hjust = 0.5)) +
scale_colour_manual(values = c('blue','black','red')) +
geom_hline(yintercept = -log10(0.05), lty = 2) +
geom_vline(xintercept = c(-logFC_t, logFC_t), lty = 2) +
geom_label_repel(data = DEG, aes(label = label),
size = 3,box.padding = unit(0.5, "lines"),
point.padding = unit(0.8, "lines"),
segment.color = "black",
show.legend = FALSE, max.overlaps = 10000)
dev.off()
############# pheatmap热图 ############
dat <- countdata
FC <- DEG$log2FoldChange
names(FC) <- rownames(DEG)
#排序差异倍数,提取前100和后100的基因名
DEG_200 <- c(names(head(sort(FC),100)),names(tail(sort(FC),100)))
#将提取的基因归一化
dat <- t(scale(t(dat[DEG_200,])))
#添加注释条
group <- data.frame(group=condition)
rownames(group)=colnames(dat)
pdf("pheatmap_plot.pdf")
pheatmap(dat, cluster_cols = T,
show_colnames =F,show_rownames = F,
annotation_col=group,
color = colorRampPalette(c("navy", "white", "firebrick3"))(50))
dev.off()
#大于2的赋值为2
dat[dat > 2] <- 2
#低于2的赋值为-2
dat[dat < -2] <- -2
pdf("pheatmap_alter_plot.pdf")
pheatmap(dat, cluster_cols = T,
show_colnames =F,show_rownames = F,
annotation_col=group,
color = colorRampPalette(c("navy", "white", "firebrick3"))(50))
dev.off()
########### KEGG富集分析 ##########
library('dplyr')
library(org.Mm.eg.db)#选取物种数据库
library(clusterProfiler)
#### 第一步,确定上下调基因,已经完成
#### 第二步,从org.Mm.eg.db提取ENSG的ID和GI号对应关系
# 如果ENSG的ID具有小数点,用下面两句代码舍掉小数点
# library(stringr)
# rownames(DEG) <- str_sub(rownames(DEG), start = 1, end = 15)
#DEG添加一列ENSEMBL
DEG$ENSEMBL <- rownames(DEG)
# bitr in clusterProfiler,将ENSEMBL的ID转换为GI号保存为df
df <- bitr(rownames(DEG),
fromType = "ENSEMBL",
toType = "ENTREZID",
OrgDb = org.Mm.eg.db)
#### 第三步,将GI号合并到DEG数据框内
DEG <- inner_join(DEG, df, by="ENSEMBL")
#### 第四步,提取上调和下调的GI集
gene_up <- DEG[DEG$change == "UP", "ENTREZID"]
gene_down <- DEG[DEG$change == "DOWN", "ENTREZID"]
gene_diff <- c(gene_up, gene_down)
gene_all <- as.character(DEG[,"ENTREZID"])
geneList <- DEG$log2FoldChange
names(geneList) <- DEG$ENTREZID
geneList <- sort(geneList, decreasing = T)
geneList <- geneList[geneList != 0]
## KEGG pathway analysis
kk.up <- enrichKEGG(gene = gene_up,
organism = "mmu",
universe = gene_all,
pvalueCutoff = 0.8,
qvalueCutoff = 0.8)
kk.down <- enrichKEGG(gene = gene_down,
organism = "mmu",
universe = gene_all,
pvalueCutoff = 0.8,
qvalueCutoff = 0.8)
kegg_down_dt <- as.data.frame(kk.down)
kegg_up_dt <- as.data.frame(kk.up)
down_kegg <- kegg_down_dt[kegg_down_dt$pvalue < 0.05,]
down_kegg$group = -1
up_kegg <- kegg_up_dt[kegg_up_dt$pvalue < 0.05,]
up_kegg$group = 1
dat <- rbind(up_kegg, down_kegg)
dat$pvalue <- -log10(dat$pvalue)
dat$pvalue <- dat$pvalue * dat$group
dat <- dat[order(dat$pvalue, decreasing = F),]
pdf("KEGG.pdf")
ggplot(dat, aes(x = reorder(Description, order(pvalue, decreasing=F)), y = pvalue, fill = group)) +
geom_bar(stat = "identity") +
scale_fill_gradient(low = "blue", high = "red", guide = FALSE) +
scale_x_discrete(name = "Pathway names") +
scale_y_continuous(name = "log10P-value") +
coord_flip() + theme_bw() + theme(plot.title = element_text(hjust = 0.5)) +
ggtitle("Pathway Enrichment")
dev.off()
############ GO富集分析 #########
ego_up <- enrichGO(gene = gene_up,
OrgDb = org.Mm.eg.db,
ont = "ALL" ,
universe = gene_all,
pAdjustMethod = "BH",
pvalueCutoff = 0.99,
qvalueCutoff = 0.99,
readable = TRUE)
head(ego_up[,1:6])
ego_up_result <- ego_up@result[order(ego_up@result$pvalue),]
ego_up_sig <- rbind(head(subset(ego_up_result, ONTOLOGY == "BP"), 10),
head(subset(ego_up_result, ONTOLOGY == "CC"), 10),
head(subset(ego_up_result, ONTOLOGY == "MF"), 10))
ego_up_sig$pvalue <- -log10(ego_up_sig$pvalue)
ego_up_sig$Description <- factor(ego_up_sig$Description, levels = as.character(rev(ego_up_sig$Description)))
pdf("GO_up.pdf")
ggplot(ego_up_sig, aes(x = Description, y = pvalue, fill = ONTOLOGY)) +
geom_col() +
coord_flip() +
theme_classic() +
facet_grid(ONTOLOGY~., scales = "free") +
xlab("Up regulated")
dev.off()
ego_down <- enrichGO(gene = gene_down,
OrgDb = org.Mm.eg.db,
ont = "ALL" ,
universe = gene_all,
pAdjustMethod = "BH",
pvalueCutoff = 0.99,
qvalueCutoff = 0.99,
readable = TRUE)
head(ego_down[,1:6])
ego_down_result <- ego_down@result[order(ego_down@result$pvalue),]
ego_down_sig <- rbind(head(subset(ego_down_result, ONTOLOGY == "BP"), 10),
head(subset(ego_down_result, ONTOLOGY == "CC"), 10),
head(subset(ego_down_result, ONTOLOGY == "MF"), 10))
ego_down_sig$pvalue <- -log10(ego_down_sig$pvalue)
ego_down_sig$Description <- factor(ego_down_sig$Description, levels = as.character(rev(ego_down_sig$Description)))
pdf("GO_down.pdf")
ggplot(ego_down_sig, aes(x = Description, y = pvalue, fill = ONTOLOGY)) +
geom_col() +
coord_flip() +
theme_classic() +
facet_grid(ONTOLOGY~., scales = "free") +
xlab("Down regulated")
dev.off()
网友评论