setwd("C:\\Users\\Administrator.DESKTOP-4UQ3Q0K\\Desktop")
library("readxl")
data <- read_excel("RNA-seq.xlsx")
library(dplyr)
library(ggplot2)
library(ggrepel)
![](https://img.haomeiwen.com/i27100867/2c7123e559df34e3.png)
#转成tibble便于后续使用,去掉不需要的列;
data <- data[c(-10,-11,-14,-15,-16,-19,-20,-21,-22)] #不转试试
#data <- as_tibble(data[c(-10,-11,-14,-15,-16,-19,-20,-21,-22)])
data$padj<-as.numeric(as.matrix(data$padj))
#对Q值取对数;
data$log10FDR <- -log10(data$padj)
#生成显著上下调数据的分组标签;
data$group <- case_when(data$log2FoldChange > 1 & data$padj < 0.05 ~ "Up",
data$log2FoldChange < -1 & data$padj < 0.05 ~ "Down",
abs(data$log2FoldChange) <= 1 ~ "None",
data$padj >= 0.05 ~ "None")
#获取表达差异最显著的10个基因;
top10sig <- filter(data,group!="None") %>% distinct(Symbol,.keep_all = T) %>% top_n(10,abs(log2FoldChange))
#将差异表达Top10的基因表格拆分成up和down两部分;
up <- filter(top10sig,group=="Up")
down <- filter(top10sig,group=="Down")
#新增一列,将Top10的差异基因标记为2,其他的标记为1;
# data$gene_id<-as.numeric(as.matrix(data$gene_id))
data$size <- case_when(!(data$gene_id %in% top10sig$gene_id)~ 1,data$gene_id %in% top10sig$gene_id ~ 2)
#提取非Top10的基因表格;
df <- filter(data,size==1)
#指定绘图顺序,将group列转成因子型;
df$group <- factor(df$group,
levels = c("Up","Down","None"),
ordered = T)
#开始绘图,建立映射;
p0 <-ggplot(data=df,aes(log2FoldChange,log10FDR,color=group))
#添加散点;
p1 <- p0+geom_point(size=3)
p1
#自定义半透明颜色(红绿);
mycolor <- c("#FF99CC","#99CC00","gray80")
p22 <- p1 + scale_colour_manual(name="",values=alpha(mycolor,0.7))
p22
![](https://img.haomeiwen.com/i27100867/1d684236a2367d3a.png)
#继续添加Top10基因对应的点;
p2 <- p22+geom_point(data=up,aes(log2FoldChange,log10FDR),
color="#FF9999",size=4,alpha=0.9)+
geom_point(data=down,aes(log2FoldChange,log10FDR),
color="#7cae00",size=4,alpha=0.9)
p2
![](https://img.haomeiwen.com/i27100867/3356aa808fad3ccd.png)
#设置坐标轴范围两端空白区域的大小
p3<-p2+labs(y="-log10FDR")+
+ scale_y_continuous(limits = c(0, 15))+scale_x_continuous(limits = c(-5,25))
p3
![](https://img.haomeiwen.com/i27100867/4818e32683efa240.png)
接下来为top10基因对应的散点添加指引线,我们主要用到geom_text_repel()和geom_label_repel()这两个函数,下面是比较重要的一些参数:
nudge_x/y:数据点与相应数据标签的距离,例如1表示标签在点右/上的1个单位处,而-2.2表示标签在点左/下2.2个单位处;
direction:标签分布方向,x表水平分布,y 表示垂直分布,both 表示随机分布;
segment.size:指定线段的粗细;
point.padding:表示点周围的空余区域,决定连接线端点到到数据点中心的距离,单位为line。
p3<-p2+labs(y="-log10FDR")+scale_y_continuous(limits = c(0, 15))+scale_x_continuous(limits = c(-25,25))
set.seed(007)
p4 <- p3+geom_text_repel(data=top10sig,aes(log2FoldChange,log10FDR,label=Symbol),
force=120,color="grey20",size=5,
point.padding = 0.8,hjust = 0.5,
arrow = arrow(length = unit(0.01, "npc"),
type = "open", ends = "last"),
segment.color="grey20",
segment.size=0.5,
segment.alpha=0.8,
nudge_x=0,
nudge_y=0.5)
![](https://img.haomeiwen.com/i27100867/25f6f87ab3509afb.png)
p5 <- p4+geom_hline(yintercept = c(-log10(0.05)),
size = 1.4,
color = "orange",
lty = "dashed")+
geom_vline(xintercept = c(-1,1),
size = 1.4,
color = "orange",
lty = "dashed")
p5
![](https://img.haomeiwen.com/i27100867/5cff4071ff0f756c.png)
p6 <- p5+geom_label_repel(
data = up,aes(log2FoldChange,log10FDR,label=Symbol),
nudge_x = 2,
nudge_y = 1,
color = "white",
alpha = 0.9,
point.padding = 0.8,
size = 5,
fill = "#96C93D",
segment.size = 0.8,
segment.color = "grey50",
direction = "y",
hjust = 0.5) +
geom_label_repel(
data = down,aes(log2FoldChange,log10FDR,label=Symbol),
nudge_x = -10,
nudge_y = 2,
color = "white",
alpha = 0.9,
point.padding = 0.8,
size = 5,
fill = "#9881F5",
segment.size = 0.8,
segment.color = "grey50",
direction = "y",
hjust = 0.5)
> p6
![](https://img.haomeiwen.com/i27100867/484f7a0b9fb3eef0.png)
根据情况将nudge_x = -10的:
![](https://img.haomeiwen.com/i27100867/49d9bc21b4ae3bc5.png)
我也不知道哪张合适:) 感觉还是不拉长线好,反正目的是突出哪些基因下调
- 可以截断坐标轴来将离群的值合在一张图上,但是觉得直接截图然后设定剩下集群值的坐标轴好像更方便、美观
生成离群部分:
p6 <- p5+geom_label_repel(
data = up,aes(log2FoldChange,log10FDR,label=Symbol),
nudge_x = 2,
nudge_y = 1,
color = "white",
alpha = 0.9,
point.padding = 0.8,
size = 5,
fill = "#96C93D",
segment.size = 0.8,
segment.color = "grey50",
direction = "y",
hjust = 0.5) +
geom_label_repel(
data = down,aes(log2FoldChange,log10FDR,label=Symbol),
nudge_x = -1,
nudge_y = 0.5,
color = "white",
alpha = 0.9,
point.padding = 0.8,
size = 5,
fill = "#9881F5",
segment.size = 0.8,
segment.color = "grey50",
direction = "y",
hjust = 0.5)
生成聚集部分:
p3<-p2+labs(y="-log10FDR")+ scale_y_continuous(limits = c(0, 5))+scale_x_continuous(limits = c(-5,5))
p4 <- p3+geom_hline(yintercept = c(-log10(0.05)),
size = 1.4,
color = "orange",
lty = "dashed")+
geom_vline(xintercept = c(-1,1),
size = 1.4,
color = "orange",
lty = "dashed")
p5 <- p4+geom_label_repel(
+ data = up,aes(log2FoldChange,log10FDR,label=Symbol),
+ nudge_x = 0.5,
+ nudge_y = 0.5,
+ color = "white",
+ alpha = 0.9,
+ point.padding = 0.8,
+ size = 5,
+ fill = "#96C93D",
+ segment.size = 0.8,
+ segment.color = "grey50",
+ direction = "y",
+ hjust = 0.5) +
+ geom_label_repel(
+ data = down,aes(log2FoldChange,log10FDR,label=Symbol),
+ nudge_x = -0.5,
+ nudge_y = 0.5,
+ color = "white",
+ alpha = 0.9,
+ point.padding = 0.8,
+ size = 5,
+ fill = "#9881F5",
+ segment.size = 0.8,
+ segment.color = "grey50",
+ direction = "y",
+ hjust = 0.5)
![](https://img.haomeiwen.com/i27100867/b67fb56d78c1f85d.png)
用另一个教程,从差异基因的分类开始
# 新加一列Label
data$Label = ""
# 对差异表达基因的log2FC值进行从大到小排序取top20
data <- data[order(abs(data$log2FoldChange),decreasing = T), ]
log2FoldChange.genes <- head(data$Symbol, 20)
# 对差异FDR进行log10转换
data$logQ <- -log10(data$padj)
# 对差异表达基因的p值进行从小到大排top 20
#data <- data[order(data$logQ)]
# 高表达的基因中,选择FDR最小的20
data <- data[order(abs(data$padj),decreasing = T), ]
fdr.genes <- head(data$Symbol, 20)
# 将log2FC.genes 和fdr.genes合并,并加入到Label
top20.genes <- c(as.character(log2FoldChange.genes), as.character(fdr.genes))
data$Label[match(top20.genes, data$Symbol)] <- top20.genes
p1 <- ggscatter(data, x = "log2FoldChange", y = "padj",
color = "type",
palette = c("#00BA38", "#BBBBBB", "#F8766D"),
size = 2,
label = data$Label,
font.label = 8,
repel = T,
xlab = "log2FoldChange",
ylab = "-log10(padj)") +
theme_base() +
geom_hline(yintercept = 1.30, linetype="dashed") +
geom_vline(xintercept = c(-2,2), linetype="dashed")
![](https://img.haomeiwen.com/i27100867/f3c1550e5cb2f4be.png)
mycolor <- c("#FF9999","#99CC00","gray80")
p21 <- p1 + scale_colour_manual(name="",values=alpha(mycolor,0.9))
p21
![](https://img.haomeiwen.com/i27100867/eff2b6e634b895ac.png)
#其他配色方案:
mycolor <- c("#FF99CC","#99CC00","gray80")
p22 <- p1 + scale_colour_manual(name="",values=alpha(mycolor,0.7))
p22
![](https://img.haomeiwen.com/i27100867/b6d68e3ffe607662.png)
#继续添加Top10基因对应的点;
#p2 <- p22+geom_point(data=up,aes(log2FoldChange,log10FDR),
color="#FF9999",size=3,alpha=0.9)+
geom_point(data=down,aes(log2FoldChange,log10FDR),
color="#7cae00",size=3,alpha=0.9)
#p2
#expansion函数设置坐标轴范围两端空白区域的大小;mult为“倍数”模式,add为“加性”模式;
p3<-p2+labs(y="-log10FDR")+
scale_y_continuous(expand=expansion(add = c(2, 0)),
limits = c(0, 40),
breaks = c(0,10,20,30,40),
label = c("0","10","20","30","40"))+
scale_x_continuous(limits = c(-4, 4),
breaks = c(-4,-2,0,2,4),
label = c("-4","-2","0","2","4"))
p3
![](https://img.haomeiwen.com/i27100867/e0e8d59382988e30.png)
set.seed(007)
p4 <-p22+geom_text_repel(data=top10sig,aes(log2FoldChange,log10FDR,label=Symbol),
force=80,color="grey20",size=3,
point.padding = 0.5,hjust = 0.5,
arrow = arrow(length = unit(0.01, "npc"),
type = "open", ends = "last"),
segment.color="grey20",
segment.size=0.2,
segment.alpha=0.8,
nudge_x=0,
nudge_y=1)
p4
第三:
# 对差异表达基因的log2FC值进行从大到小排序取top 10
data <- data[order(abs(data$log2FoldChange),decreasing = T), ]
top10sig <- head(data$Symbol, 10)
#将差异表达Top10的基因表格拆分成up和down两部分;
data$group<-as.numeric(as.matrix(data$group))
up <- filter(top10sig,group=="Up")
down <- filter(top10sig,group=="Down")
#获取表达差异最显著的10个基因;
top10sig <- filter(data,group!="None") %>% distinct(Symbol,.keep_all = T) %>% top_n(10,abs(data$log2FoldChange))
top10sig <- filter(data,group!="None") %>% distinct(Symbol,.keep_all = T)
top_n <- top_n(10,data$log2FoldChange)
top10sig <- top_n(10,abs(data$log2FoldChange))
Error in UseMethod("tbl_vars") :
no applicable method for 'tbl_vars' applied to an object of class "c('double', 'numeric')"
重新开始:
library(ggpubr)
library(ggthemes)
setwd("C:\\Users\\Administrator.DESKTOP-4UQ3Q0K\\Desktop")
data <- read.table("RNA-seq - 1.txt",header = T,row.names = 1,sep="\t",check.names = F)
data$logQ <- -log10(data$padj)
ggscatter(data, x = "log2FoldChange", y = "logQ") + theme_base()
data$Group = "normal"
# 将adj.P.Val小于0.05,logFC大于2
# 将adj.P.Val小于0.05,logFC小于2的基因设为显著下调基因
data$Group[which( (data$padj < 0.01) & (data$log2FoldChange > 1) )] = "up"
data$Group[which( (data$padj < 0.01) & (data$log2FoldChange < -1) )] = "down"
# 绘制新的火山图
ggscatter(data, x = "log2FoldChange", y = "logQ",
+ color = "Group") + theme_base()
![](https://img.haomeiwen.com/i27100867/5a02d7556ca6c7bd.png)
# 为火山图添加logP分界线条(geom_hline)和logFC分界线条(geom_vline)
geom_hline(yintercept = 1.30, linetype="dashed") +
geom_vline(xintercept = c(-2,2), linetype="dashed")
![](https://img.haomeiwen.com/i27100867/2bca490d48667176.png)
# 新加一列Label
data$Label = ""
# 对差异表达基因的log2FC值进行从大到小排序取top 20
data <- data[order(abs(data$log2FoldChange),decreasing = T), ]
log2FoldChange.genes <- head(data$ID, 20)
# 对差异表达基因的p值进行从小到大排top 20
# data <- data[order(data$logQ)]
# 高表达的基因中,选择FDR最小的20
data <- data[order(abs(data$logQ),decreasing = T), ]
fdr.genes <- head(data$ID, 20)
![](https://img.haomeiwen.com/i27100867/c7dbef0a9e1e046a.png)
网友评论