美文网首页
2021-11-09基于RNA-seq表格做火山图(第二次)

2021-11-09基于RNA-seq表格做火山图(第二次)

作者: 千容安 | 来源:发表于2021-11-10 21:50 被阅读0次
setwd("C:\\Users\\Administrator.DESKTOP-4UQ3Q0K\\Desktop")
library("readxl")
data <- read_excel("RNA-seq.xlsx")
library(dplyr)
library(ggplot2)
library(ggrepel)
data
#转成tibble便于后续使用,去掉不需要的列;
data <- data[c(-10,-11,-14,-15,-16,-19,-20,-21,-22)] #不转试试
#data <- as_tibble(data[c(-10,-11,-14,-15,-16,-19,-20,-21,-22)])
data$padj<-as.numeric(as.matrix(data$padj))
  #对Q值取对数;
data$log10FDR <- -log10(data$padj)
   #生成显著上下调数据的分组标签;
data$group <- case_when(data$log2FoldChange > 1 & data$padj < 0.05 ~ "Up",
                      data$log2FoldChange < -1 & data$padj < 0.05 ~ "Down",
                      abs(data$log2FoldChange) <= 1 ~ "None",
                      data$padj >= 0.05 ~ "None") 
#获取表达差异最显著的10个基因;
top10sig <- filter(data,group!="None") %>% distinct(Symbol,.keep_all = T) %>% top_n(10,abs(log2FoldChange))
#将差异表达Top10的基因表格拆分成up和down两部分;
up <- filter(top10sig,group=="Up")
down <- filter(top10sig,group=="Down")
#新增一列,将Top10的差异基因标记为2,其他的标记为1;
# data$gene_id<-as.numeric(as.matrix(data$gene_id))
data$size <- case_when(!(data$gene_id %in% top10sig$gene_id)~ 1,data$gene_id %in% top10sig$gene_id ~ 2)
#提取非Top10的基因表格;
df <- filter(data,size==1)
#指定绘图顺序,将group列转成因子型;
df$group <- factor(df$group,
                        levels = c("Up","Down","None"),
                        ordered = T)
#开始绘图,建立映射;
p0 <-ggplot(data=df,aes(log2FoldChange,log10FDR,color=group))
#添加散点;
p1 <- p0+geom_point(size=3)
p1
#自定义半透明颜色(红绿);
mycolor <- c("#FF99CC","#99CC00","gray80")
p22 <- p1 + scale_colour_manual(name="",values=alpha(mycolor,0.7))
p22
无top10基因
#继续添加Top10基因对应的点;
p2 <- p22+geom_point(data=up,aes(log2FoldChange,log10FDR),
                    color="#FF9999",size=4,alpha=0.9)+
  geom_point(data=down,aes(log2FoldChange,log10FDR),
             color="#7cae00",size=4,alpha=0.9)
p2
加了top10
#设置坐标轴范围两端空白区域的大小
p3<-p2+labs(y="-log10FDR")+
+     scale_y_continuous(limits = c(0, 15))+scale_x_continuous(limits = c(-5,25))
p3
调整坐标轴,删去空白区域

接下来为top10基因对应的散点添加指引线,我们主要用到geom_text_repel()和geom_label_repel()这两个函数,下面是比较重要的一些参数:
nudge_x/y:数据点与相应数据标签的距离,例如1表示标签在点右/上的1个单位处,而-2.2表示标签在点左/下2.2个单位处;
direction:标签分布方向,x表水平分布,y 表示垂直分布,both 表示随机分布;
segment.size:指定线段的粗细;
point.padding:表示点周围的空余区域,决定连接线端点到到数据点中心的距离,单位为line。

p3<-p2+labs(y="-log10FDR")+scale_y_continuous(limits = c(0, 15))+scale_x_continuous(limits = c(-25,25))
set.seed(007)
p4 <- p3+geom_text_repel(data=top10sig,aes(log2FoldChange,log10FDR,label=Symbol),
                         force=120,color="grey20",size=5,
                         point.padding = 0.8,hjust = 0.5,
                         arrow = arrow(length = unit(0.01, "npc"),
                                       type = "open", ends = "last"),
                         segment.color="grey20",
                         segment.size=0.5,
                         segment.alpha=0.8,
                         nudge_x=0,
                         nudge_y=0.5)
有标签
p5 <- p4+geom_hline(yintercept = c(-log10(0.05)),
                    size = 1.4,
                    color = "orange",
                    lty = "dashed")+
    geom_vline(xintercept = c(-1,1),
               size = 1.4,
               color = "orange",
               lty = "dashed")
p5
有辅助线
p6 <- p5+geom_label_repel(
data = up,aes(log2FoldChange,log10FDR,label=Symbol),
nudge_x = 2,
nudge_y = 1,
color = "white",
alpha = 0.9,
point.padding = 0.8,
size = 5,
fill = "#96C93D",
segment.size = 0.8,
segment.color = "grey50",
direction = "y",
hjust = 0.5) +
geom_label_repel(
data = down,aes(log2FoldChange,log10FDR,label=Symbol),
nudge_x = -10,
nudge_y = 2,
color = "white",
alpha = 0.9,
point.padding = 0.8,
size = 5,
fill = "#9881F5",
segment.size = 0.8,
segment.color = "grey50",
direction = "y",
hjust = 0.5)
> p6

根据情况将nudge_x = -10的:



我也不知道哪张合适:) 感觉还是不拉长线好,反正目的是突出哪些基因下调

  • 可以截断坐标轴来将离群的值合在一张图上,但是觉得直接截图然后设定剩下集群值的坐标轴好像更方便、美观
    生成离群部分:
p6 <- p5+geom_label_repel(
    data = up,aes(log2FoldChange,log10FDR,label=Symbol),
    nudge_x = 2,
    nudge_y = 1,
    color = "white",
    alpha = 0.9,
    point.padding = 0.8,
    size = 5,
    fill = "#96C93D",
    segment.size = 0.8,
    segment.color = "grey50",
    direction = "y",
    hjust = 0.5) +
    geom_label_repel(
        data = down,aes(log2FoldChange,log10FDR,label=Symbol),
        nudge_x = -1,
        nudge_y = 0.5,
        color = "white",
        alpha = 0.9,
        point.padding = 0.8,
        size = 5,
        fill = "#9881F5",
        segment.size = 0.8,
        segment.color = "grey50",
        direction = "y",
        hjust = 0.5)

生成聚集部分:

p3<-p2+labs(y="-log10FDR")+ scale_y_continuous(limits = c(0, 5))+scale_x_continuous(limits = c(-5,5))
p4 <- p3+geom_hline(yintercept = c(-log10(0.05)),
                    size = 1.4,
                    color = "orange",
                    lty = "dashed")+
    geom_vline(xintercept = c(-1,1),
               size = 1.4,
               color = "orange",
               lty = "dashed")
p5 <- p4+geom_label_repel(
+     data = up,aes(log2FoldChange,log10FDR,label=Symbol),
+     nudge_x = 0.5,
+     nudge_y = 0.5,
+     color = "white",
+     alpha = 0.9,
+     point.padding = 0.8,
+     size = 5,
+     fill = "#96C93D",
+     segment.size = 0.8,
+     segment.color = "grey50",
+     direction = "y",
+     hjust = 0.5) +
+     geom_label_repel(
+         data = down,aes(log2FoldChange,log10FDR,label=Symbol),
+         nudge_x = -0.5,
+         nudge_y = 0.5,
+         color = "white",
+         alpha = 0.9,
+         point.padding = 0.8,
+         size = 5,
+         fill = "#9881F5",
+         segment.size = 0.8,
+         segment.color = "grey50",
+         direction = "y",
+         hjust = 0.5)
image.png
用另一个教程,从差异基因的分类开始
# 新加一列Label
data$Label = ""
# 对差异表达基因的log2FC值进行从大到小排序取top20
data <- data[order(abs(data$log2FoldChange),decreasing = T), ]
log2FoldChange.genes <- head(data$Symbol, 20)

# 对差异FDR进行log10转换
data$logQ <- -log10(data$padj)

# 对差异表达基因的p值进行从小到大排top 20
#data <- data[order(data$logQ)]

# 高表达的基因中,选择FDR最小的20
data <- data[order(abs(data$padj),decreasing = T), ]
fdr.genes <- head(data$Symbol, 20)

# 将log2FC.genes 和fdr.genes合并,并加入到Label
top20.genes <- c(as.character(log2FoldChange.genes), as.character(fdr.genes))
data$Label[match(top20.genes, data$Symbol)] <- top20.genes

p1 <- ggscatter(data, x = "log2FoldChange", y = "padj",
          color = "type", 
          palette = c("#00BA38", "#BBBBBB", "#F8766D"),
          size = 2,
          label = data$Label, 
          font.label = 8, 
          repel = T,
          xlab = "log2FoldChange", 
          ylab = "-log10(padj)") + 
  theme_base() + 
  geom_hline(yintercept = 1.30, linetype="dashed") +
  geom_vline(xintercept = c(-2,2), linetype="dashed")
mycolor <- c("#FF9999","#99CC00","gray80")
p21 <- p1 + scale_colour_manual(name="",values=alpha(mycolor,0.9))
p21
#其他配色方案:
mycolor <- c("#FF99CC","#99CC00","gray80")
p22 <- p1 + scale_colour_manual(name="",values=alpha(mycolor,0.7))
p22
#继续添加Top10基因对应的点;
#p2 <- p22+geom_point(data=up,aes(log2FoldChange,log10FDR),
                    color="#FF9999",size=3,alpha=0.9)+
  geom_point(data=down,aes(log2FoldChange,log10FDR),
             color="#7cae00",size=3,alpha=0.9)
#p2

#expansion函数设置坐标轴范围两端空白区域的大小;mult为“倍数”模式,add为“加性”模式;
p3<-p2+labs(y="-log10FDR")+
  scale_y_continuous(expand=expansion(add = c(2, 0)),
                     limits = c(0, 40),
                     breaks = c(0,10,20,30,40),
                     label = c("0","10","20","30","40"))+
  scale_x_continuous(limits = c(-4, 4),
                     breaks = c(-4,-2,0,2,4),
                     label = c("-4","-2","0","2","4"))
p3
image.png
set.seed(007)
p4 <-p22+geom_text_repel(data=top10sig,aes(log2FoldChange,log10FDR,label=Symbol),
force=80,color="grey20",size=3,
point.padding = 0.5,hjust = 0.5,
arrow = arrow(length = unit(0.01, "npc"),
type = "open", ends = "last"),
segment.color="grey20",
segment.size=0.2,
segment.alpha=0.8,
nudge_x=0,
nudge_y=1)
p4

第三:

# 对差异表达基因的log2FC值进行从大到小排序取top 10
data <- data[order(abs(data$log2FoldChange),decreasing = T), ]
top10sig <- head(data$Symbol, 10)

#将差异表达Top10的基因表格拆分成up和down两部分;
data$group<-as.numeric(as.matrix(data$group))
up <- filter(top10sig,group=="Up")
down <- filter(top10sig,group=="Down")



  #获取表达差异最显著的10个基因;
top10sig <- filter(data,group!="None") %>% distinct(Symbol,.keep_all = T) %>% top_n(10,abs(data$log2FoldChange))

top10sig <- filter(data,group!="None") %>% distinct(Symbol,.keep_all = T) 

top_n <- top_n(10,data$log2FoldChange)
top10sig <- top_n(10,abs(data$log2FoldChange))

Error in UseMethod("tbl_vars") : 
  no applicable method for 'tbl_vars' applied to an object of class "c('double', 'numeric')"

重新开始:

library(ggpubr)
library(ggthemes)
setwd("C:\\Users\\Administrator.DESKTOP-4UQ3Q0K\\Desktop")
data <- read.table("RNA-seq - 1.txt",header = T,row.names = 1,sep="\t",check.names = F)
data$logQ <- -log10(data$padj)
ggscatter(data, x = "log2FoldChange", y = "logQ") + theme_base()
data$Group = "normal"
# 将adj.P.Val小于0.05,logFC大于2
# 将adj.P.Val小于0.05,logFC小于2的基因设为显著下调基因
data$Group[which( (data$padj < 0.01) & (data$log2FoldChange > 1) )] = "up"
data$Group[which( (data$padj < 0.01) & (data$log2FoldChange < -1) )] = "down"
# 绘制新的火山图
ggscatter(data, x = "log2FoldChange", y = "logQ",
+           color = "Group") + theme_base()
# 为火山图添加logP分界线条(geom_hline)和logFC分界线条(geom_vline)
geom_hline(yintercept = 1.30, linetype="dashed") +
geom_vline(xintercept = c(-2,2), linetype="dashed")
# 新加一列Label
data$Label = ""
# 对差异表达基因的log2FC值进行从大到小排序取top 20
data <- data[order(abs(data$log2FoldChange),decreasing = T), ]
log2FoldChange.genes <- head(data$ID, 20)
# 对差异表达基因的p值进行从小到大排top 20
# data <- data[order(data$logQ)]
# 高表达的基因中,选择FDR最小的20
data <- data[order(abs(data$logQ),decreasing = T), ]
fdr.genes <- head(data$ID, 20)

相关文章

网友评论

      本文标题:2021-11-09基于RNA-seq表格做火山图(第二次)

      本文链接:https://www.haomeiwen.com/subject/ltumzltx.html