Pan permutation

作者: 胡童远 | 来源:发表于2021-09-08 19:24 被阅读0次

rarefaction对PAV数据进行permutation

rarefaction: Rarefaction curves for a pan-genome
heaps: Heaps law estimate

library(micropan)
library(ggplot2)
library(tidyr)

输入

data(xmpl.panmat)
# gene clsuter/family - genome pav table
df = xmpl.panmat
nrow(df)  # 5 基因组
ncol(df)  # 642 基因cluster

排列数
factorial(5) * choose(5, 5)

heaps law model

# open or closed: If alpha < 1 it indicates an open pan-genome
heaps(df, n.perm = 120)

排列和计算pan
# permutation
rare <- rarefaction(df, n.perm = 120)

整理数据
# rare[1:5, 1:5] # 检查
rare = data.frame(rare)
rare = rare[-1,-1]
rownames(rare) = 1:nrow(rare)
rare = t(rare)

rare 数据分布

table(rare[,1])
table(rare[,2])
table(rare[,3])
table(rare[,4])
table(rare[,5])

len_uni = c()
for(i in 1:ncol(rare))
{
    len_uni = c(len_uni, length(unique(rare[, i])))
}
plot(len_uni)

箱型图

## ggplot
library(stringr)
data = data.frame(rare)
data$id = rownames(data)
data = melt(data, id='id')
gid = c()
for(i in data$variable)
{
    gid = c(gid, unlist(strsplit(i, "X"))[2])
}
data$GID = gid

ggplot(data, aes(x = GID, y = value)) +
  geom_boxplot(width = 0.5, color = "deepskyblue3",
               size = 1) +
  geom_point(size = 2, color = "deepskyblue3",
             position = position_dodge(0.2)) +
  theme_classic() +
  labs(x="", y="Number of Gene Families") +
  theme(axis.title = element_text(size = 25),
        axis.text.y = element_text(size = 18),
        axis.text.x = element_text(size = 20),
        axis.line = element_line(size = 1),
        axis.ticks = element_line(size = 1))

拟合曲线 -案例

PEPPAN pangenome数据整理

setwd("xudong/peppan/")

library(micropan)
library(ggplot2)
library(tidyr)

df = read.table("peppan.PEPPAN.gene_content.Rtab", head=T, sep="\t", row.names=1)
# gene clsuter/family - genome pav table
df = data.frame(t(df))
# permutation number 组合数
factorial(74) * choose(74, 74)
# open or closed: If alpha < 1 it indicates an open pan-genome
heaps(df, n.perm = 1000)
# permutation
rare <- rarefaction(df, n.perm = 10000)
# 整理数据
# rare[1:5, 1:5] # 检查
rare = data.frame(rare)
rare = rare[-1,-1]
rownames(rare) = 1:nrow(rare)
rare = t(rare)
# rare unique 数据分布
len_uni = c()
for(i in 1:ncol(rare))
{
    len_uni = c(len_uni, length(unique(rare[, i])))
}
plot(len_uni)

nls拟合曲线,利用BPGA的pan curve估计值

# nls 拟合曲线 a*x^b
data$ID = as.numeric(data$GID)
f = function(x, a, b, c){a*x^b+c}
result = nls(data$value ~ f(data$ID, a, b, c), 
             data = data, 
             start = list(a = 1824, b = 0.05, c = 1))
summary(result)

绘制曲线 -案例

# 箱图 + 曲线
a = 12.78; b = 0.8383; c = 1854
fun <- function(x){a*x^b+c}
add = data.frame(GID=factor(1:74), value=fun(1:74))

res = 
ggplot(NULL, aes(x, y)) +
  geom_boxplot(data, mapping = aes(x = GID, y = value),
               width = 0.5, color = "deepskyblue3",
               size = 0.5, outlier.size = 0.2,
               outlier.color = NA,
               outlier.shape = NA) +
  theme_classic() +
  labs(x="Number of Genomes", 
       y="Number of Gene Families",
       title = "y=12.78x^0.8383+1854, heap's alpha < 1") +
  theme(axis.title = element_text(size = 25),
        axis.text.y = element_text(size = 18),
        axis.text.x = element_text(size = 8, angle=60, hjust=1),
        axis.line = element_line(size = 1),
        axis.ticks = element_line(size = 1)) +
  scale_y_continuous(limits = c(1800, 2400),
     breaks = c(1800,1900,2000,2100,2200,2300,2400)) +
  geom_point(add, mapping = aes(x=GID, y=value), 
             size = 2, color = "red")

ggsave(res, file="rarefaction.pdf", width = 10)
ggsave(res, file="rarefaction.png", width = 10)

置信区间

## 置信区间
library(Rmisc)
upper = c()
lower = c()
for(i in 1:74)
{
    tmp = data[data$ID==i,]
    tmp_data = tmp$value
    tmp_ci = CI(tmp_data, ci=0.95)
    upper = c(upper, tmp_ci[1])
    lower = c(lower, tmp_ci[3])
    print(i)
}
add$upper = upper
add$lower = lower

# 绘图
input = melt(add, id='GID')

ggplot(input, mapping = aes(x=GID, y=value, color = variable)) +
  theme_classic() +
  labs(x="Number of Genomes", 
       y="Number of Gene Families",
       title = "y=12.78x^0.8383+1854, heap's alpha < 1") +
  theme(axis.title = element_text(size = 25),
        axis.text.y = element_text(size = 18),
        axis.text.x = element_text(size = 8, angle=60, hjust=1),
        axis.line = element_line(size = 1),
        axis.ticks = element_line(size = 1)) +
  scale_y_continuous(limits = c(1800, 2400),
     breaks = c(1800,1900,2000,2100,2200,2300,2400)) +
  geom_line(aes(group = variable))

perm = 1的随机监督曲线

####################### NO permutation
## peppan curve data #############################################
#######################
setwd("xudong/peppan/")

library(micropan)
library(ggplot2)
library(tidyr)

tb = read.table("peppan_add.txt", head=T, sep="\t", row.names=1)
group = data.frame(id = rownames(tb), group = tb$Source)
tb = tb[, -1]

# gene clsuter/family - genome pav table
#####################
## 自编 pangenome 计数
#####################
pan = c()
gene = ncol(tb)
for(i in 1:nrow(tb))
{
    tmp = tb[1:i,]
    num = apply(tmp, 2, FUN=sum)
    count = 0
    for(j in 1:length(num)){if(num[j]==0){count=count+1}}
    pan = c(pan, gene - count)
}
group$pan = pan

# permutation
# tb2 <- rarefaction(tb, n.perm = 1)
# 方法类似,但是需要自己整理数据
## ggplot
group$group = factor(group$group, levels = unique(group$group))
group$id = factor(group$id, levels = group$id)
group$x = factor(rownames(group), levels = rownames(group))

col_list = read.table("C:/Users/hutongyuan/Desktop/group_color.list",sep="\t", check.names=F, na.string="", stringsAsFactors=F, quote="", comment.char="")
colors = col_list$V1[1:7]
names(colors) <- unique(group$group)

p = 
ggplot(group, aes(x = id, y=pan, color = group)) +
  geom_point() +
  geom_line(aes(group = group)) +
  theme_classic() +
  labs(x="Number of Genomes", 
       y="Number of Gene Families",
       color = "Source") +
  theme(axis.title = element_text(size = 25),
        axis.text.y = element_text(size = 18),
        axis.text.x = element_text(size = 8, angle=60, hjust=1),
        axis.line = element_line(size = 1),
        axis.ticks = element_line(size = 1),
        legend.title=element_text(face="bold")) +
  scale_y_continuous() +
  scale_color_manual(values = colors) +
  scale_y_continuous(limits = c(1800, 2400),
     breaks = c(1800,1900,2000,2100,2200,2300,2400))

ggsave(p, file="rarefaction_p1.pdf", width = 10)
ggsave(p, file="rarefaction_p1.png", width = 10)

rarecurve绘制reads-otus稀释曲线
rarefy: Rarefaction Species Richness

library(vegan)
data(BCI)
raremax <- min(rowSums(BCI))
rarefy(BCI, raremax)
rarecurve(BCI, step = 20, sample = raremax, col = "blue", cex = 0.6)

rarecurve基于如此丰度表绘制rarefaction curve


文献案例

标题:Pyrosequencing enumerates and contrasts soil microbial diversity
杂志:ISME
时间:2007
文中提到DOTUR采用Michaelis–Menten (MM) equation拟合reads-OTUs稀释曲线

更多:
How can I fit rarefaction curves?
幂律分布的参数估计方法及R实现
R语言曲线拟合函数(绘图)
R: fitting power law curve to data

相关文章

网友评论

    本文标题:Pan permutation

    本文链接:https://www.haomeiwen.com/subject/wffpwltx.html