rarefaction对PAV数据进行permutation
rarefaction: Rarefaction curves for a pan-genome
heaps: Heaps law estimate
library(micropan)
library(ggplot2)
library(tidyr)
输入
data(xmpl.panmat)
# gene clsuter/family - genome pav table
df = xmpl.panmat
nrow(df) # 5 基因组
ncol(df) # 642 基因cluster
data:image/s3,"s3://crabby-images/fd245/fd245f5d1e4296482db8e27908afb69f8a99d253" alt=""
排列数
factorial(5) * choose(5, 5)
heaps law model
# open or closed: If alpha < 1 it indicates an open pan-genome
heaps(df, n.perm = 120)
data:image/s3,"s3://crabby-images/6d35b/6d35b018825ae0605100e9c2c5e50838231cd587" alt=""
排列和计算pan
# permutation
rare <- rarefaction(df, n.perm = 120)
data:image/s3,"s3://crabby-images/d24c9/d24c952571f100acc6541523a54256afd40154e8" alt=""
整理数据
# rare[1:5, 1:5] # 检查
rare = data.frame(rare)
rare = rare[-1,-1]
rownames(rare) = 1:nrow(rare)
rare = t(rare)
rare 数据分布
table(rare[,1])
table(rare[,2])
table(rare[,3])
table(rare[,4])
table(rare[,5])
len_uni = c()
for(i in 1:ncol(rare))
{
len_uni = c(len_uni, length(unique(rare[, i])))
}
plot(len_uni)
data:image/s3,"s3://crabby-images/acf78/acf786e793f6189d1bf59d273f57560b10c7c7bc" alt=""
data:image/s3,"s3://crabby-images/d6d43/d6d43808bd54d9cade03cab4a9ed70d946de0620" alt=""
箱型图
## ggplot
library(stringr)
data = data.frame(rare)
data$id = rownames(data)
data = melt(data, id='id')
gid = c()
for(i in data$variable)
{
gid = c(gid, unlist(strsplit(i, "X"))[2])
}
data$GID = gid
ggplot(data, aes(x = GID, y = value)) +
geom_boxplot(width = 0.5, color = "deepskyblue3",
size = 1) +
geom_point(size = 2, color = "deepskyblue3",
position = position_dodge(0.2)) +
theme_classic() +
labs(x="", y="Number of Gene Families") +
theme(axis.title = element_text(size = 25),
axis.text.y = element_text(size = 18),
axis.text.x = element_text(size = 20),
axis.line = element_line(size = 1),
axis.ticks = element_line(size = 1))
data:image/s3,"s3://crabby-images/c157c/c157c3e697bd358b0e6aea8bf9d63211e0c05489" alt=""
拟合曲线 -案例
PEPPAN pangenome数据整理
setwd("xudong/peppan/")
library(micropan)
library(ggplot2)
library(tidyr)
df = read.table("peppan.PEPPAN.gene_content.Rtab", head=T, sep="\t", row.names=1)
# gene clsuter/family - genome pav table
df = data.frame(t(df))
# permutation number 组合数
factorial(74) * choose(74, 74)
# open or closed: If alpha < 1 it indicates an open pan-genome
heaps(df, n.perm = 1000)
# permutation
rare <- rarefaction(df, n.perm = 10000)
# 整理数据
# rare[1:5, 1:5] # 检查
rare = data.frame(rare)
rare = rare[-1,-1]
rownames(rare) = 1:nrow(rare)
rare = t(rare)
# rare unique 数据分布
len_uni = c()
for(i in 1:ncol(rare))
{
len_uni = c(len_uni, length(unique(rare[, i])))
}
plot(len_uni)
nls拟合曲线,利用BPGA的pan curve估计值
# nls 拟合曲线 a*x^b
data$ID = as.numeric(data$GID)
f = function(x, a, b, c){a*x^b+c}
result = nls(data$value ~ f(data$ID, a, b, c),
data = data,
start = list(a = 1824, b = 0.05, c = 1))
summary(result)
data:image/s3,"s3://crabby-images/a6dd0/a6dd00297f3410dffd99cf4200e255470d8c591e" alt=""
绘制曲线 -案例
# 箱图 + 曲线
a = 12.78; b = 0.8383; c = 1854
fun <- function(x){a*x^b+c}
add = data.frame(GID=factor(1:74), value=fun(1:74))
res =
ggplot(NULL, aes(x, y)) +
geom_boxplot(data, mapping = aes(x = GID, y = value),
width = 0.5, color = "deepskyblue3",
size = 0.5, outlier.size = 0.2,
outlier.color = NA,
outlier.shape = NA) +
theme_classic() +
labs(x="Number of Genomes",
y="Number of Gene Families",
title = "y=12.78x^0.8383+1854, heap's alpha < 1") +
theme(axis.title = element_text(size = 25),
axis.text.y = element_text(size = 18),
axis.text.x = element_text(size = 8, angle=60, hjust=1),
axis.line = element_line(size = 1),
axis.ticks = element_line(size = 1)) +
scale_y_continuous(limits = c(1800, 2400),
breaks = c(1800,1900,2000,2100,2200,2300,2400)) +
geom_point(add, mapping = aes(x=GID, y=value),
size = 2, color = "red")
ggsave(res, file="rarefaction.pdf", width = 10)
ggsave(res, file="rarefaction.png", width = 10)
data:image/s3,"s3://crabby-images/64a9a/64a9a47c0e60b99ee6dd9065b6a4a50112c9a9d3" alt=""
置信区间
## 置信区间
library(Rmisc)
upper = c()
lower = c()
for(i in 1:74)
{
tmp = data[data$ID==i,]
tmp_data = tmp$value
tmp_ci = CI(tmp_data, ci=0.95)
upper = c(upper, tmp_ci[1])
lower = c(lower, tmp_ci[3])
print(i)
}
add$upper = upper
add$lower = lower
# 绘图
input = melt(add, id='GID')
ggplot(input, mapping = aes(x=GID, y=value, color = variable)) +
theme_classic() +
labs(x="Number of Genomes",
y="Number of Gene Families",
title = "y=12.78x^0.8383+1854, heap's alpha < 1") +
theme(axis.title = element_text(size = 25),
axis.text.y = element_text(size = 18),
axis.text.x = element_text(size = 8, angle=60, hjust=1),
axis.line = element_line(size = 1),
axis.ticks = element_line(size = 1)) +
scale_y_continuous(limits = c(1800, 2400),
breaks = c(1800,1900,2000,2100,2200,2300,2400)) +
geom_line(aes(group = variable))
data:image/s3,"s3://crabby-images/32c2f/32c2f126241766ca38ad54f8768f499b9920f418" alt=""
perm = 1的随机监督曲线
####################### NO permutation
## peppan curve data #############################################
#######################
setwd("xudong/peppan/")
library(micropan)
library(ggplot2)
library(tidyr)
tb = read.table("peppan_add.txt", head=T, sep="\t", row.names=1)
data:image/s3,"s3://crabby-images/e1f1c/e1f1ce082b6296cf0e9f08469ce8aedbd237b988" alt=""
group = data.frame(id = rownames(tb), group = tb$Source)
tb = tb[, -1]
# gene clsuter/family - genome pav table
#####################
## 自编 pangenome 计数
#####################
pan = c()
gene = ncol(tb)
for(i in 1:nrow(tb))
{
tmp = tb[1:i,]
num = apply(tmp, 2, FUN=sum)
count = 0
for(j in 1:length(num)){if(num[j]==0){count=count+1}}
pan = c(pan, gene - count)
}
group$pan = pan
# permutation
# tb2 <- rarefaction(tb, n.perm = 1)
# 方法类似,但是需要自己整理数据
## ggplot
group$group = factor(group$group, levels = unique(group$group))
group$id = factor(group$id, levels = group$id)
group$x = factor(rownames(group), levels = rownames(group))
col_list = read.table("C:/Users/hutongyuan/Desktop/group_color.list",sep="\t", check.names=F, na.string="", stringsAsFactors=F, quote="", comment.char="")
colors = col_list$V1[1:7]
names(colors) <- unique(group$group)
p =
ggplot(group, aes(x = id, y=pan, color = group)) +
geom_point() +
geom_line(aes(group = group)) +
theme_classic() +
labs(x="Number of Genomes",
y="Number of Gene Families",
color = "Source") +
theme(axis.title = element_text(size = 25),
axis.text.y = element_text(size = 18),
axis.text.x = element_text(size = 8, angle=60, hjust=1),
axis.line = element_line(size = 1),
axis.ticks = element_line(size = 1),
legend.title=element_text(face="bold")) +
scale_y_continuous() +
scale_color_manual(values = colors) +
scale_y_continuous(limits = c(1800, 2400),
breaks = c(1800,1900,2000,2100,2200,2300,2400))
ggsave(p, file="rarefaction_p1.pdf", width = 10)
ggsave(p, file="rarefaction_p1.png", width = 10)
data:image/s3,"s3://crabby-images/bf770/bf77056a1b46c9abd6c0bcc62723344b64a473af" alt=""
rarecurve绘制reads-otus稀释曲线
rarefy: Rarefaction Species Richness
library(vegan)
data(BCI)
raremax <- min(rowSums(BCI))
rarefy(BCI, raremax)
rarecurve(BCI, step = 20, sample = raremax, col = "blue", cex = 0.6)
data:image/s3,"s3://crabby-images/d920f/d920f41facab92af4ef2ca43c5a16414dfc56ae3" alt=""
data:image/s3,"s3://crabby-images/608b5/608b59c5c021a95b7982c4a170a502a747791a55" alt=""
rarecurve基于如此丰度表绘制rarefaction curve
文献案例
标题:Pyrosequencing enumerates and contrasts soil microbial diversity
杂志:ISME
时间:2007
文中提到DOTUR采用Michaelis–Menten (MM) equation拟合reads-OTUs稀释曲线
更多:
How can I fit rarefaction curves?
幂律分布的参数估计方法及R实现
R语言曲线拟合函数(绘图)
R: fitting power law curve to data
网友评论