如下A,这是一个基因组基因PAV(presence absence variation)表。如果基因有family,基因组有cluster,就可以内卷一下成B(数字随便给的,内卷算法根据需要)。
内卷方式可以是:
core mark 2: 每个样方列(内卷块)和 “全大于1” (红色)
absence mark 0: 每个样方(内卷块)列和 “全等于0” (白色)
dispensable mark 1: 其他 (蓝色)
1 PAV表
2 分组表
3 读表到构表
核心思路:
利用分组信息构造存放内卷后数据的表格,预先用0填充
## read table
df = read.table("data.txt", sep="\t", header=T, row.names=1)
g_gene = read.table("data_gene.txt", sep="\t", header=T)
g_genome = read.table("data_genome.txt", sep="\t", header=T)
# gene genome list
uni_gene = unique(g_gene$g_gene)
uni_genome = unique(g_genome$g_genome)
# gene genome 构造新表
row = length(uni_gene)
col = length(uni_genome)
fill = rep(0, row*col)
tb = data.frame(matrix(fill, row, col))
rownames(tb) = uni_gene
colnames(tb) = uni_genome
4 内卷方式和方法
核心思路:
遍历基因family和基因组cluster,
提取family cluster中的所有元素,
用这些元素从PAV表(唯一数据表)提取内卷块(样方),
gene family中至少一个基因出现在一个cluster中的所有基因组
记数字2(core gene), all(sum > 0)
gene family中任何一个基因未出现在任何一个cluster中任何的基因组
记数字0(absence gene), all(sum == 0)
部分基因部分出现则记1(dispensable gene) else
# pan genome 分类判定
# core mark 2: 每个样方列和 “全大于1”
# absence mark 0: 每个样方列和 “全等于0”
# dispensable mark 1: 其他
for(i in uni_gene)
{
for(j in uni_genome)
{
# 获取样方坐标
tmp_gene = g_gene[g_gene$g_gene==i,]
tmp_gene = tmp_gene$gene # as.character()转数字到字符
tmp_genome = g_genome[g_genome$g_genome==j, ]
tmp_genome = tmp_genome$genome
# 获取样方
tmp_df = df[c(tmp_gene), c(tmp_genome)]
# 统计列和
sum = apply(tmp_df, 2, FUN=sum)
if(all(sum > 0))
{tb[i, j] = 2}
else if(all(sum == 0))
{tb[i, j] = 0}
else
{tb[i, j] = 1}
}
}
t(tb)
5 基因计数作为mark
## count mark
tb = data.frame(matrix(fill, row, col))
rownames(tb) = uni_gene
colnames(tb) = uni_genome
for(i in uni_gene)
{
for(j in uni_genome)
{
# 获取样方坐标
tmp_gene = g_gene[g_gene$g_gene==i,]
tmp_gene = tmp_gene$gene
tmp_genome = g_genome[g_genome$g_genome==j, ]
tmp_genome = tmp_genome$genome
# 获取样方
tmp_df = df[c(tmp_gene), c(tmp_genome)]
# 统计列和
sum = apply(tmp_df, 2, FUN=sum)
tb[i, j] = sum(sum)
}
}
input_mark = t(tb)
# 0 处理 ""
for(i in 1:nrow(input_mark))
{
for(j in 1:ncol(input_mark))
{
if(input_mark[i, j] == 0)
{
input_mark[i, j] = ""
}
else
{
input_mark[i, j] = as.character(input_mark[i, j])
}
}
}
6 pheatmap热图
# pheatmap
library("pheatmap")
input = t(tb)
pheatmap(input,
cluster_col = F,
color = colorRampPalette(colors = c("white", "deepskyblue1", "indianred1"))(3),
fontsize_col = 15,
fontsize_row = 15,
cellwidth = 30,
cellheight = 30,
display_numbers = input_mark,
fontsize_number = 12,
angle_col = 45,
filename = "pan_reuterin_count.pdf")
网友评论