美文网首页bioinformatics
R: Table行列分组内卷

R: Table行列分组内卷

作者: 胡童远 | 来源:发表于2021-07-23 15:32 被阅读0次

    如下A,这是一个基因组基因PAV(presence absence variation)表。如果基因有family,基因组有cluster,就可以内卷一下成B(数字随便给的,内卷算法根据需要)。

    内卷方式可以是:
    core mark 2: 每个样方列(内卷块)和 “全大于1” (红色)
    absence mark 0: 每个样方(内卷块)列和 “全等于0” (白色)
    dispensable mark 1: 其他 (蓝色)

    A 分组信息 B

    1 PAV表

    2 分组表

    3 读表到构表

    核心思路:
    利用分组信息构造存放内卷后数据的表格,预先用0填充

    ## read table
    df = read.table("data.txt", sep="\t", header=T, row.names=1)
    g_gene = read.table("data_gene.txt", sep="\t", header=T)
    g_genome = read.table("data_genome.txt", sep="\t", header=T)
    
    # gene genome list
    uni_gene = unique(g_gene$g_gene)
    uni_genome = unique(g_genome$g_genome)
    
    # gene genome 构造新表
    row = length(uni_gene)
    col = length(uni_genome)
    fill = rep(0, row*col)
    tb = data.frame(matrix(fill, row, col))
    rownames(tb) = uni_gene
    colnames(tb) = uni_genome
    

    4 内卷方式和方法

    核心思路:
    遍历基因family和基因组cluster,
    提取family cluster中的所有元素,
    用这些元素从PAV表(唯一数据表)提取内卷块(样方),
    gene family中至少一个基因出现在一个cluster中的所有基因组
    记数字2(core gene), all(sum > 0)
    gene family中任何一个基因未出现在任何一个cluster中任何的基因组
    记数字0(absence gene), all(sum == 0)
    部分基因部分出现则记1(dispensable gene) else

    # pan genome 分类判定
    # core mark 2: 每个样方列和 “全大于1”
    # absence mark 0: 每个样方列和 “全等于0”
    # dispensable mark 1: 其他
    for(i in uni_gene)
    {
        for(j in uni_genome)
        {
            # 获取样方坐标
            tmp_gene = g_gene[g_gene$g_gene==i,]
            tmp_gene = tmp_gene$gene  # as.character()转数字到字符
            tmp_genome = g_genome[g_genome$g_genome==j, ]
            tmp_genome = tmp_genome$genome
            # 获取样方
            tmp_df = df[c(tmp_gene), c(tmp_genome)]
            # 统计列和
            sum = apply(tmp_df, 2, FUN=sum)
            if(all(sum > 0))
            {tb[i, j] = 2}
            else if(all(sum == 0))
            {tb[i, j] = 0}
            else
            {tb[i, j] = 1}
        }
    }
    
    t(tb)

    5 基因计数作为mark

    ## count mark
    tb = data.frame(matrix(fill, row, col))
    rownames(tb) = uni_gene
    colnames(tb) = uni_genome
    
    for(i in uni_gene)
    {
        for(j in uni_genome)
        {
            # 获取样方坐标
            tmp_gene = g_gene[g_gene$g_gene==i,]
            tmp_gene = tmp_gene$gene
            tmp_genome = g_genome[g_genome$g_genome==j, ]
            tmp_genome = tmp_genome$genome
            # 获取样方
            tmp_df = df[c(tmp_gene), c(tmp_genome)]
            # 统计列和
            sum = apply(tmp_df, 2, FUN=sum)
            tb[i, j] = sum(sum)
        }
    }
    
    input_mark = t(tb)
    
    # 0 处理 ""
    for(i in 1:nrow(input_mark))
    {
        for(j in 1:ncol(input_mark))
        {
            if(input_mark[i, j] == 0)
            {
                input_mark[i, j] = ""
            }
            else
            {
                input_mark[i, j] = as.character(input_mark[i, j])
            }
        }
    }
    

    6 pheatmap热图

    # pheatmap
    library("pheatmap")
    input = t(tb)
    
    pheatmap(input, 
     cluster_col = F,
     color = colorRampPalette(colors = c("white", "deepskyblue1", "indianred1"))(3),
     fontsize_col = 15,  
     fontsize_row = 15,  
     cellwidth = 30,  
     cellheight = 30,
     display_numbers = input_mark,
     fontsize_number = 12,
     angle_col = 45,
     filename = "pan_reuterin_count.pdf")
    

    相关文章

      网友评论

        本文标题:R: Table行列分组内卷

        本文链接:https://www.haomeiwen.com/subject/jrfpmltx.html