安装依赖模块:pandas
python3 -m pip install pandas
计算思路:
1 读取"一组元素",读取"多个列表"的名字
2 构造"元素数*列表数"的数据框,分别取行名、列名
3 for循环readlines"多个列表"作为库,for读取各个元素判断"有/无"给表格赋值
4 to_csv保存表格到文本
说明:df.iloc 按整数位置定位,df.loc 按行/列标签定位
python3 代码:
import re,sys,os
import pandas as pd
import numpy as np
# Build a gene-by-genome presence/absence table ("yes"/"no") and save it
# as a tab-separated text file.
GENE_LIST_FILE = "total/cgr2.total.uniq2uhgg"
GENOME_LIST_FILE = "genome_cgr2.list"
GENE_SET_TEMPLATE = "Prokka/{}/{}.gene"
OUTPUT_FILE = 'cgr2_uniq_freq.txt'


def read_names(path):
    """Return the non-empty lines of *path*, stripped of whitespace.

    Order and duplicates are preserved. Blank lines are skipped so that
    they do not turn into empty row/column labels.
    """
    with open(path, 'r') as handle:
        stripped = (line.strip() for line in handle)
        return [name for name in stripped if name]


def load_gene_set(path):
    """Return the set of gene names listed one per line in *path*.

    Lines are stripped before comparison, so a final line without a
    trailing newline (or a file with CRLF endings) still matches.
    """
    return set(read_names(path))


def build_presence_table(genes, genomes, gene_set_template=GENE_SET_TEMPLATE):
    """Build the presence/absence table.

    Args:
        genes: row labels (gene names).
        genomes: column labels (genome names).
        gene_set_template: ``str.format`` template that receives the
            genome name twice and yields the path of its gene-set file.

    Returns:
        A ``pandas.DataFrame`` indexed by *genes* with one column per
        genome; each cell is ``"yes"`` if the gene is listed in that
        genome's gene-set file and ``"no"`` otherwise.

    Raises:
        OSError: if a gene-set file cannot be opened.
    """
    columns = []
    for each_genome in genomes:
        target_file = gene_set_template.format(each_genome, each_genome)
        # A set gives O(1) membership tests instead of scanning a list
        # once per gene.
        target_db = load_gene_set(target_file)
        column = []
        for each_gene in genes:
            if each_gene in target_db:
                column.append("yes")
            else:
                column.append("no")
            print("\033[32m _____ {} DONE!\033[0m".format(each_gene))
        columns.append(column)
        print("\033[32m {} DONE!\033[0m".format(each_genome))
    # Columns were collected genome by genome; transpose them into rows.
    # Building the frame once avoids per-cell .loc writes into an int frame.
    rows = list(zip(*columns))
    return pd.DataFrame(rows, index=genes, columns=genomes)


def main():
    """Read the gene and genome lists, build the table and write it out."""
    list_genes_enter = read_names(GENE_LIST_FILE)
    list_genomes_enter = read_names(GENOME_LIST_FILE)
    df = build_presence_table(list_genes_enter, list_genomes_enter)
    df.to_csv(OUTPUT_FILE, sep='\t', index=True)


if __name__ == "__main__":
    main()
把表格中的yes/no替换成1/0
绘制热图:
# Read the tab-separated matrix: the first line is the header and the first
# column supplies the row names. Presumably this is the table with yes/no
# already replaced by 1/0 -- confirm the file name matches your export.
data <- read.table("data.txt", header = TRUE, row.names = 1, sep = "\t",
                   check.names = FALSE, na.strings = "",
                   stringsAsFactors = FALSE, quote = "", comment.char = "")
library(pheatmap)
# Two-colour palette (snow -> red), passed by name rather than by position.
# The heatmap is written to out.pdf and drawn without a legend.
pheatmap(data,
         color = colorRampPalette(c("snow", "red"))(2),
         cellheight = 0.2, cellwidth = 2,
         fontfamily = "serif",
         fontsize_row = 0.2,
         fontsize_col = 2, fontsize = 2,
         legend = FALSE,
         filename = "out.pdf"
)
# Annotated heatmap without row/column clustering, written to a PNG file.
# `tmp`, `group`, `mark` and `colors` are not defined in this snippet and must
# already exist in the session.
pheatmap(tmp,
         color = colorRampPalette(c("snow", "red"))(50),
         filename = "strain_number_star.png",
         cluster_rows = FALSE, cluster_cols = FALSE,
         cellheight = 3, cellwidth = 20,
         fontfamily = "serif",
         annotation_row = group,
         annotation_colors = colors,
         display_numbers = mark,
         labels_row = "142 Genera",
         fontsize_row = 15, fontsize_col = 15,
         fontsize = 12, fontsize_number = 4)
网友评论