美文网首页
染色体位置如何注释到基因名

染色体位置如何注释到基因名

作者: rochiman | 来源:发表于2021-03-14 15:02 被阅读0次

    1. 环境准备

    python
    gff3文件:https://www.gencodegenes.org/human/release_19.html
    python包:

    pip3 install pandas, pysam, numpy
    

    2. 输入文件

    #Chr    Begin   End Size(Mb)    Nb_variants Percentage_homozygosity
    chr1    37356469    39797055    2.44    57  89.47
    chr1    89731959    91742055    2.01    31  90.32
    chr1    92327126    94544234    2.22    59  91.53
    chr1    172100831   176102969   4   78  91.03
    chr2    11718591    15732002    4.01    26  100
    chr2    39262068    41384390    2.12    26  92.31
    chr2    59614572    62047433    2.43    33  93.94
    chr2    80809027    85143406    4.33    27  92.59
    chr2    143787161   159672168   15.89   163 93.87
    

    3. 输出文件

    样式1

    chr start   end Size(Mb)    Nb_variants Percentage_homozygosity genes
    chr1    37356469    39797055    2.44    57  89.47   GRIK3(5.87);ZC3H12A(0.4);MEAF6(0.91);SNIP1(0.81);DNALI1(0.41);GNL2(1.19);RSPO1(0.97);C1orf109(0.44);CDCA8(0.71);EPHA10(2.1);MANEAL(0.3);YRDC(0.21);C1orf122(0.1);MTF1(2.05);INPP5B(3.54);SF3A3(1.39);FHL3(0.36);UTP11L(0.64);POU3F1(0.12);RRAGC(0.89);MYCBP(0.76);GJA9(0.7);RHBDL2(2.29);AKIRIN1(0.61);NDUFS5(0.34);MACF1(10.25)
    chr1    89731959    91742055    2.01    31  90.32   GBP5(0.33);GBP6(1.11);LRRC8B(3.63);LRRC8C(6.81);LRRC8D(5.75);ZNF326(2.01);BARHL2(0.28);ZNF644(5.32);HFM1(0.78)
    chr1    92327126    94544234    2.22    59  91.53   TGFBR3(2.02);BRDT(2.93);EPHX4(1.51);BTBD8(3.05);KIAA1107(0.8);C1orf146(1.26);GLMN(2.37);RPAP2(4.65);GFI1(0.55);EVI5(12.8);RPL5(0.45);FAM69A(5.38);MTF2(2.7);TMED5(1.4);CCDC18(4.46);DR1(1.06);FNBP1L(4.8);BCAR3(12.87);DNTTIP2(0.55);GCLM(1.09);ABCA4(3.87)
    chr1    172100831   176102969   4   78  91.03   DNM3(7.17);PIGC(1.85);C1orf105(1.2);SUCO(1.99);FASLG(0.2);TNFSF18(0.27);TNFSF4(0.59);PRDX6(0.29);SLC9C2(2.56);ANKRD45(1.51);KLHL20(1.79);CENPL(0.63);DARS2(0.85);ZBTB37(0.89);SERPINC1(0.34);RC3H1(2.28);RABGAP1L(20.89);GPR52(0.04);CACYBP(0.31);MRPS14(0.32);TNN(2.0);KIAA0040(0.9);TNR(10.71);RFWD2(4.72)
    chr2    11718591    15732002    4.01    26  100 GREB1(1.6);NTSR2(0.3);LPIN1(3.73);TRIB2(0.64);FAM84A(0.45);AC011897.1(0.39);NBAS(9.83);DDX1(0.02)
    chr2    39262068    41384390    2.12    26  92.31   SOS1(4.21);CDKL4(2.54);MAP4K3(8.86);TMEM178A(2.5);THUMPD2(2.04);SLC8A1(24.21)
    chr2    59614572    62047433    2.43    33  93.94   BCL11A(4.21);PAPOLG(1.88);REL(2.06);PUS10(3.21);PEX13(1.43);KIAA1841(4.07);C2orf74(0.81);AHSA2(0.57);USP34(11.64);XPO1(2.5)
    chr2    80809027    85143406    4.33    27  92.59   CTNNA2(1.54);SUCLG1(0.84);DNAH6(6.99);TRABD2A(1.97);TMSB10(0.02)
    chr2    143787161   159672168   15.89   163 93.87   KYNU(0.08);ARHGAP15(4.26);GTDC1(2.48);ZEB2(0.88);ACVR2A(0.54);ORC4(0.57);MBD5(3.13);EPC2(0.9);KIF5C(1.58);LYPD6B(1.12);LYPD6(0.91);MMADHC(0.11);RND3(0.45);RBM43(0.09);NMI(0.12);TNFAIP6(0.14);RIF1(0.62);NEB(1.57);ARL5A(0.25);CACNB4(1.68);STAM2(0.37);FMNL2(1.98);PRPF40A(0.42);ARL6IP6(0.27);RPRM(0.01);GALNT13(3.66);KCNJ3(1.01);NR4A2(0.11);GPD2(1.12);GALNT5(0.36);ERMN(0.06);CYTIP(0.47);ACVR1C(0.64);ACVR1(0.88);UPP2(1.63);CCDC148(1.8);PKP4(1.42);DAPL1(0.13)
    

    输出样式2

    chr start   end Size(Mb)    Nb_variants Percentage_homozygosity gene_name   gene_coverage
    chr1    37356469    39797055    2.44    57  89.47   GRIK3   5.87
    chr1    37356469    39797055    2.44    57  89.47   ZC3H12A 0.4
    chr1    37356469    39797055    2.44    57  89.47   MEAF6   0.91
    chr1    37356469    39797055    2.44    57  89.47   SNIP1   0.81
    chr1    37356469    39797055    2.44    57  89.47   DNALI1  0.41
    chr1    37356469    39797055    2.44    57  89.47   GNL2    1.19
    chr1    37356469    39797055    2.44    57  89.47   RSPO1   0.97
    chr1    37356469    39797055    2.44    57  89.47   C1orf109    0.44
    chr1    37356469    39797055    2.44    57  89.47   CDCA8   0.71
    chr1    37356469    39797055    2.44    57  89.47   EPHA10  2.1
    chr1    37356469    39797055    2.44    57  89.47   MANEAL  0.3
    chr1    37356469    39797055    2.44    57  89.47   YRDC    0.21
    

    3. 提取gff3文件的python代码

    # 导入包
    import pandas as pd
    import numpy as np
    import pysam
    # 读取gff3文件
    gencode = pd.read_table("/mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3", comment="#",
                            sep = "\t", names = ['seqname', 'source', 'feature', 'start' , 'end', 'score', 'strand', 'frame', 'attribute'])
    gencode.head()
    

    提取gff3文件中所需要的信息

    gencode_genes = gencode[(gencode.feature == "gene")][['seqname', 'start', 'end', 'attribute']].copy().reset_index().drop('index', axis=1) # Extract genes
    gencode_genes.head()
    

    提取基因名,基因类型,基因状态,基因水平(1verified loci,2manually annotated loci,3automatically annotated loci)

    def gene_info(x):
        # Extract gene names
        g_name = list(filter(lambda x: 'gene_name' in x,  x.split(";")))[0].split("=")[1]
        g_type = list(filter(lambda x: 'gene_type' in x,  x.split(";")))[0].split("=")[1]
        g_status = list(filter(lambda x: 'gene_status' in x,  x.split(";")))[0].split("=")[1]
        g_leve = int(list(filter(lambda x: 'level' in x,  x.split(";")))[0].split("=")[1])
        return (g_name, g_type, g_status, g_leve)
    
    gencode_genes["gene_name"], gencode_genes["gene_type"], gencode_genes["gene_status"], gencode_genes["gene_level"] = zip(*gencode_genes.attribute.apply(lambda x: gene_info(x)))
    gencode_genes.head()
    

    查看基因类型分布

    gencode_genes['gene_type'].drop_duplicates()
    

    提取所有已知的蛋白编码基因

    gencode_genes = gencode_genes[gencode_genes['gene_status'] == 'KNOWN'].reset_index().drop('index', axis=1)
    gencode_genes = gencode_genes[gencode_genes['gene_type'] == 'protein_coding'].reset_index().drop('index', axis=1)
    gencode_genes.info()
    
    # 移除重复的记录并保存
    ## Sort gene_leve in each chromosome (ascending oder) and remove duplicates
    gencode_genes = gencode_genes.sort_values(['gene_level', 'seqname'], ascending=True).drop_duplicates('gene_name', keep='first').reset_index().drop('index', axis=1) 
    gencode_genes.to_csv('/mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3_all_known_genes.txt', index=False, header = False, sep="\t")
    gencode_genes.info()
    

    压缩

    %%bash -s /mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3_all_known_genes.txt
    cut -f 1,2,3,5 $1 | sortBed -i > /mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3_all_known_genes.txt.sorted.formatted.bed
    bgzip /mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3_all_known_genes.txt.sorted.formatted.bed
    tabix -p bed /mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3_all_known_genes.txt.sorted.formatted.bed.gz
    ls -l
    

    4. 注释染色体位置包含的基因名

    定义重叠位置计算函数

    def overlap(q_st, q_end, res_st, res_end):
        o  = min(q_end, res_end)-max(q_st, res_st)
        return o
    

    定义输入与输入文件目录

    import os
    input_path = "/mnt/f/projects/gene/patient1/"
    input_file_name = "patient1.HomRegions.tsv"
    input_file = os.path.join(input_path,input_file_name)
    # 读取输入文件
    ### Read the sample file in to a pandas dataframe
    df = pd.read_table(input_file, 
                       names=['chr', 'start', 'end', 'Size(Mb)', 'Nb_variants', 'Percentage_homozygosity'], skiprows=1)
    df.head()
    

    计算重叠基因及重叠部位占染色体位置的百分比

    def gencode_all_known_genes(a, tb):
        genes = []
    
        try:
            for region in tb.fetch(a['chr'], int(a['start']), int(a['end'])):
                if region:
                    r = region.split('\t')
                    overlap_len = overlap(int(a['start']), int(a['end']), int(r[1]), int(r[2]))
                    ret_val = '{}({})'.format(r[3], np.round(overlap_len/float(int(a['end'])-int(a['start']))*100, 2)) ### Percentage of the input interval that overlap with the gene
                    genes.append(ret_val) 
    
            if len(genes)>0:
                return ";".join(genes)
            else:
                return "NA(0)"
        except ValueError:
            return "NA(0)"
    
    import pysam
    gencode_v19 = pysam.TabixFile('/mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3_all_known_genes.txt.sorted.formatted.bed.gz')
    
    df['genes'] = df.apply(lambda x: gencode_all_known_genes(x[['chr', 'start', 'end']], gencode_v19), axis=1)
    df.head()
    

    输出样式1

    df.to_csv(os.path.join(input_path,'out_df.tsv'), index=False, header = True, sep="\t")
    

    转为一个基因一行的格式

    # 移除无基因的染色体位置
    ## Remove all the intervals that do not overlap with genes
    df2 = df[df['genes'] != "NA(0)"].reset_index(drop=True)
    # 每个基因一行
    new_rows = []
    for i,r in df2.iterrows():
        g_list = r['genes'].split(";")
        for g in g_list:
            g = g.replace(" ","")
            new_rows.append(np.append(r[['chr', 'start', 'end', 'Size(Mb)', 'Nb_variants', 'Percentage_homozygosity', 'genes']].values, g))
    df_perGene = pd.DataFrame()
    df_perGene = df_perGene.append(pd.DataFrame(new_rows, columns=['chr', 'start', 'end', 'Size(Mb)', 'Nb_variants', 'Percentage_homozygosity', 'genes', 'gene_ID'])).reset_index().drop('index', axis=1)
    
    df_perGene['gene_name'] = df_perGene['gene_ID'].apply(lambda x: x.split("(")[0])
    df_perGene['gene_coverage'] = df_perGene['gene_ID'].apply(lambda x: x.split("(")[1].replace(")", ""))
    
    ## 移除之前的重复列 drop the genes column
    df_perGene = df_perGene.drop(["genes", "gene_ID"], axis=1)
    df_perGene.head()
    
    ## 输出
    df_perGene.to_csv(os.path.join(input_path,'out_df_perGene.tsv'), index=False, header = True, sep="\t")
    

    来源:https://gist.github.com/PubuduSaneth/6275f5df25868e0a8132d7e40c8d317f#file-01-format_gencode_gff3-ipynb

    相关文章

      网友评论

          本文标题:染色体位置如何注释到基因名

          本文链接:https://www.haomeiwen.com/subject/asukcltx.html