美文网首页Linux与生物信息组学比较基因组
比较基因组从入门到放弃(3)

比较基因组从入门到放弃(3)

作者: Morriyaty | 来源:发表于2021-06-16 16:05 被阅读0次

    今来做一哈基因家族收缩与扩张吼
    用到的软件是 CAFE [基因课 提供的sif环境]

    准备输入文件
    orthofinder 文件夹中的 Orthogroups.GeneCount.tsv
    mcmctree 生成的 超度量树
    
    #修改树文件(这里应该没有空格)
    cat FigTree.tre  | sed  's/\[[^]]\+\]//g'| awk -F "=" '/UTREE/{print $2} '  > input.tree.nwk
    sed  -e 's/:/\n:/g' -e 's/\([),]\)/\n\1/g'  input.tree.nwk  |awk '{if($1~/:$/){printf ":"100*$2} else {printf $0}}' |sed 's/\s\+//g' > input.tree
    #(a: 99.7080, (h: 96.6662, ((g: 7.1266, hu: 7.1266): 72.8073, (r: 72.4080, (ha: 29.3095, (ca: 24.8914, (m: 15.2361, ra: 15.2361): 9.6553): 4.4181): 43.0985): 7.5259): 16.7322): 3.0418);
    #修改genecount文件
    dos2unix Orthogroups.GeneCount.tsv
    sed 's/[a-zA-Z0-9]\+$//' Orthogroups.GeneCount.tsv | awk '{print $1"\t"$0}' > input.tab
    
    #运行CAFE
    singularity exec PhyloTools.sif cafe5 --infile input.tab --tree FigTree.tre --output_prefix cafe_ortho --cores 10 -P 0.01
    
    #得到某一物种快速进化的基因家族名称
    cat Base_asr.tre | grep "=" | sed 's/(/ /g' | sed 's/)/ /g' | sed 's/,/ /g' | awk '{print$2,$11}' | grep "*" | awk '{print$1}' > x.all.list
    #查看所有扩张的基因家族
    cat Base_change.tab | awk '{print$1,$5}' | grep "OG" | grep "+" | grep -v "+0"  | awk '{print$1}'> x.expand.list
    #取交集
    sort x.all.list x.expand.list | uniq -d > x.rapidlyexpand.list
    #得到收缩的基因家族
    cat Base_change.tab | awk '{print$1,$3}' | grep "OG" | grep "-" | awk '{print $1}' > ../acomys.contract.OGname.list
    
    #得到该物种基因家族对应的基因名
    python3 sample_Orthogroup.py Orthogroups.txt newortho.txt
    cat newortho.txt | grep  "|" | sed 's/x|//g' > x.txt
    sh seq.sh | awk '{$1="";print $0}' | tr " " "\n" | sed '/^\s*$/d' | sort | uniq > xgene.list
    (seq.sh:
    grep    OG0001243       x.txt
    .....)
    
    #得到某一物种扩张的基因家族的GO ID (显著扩张同理)
    python3 OG2gene.py acomys.expand.OGname.list Orthogroups.txt ca acomys.expand.gene.list
    python3 gene2GO.py acomys.expand.gene.list acomys-geneID2GoID.txt acomys.expand.go.list
    cat acomys.expand.go.list | tr "," "\n" | sort | uniq > acomys.expand.finalgo.txt
    

    实验室祖传pipeline

    ## step by step:
    ## (python2):
    python2.7 cafetutorial_clade_and_size_filter.py -i unfiltered_cafe_input.txt -o filtered_cafe_input.txt -s  # 其中输入文件是 input.tab 修改表头得到
    ## 运行两次 CAFE
    #第一次输入文件为 filtered_cafe_input.txt 为了得到 lambda 值
    #第二次输入文件为 large_filtered_cafe_input.txt 输入-l (L)参数指定 lambda 值
    ## 一般只描述第一次结果 第二次结果可以看看有没有需要 
    
    #OG2gene.py
    '''
    WYJ YouHahahaha
    This script was used in cafe v5
    Extract gene name based on OG name
    usage: python3 this_script OG-list Orthogroups.txt species-name output 
    '''
    
    import sys
    
    f1 = open(sys.argv[1],'r')
    f2 = open(sys.argv[2],'r')
    f3 = sys.argv[3]
    f4 = open(sys.argv[4],'w')
    
    og = f1.read().splitlines()
    q = f3 + "|"
    
    for line in f2:
        line = line.strip()
        n = line.split(":")[0]
        if n in og:
            f4.write(n+"\t")
            x = line.split()[1:]
            for i in x :
                if q in i:
                    f4.write(i+"\t")
            f4.write("\n")
    
    f1.close()
    f2.close()
    f4.close()
    
    '''
    This script was uesd trans gene2GO
    usage: python3 this_script file1 file2 output
    file 1 : 
        OG0000000       evm.model.LG01.1        evm.model.LG02.1062 .....
    file 2 :
        evm.model.LG17.814      GO:0005125
    '''
    
    import sys
    
    f1 = open(sys.argv[1],'r')
    f2 = open(sys.argv[2],'r')
    f3 = open(sys.argv[3],'w')
    
    d = []
    p = {}
    
    for line in f1:
        x = line.strip().split()[1:]
        for i in x:
            if i not in d:
                d.append(i)
    
    for line in f2:
        line = line.strip().split()
        p[line[0]] = line[1]
    
    for i in d:
        if i in p:
            x = p[i]
            f3.write(x+"\n")
    
    f1.close()
    f2.close()
    f3.close()
    
    # 根据go.annotation.xls 得到 GO ID  GO Name GO Category
    cat cishu.go.annotation.xls | awk -F "\t" '{print$2}' | grep -v "^-" | sed 's/),/XXX/g' | sed 's/XXX/\n/g' | sed 's/(/\t/' | sed 's/)$//' |  sort | uniq | grep -v "^B" |awk -F "\t" '{print$1"\t"$2"\t""Biological process"}' > GOID-NAME-BP.list
    #CC MF 类似
    cat GOID-NAME-BP.list GOID-NAME-CC.list GOID-NAME-MF.list  > GOID-NAME-ALL.list
    awk -F "\t" 'NR==FNR{a[$1]=$0; next} {print a[$1]}'  GOID-NAME-ALL.list  acomys.expand.finalgo.txt  > acomys.expand.GOID.txt
    

    后续富集工作不再赘述

    相关文章

      网友评论

        本文标题:比较基因组从入门到放弃(3)

        本文链接:https://www.haomeiwen.com/subject/tibxyltx.html