
[Perl] Daily Code Notes

Author: oddxix | Published 2018-12-14 20:48

    Welcome to follow my WeChat official account: oddxix

    I have been putting pipelines together recently, which means writing Perl all the time. I used to avoid learning Perl, but after writing it a few times I am much less resistant to it; the fastest way to learn is still to solve real problems. At first everything I wrote threw errors, then I read other people's programs, tried rewriting them, and eventually I could write things on my own. I am recording part of it here and will keep adding more later. It is all a process, and what I write now is probably not the most concise; if there is a simpler way, I would be glad to hear about it.

    1. Compiling statistics on reads aligned to the genome

    The data look like this:

    ## net.sf.picard.metrics.StringHeader
    # net.sf.picard.sam.MarkDuplicates INPUT=[./3.mapping/SUNchao.sort.bam] OUTPUT=./5.varcalling/SUNchao.sort.rmdup.bam METRICS_FILE=./5.varcalling/SUNchao.rmdup.metrics REMOVE_DUPLICATES=true ASSUME_SORTED=true MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=10000 TMP_DIR=[./tmp] VALIDATION_STRINGENCY=SILENT    PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 SORTING_COLLECTION_SIZE_RATIO=0.25 READ_NAME_REGEX=[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).* OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false
    ## net.sf.picard.metrics.StringHeader
    # Started on: Tue Nov 27 13:54:59 CST 2018
    
    ## METRICS CLASS        net.sf.picard.sam.DuplicationMetrics
    LIBRARY UNPAIRED_READS_EXAMINED READ_PAIRS_EXAMINED     UNMAPPED_READS  UNPAIRED_READ_DUPLICATES        READ_PAIR_DUPLICATES    READ_PAIR_OPTICAL_DUPLICATES    PERCENT_DUPLICATION     ESTIMATED_LIBRARY_SIZE
    SUNchao 65839   46502770        74979   27346   5123866 3758682 0.1104  654836274
    

    The first task is to extract the 0.1104 value (the PERCENT_DUPLICATION column) from the files ending in *.rmdup.metrics.

    The code is as follows:

    ##enable Perl's warnings and strict checks
    use warnings;
    use strict;
    
    #hash of arrays: sample prefix => list of statistics
    my %h;
    #collect the matching files into an array
    my @PCR_dup = glob "*.rmdup.metrics";
    
    #open files with handles: ">" means write, "<" means read
    open FHB, ">", "3.mapping_statistics.txt" or die $!;
    #print the header line
    print FHB "Samples","\t","PCR_dup Rates","\t","Reference_Mapped_reads","\t","Reference Mapped Data (Mb)","\n";
    
    #foreach loop over the matched metrics files
    foreach my $PCR_dup (@PCR_dup){
            #extract the sample prefix from the file name
            my ($prefix) = $PCR_dup =~ /^(.*?)\.rmdup\.metrics/;
            open FHA, "<", $PCR_dup or die $!;
            #read the file line by line
            while(my $data = <FHA>){
                    #regular expression that grabs PERCENT_DUPLICATION from the metrics row
                    if($data =~ /^$prefix.*\t(\S+)\t(\d+)\n$/){
                            my $a = $1;
                            push @{$h{$prefix}}, $a;
                    }
            }
            close FHA;
    }
    
    my @depth_stat = glob "*.depth.stat";
    
    #each foreach loop extracts data from one class of files; the idea is always the same
    foreach my $depth_stat (@depth_stat){
            my ($prefix) = $depth_stat =~ /^(.*?)\.depth\.stat/;
            open FHC, "<", $depth_stat or die $!;
    
            while(my $line = <FHC>){
                    if ($line =~ /Depth\t(.*?)$/){
                            my $Reference_Mapped_reads = $1;
                            my $tmp = $Reference_Mapped_reads/1000000;
                            my $Reference_Mapped_Data = sprintf "%.1f", $tmp;
                            push @{$h{$prefix}}, $Reference_Mapped_reads;
                            push @{$h{$prefix}}, $Reference_Mapped_Data;
                    }
            }
            close FHC;
    }
    
    #write one row per sample: the prefix followed by its collected statistics
    foreach my $k (keys %h){
            print FHB $k;
            foreach my $s (@{$h{$k}}){
                    print FHB "\t", $s;
            }
            print FHB "\n";
    }
    close FHB;
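
    On the "simpler way" point: instead of hard-coding the tab pattern, the PERCENT_DUPLICATION field can also be picked out by column name. A minimal sketch, not part of the pipeline above, assuming only that the column-name line starts with LIBRARY and that the data row is the first non-empty line after it:

    use strict;
    use warnings;
    
    foreach my $metrics (glob "*.rmdup.metrics"){
            my ($prefix) = $metrics =~ /^(.*?)\.rmdup\.metrics/;
            open my $fh, "<", $metrics or die "cannot open $metrics: $!";
            my @header;
            while (my $line = <$fh>){
                    chomp $line;
                    #remember the column names from the header row
                    if ($line =~ /^LIBRARY\t/){
                            @header = split /\t/, $line;
                    }
                    #the first non-empty line after the header is the data row
                    elsif (@header and $line =~ /\S/){
                            my %row;
                            @row{@header} = split /\t/, $line;
                            print "$prefix\t$row{PERCENT_DUPLICATION}\n";
                            last;
                    }
            }
            close $fh;
    }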
    
    
    

    Things I learned from writing this program (see the short sketch below):
    1. How to use glob; a backticked ls call can do something similar.
    2. Combining a hash with arrays (a hash of arrays): push @{$h{$prefix}}, $a;
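
    A self-contained sketch of those two points (the sample name and values are made up purely for illustration):

    use strict;
    use warnings;
    
    #1. glob and backtick ls both return matching file names;
    #   glob needs no external command and adds no trailing newlines
    my @by_glob = glob "*.rmdup.metrics";
    my @by_ls   = `ls *.rmdup.metrics`;   #every element ends in "\n"
    chomp @by_ls;
    
    #2. hash of arrays: each key holds a reference to its own array,
    #   and push @{$h{$key}} appends to that key's array
    my %h;
    push @{$h{"sampleA"}}, 0.1104;        #made-up values, just to show the structure
    push @{$h{"sampleA"}}, "46502770";
    
    foreach my $k (keys %h){
            print join("\t", $k, @{$h{$k}}), "\n";
    }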

    batch.pl: generating shell scripts in batch

    This is mainly used to generate shell scripts in batch. Sometimes there are many samples and the same operation has to be run on each one. Once the scripts are generated, run them all with: for i in *.sh; do nohup bash $i & done

    use strict;
    use warnings;
    
    #list the R1 fastq files under ../rawdata
    my @file=`ls ../rawdata/*_R1.fastq`;
    
    foreach my $d (@file){
            chomp $d;
            print $d,"\n";
            #pull the sample name out of the file path
            my $n;
            if ($d=~/\.\.\/rawdata\/(.*)_R1\.fastq/){
                    $n=$1;
            }
    
            #replace the placeholder "aaa" in the template aaa.sh with the sample name
            `sed 's/aaa/$n/g' aaa.sh > $n.sh`;
    }
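
    The same placeholder substitution can also be done without calling sed. A minimal sketch, assuming aaa.sh is a template in which the literal string aaa stands for the sample name:

    use strict;
    use warnings;
    
    foreach my $d (glob "../rawdata/*_R1.fastq"){
            #pull the sample name out of the file path
            my ($n) = $d =~ m{\.\./rawdata/(.*)_R1\.fastq};
            next unless defined $n;
    
            open my $in,  "<", "aaa.sh" or die "cannot read aaa.sh: $!";
            open my $out, ">", "$n.sh"  or die "cannot write $n.sh: $!";
            while (my $line = <$in>){
                    $line =~ s/aaa/$n/g;   #the same substitution sed performs
                    print $out $line;
            }
            close $in;
            close $out;
    }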
    
    

    Parallel.pl: mainly for running batch operations while controlling how many run in parallel

    #!/usr/bin/perl
    use strict;
    use warnings;
    use Parallel::ForkManager;
    
    my @files = glob "*_R1.fq.gz";
    #the number passed to new() is the maximum number of parallel processes
    my $pm = Parallel::ForkManager->new(10);
    foreach my $file (@files){
        #start() forks: the parent gets the child PID (true) and skips to the next file,
        #while the child gets 0 and runs the command below
        $pm->start and next;
        my $file2=$file=~s/_R1/_R2/r;
        my $name=$file=~s/_R1\.fq\.gz//r;
        #put the per-sample command you want to run in batch here
        `perl ../single_QC_map_snp.pl $file $file2 ../bed.xls ../chr.pos ../1.map/$name/$name`;
        #finish() ends the child process
        $pm->finish;
    }
    #block until every child has finished
    $pm->wait_all_children;
    

    #compute the mean depth per chromosome from each *.exome-depth file and
    #write a <prefix>.chr.barplot table (one chromosome per row) for plotting
    use warnings;
    use strict;
    
    my %i;    #per-chromosome line count
    my %sum;  #per-chromosome summed depth
    my %dep;  #per-chromosome mean depth
    my @chr=("chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22","chrX","chrY");
    
    my @file=glob "*.exome-depth";
    foreach my $file (@file){
            #extract the sample prefix from the file name
            my ($prefix) = $file =~ /^(.*?)\.exome-depth/;
            open FHA, "<", $file or die $!;
            open FHB, ">", "$prefix.chr.barplot" or die $!;
            print FHB "Chr","\t",$prefix,"\n";
    
            #reset the counters for this file
            foreach my $chr (@chr){
                    $sum{$chr} = 0;
                    $i{$chr}   = 0;
                    $dep{$chr} = 0;
            }
    
            while(my $line=<FHA>){
                    my @line=split /\t/,$line;
                    foreach my $chr (@chr){
                            if ($line[0] eq $chr){
                                    #accumulate depth (column 3) and keep a running mean
                                    $sum{$chr}+=$line[2];
                                    $i{$chr} +=1;
                                    $dep{$chr}=$sum{$chr}/$i{$chr};
                            }
                    }
            }
    
            #one row per chromosome: name and mean depth
            foreach my $k (@chr){
                    print FHB $k,"\t",sprintf("%.2f",$dep{$k}),"\n";
            }
            close FHA;
            close FHB;
    }
