欢迎关注Bioinfor 生信云 微信公众号!
上一步定量,我们是对每一个样本进行的,现在需要将它合并到一个矩阵。这里通过一个脚本merge_to_matrix.pl 来合并。
脚本
#!/usr/bin/env perl
use strict;
use warnings;
use FindBin;
use File::Basename;
use Getopt::Long qw(:config no_ignore_case bundling pass_through);
my $usage = <<__EOUSAGE__;
############################################################
#
# Usage: $0 --est_method <method> sample1.results sample2.results ...
#
# or $0 --est_method <method> --quant_files file.listing_target_files.txt
#
# Note, if only a single input file is given, it's expected to contain the paths to all the target abundance estimation files.
#
# Required:
#
# --est_method <string> featureCounts|RSEM|eXpress|kallisto|salmon (needs to know what format to expect)
#
# Options:
#
# --cross_sample_norm <string> TMM|UpperQuartile|none (default: TMM)
#
# --name_sample_by_basedir name sample column by dirname instead of filename
# --basedir_index <int> default(-2)
#
# --out_prefix <string> default: 'matrix'
#
# --quant_files <string> file containing a list of all the target files.
#
############################################################
__EOUSAGE__
;
my $help_flag;
my $est_method;
my $val_type;
my $cross_sample_norm = "TMM";
my $name_sample_by_basedir = 0;
my $out_prefix = "matrix";
my $basedir_index = -2;
my $quant_files = "";
&GetOptions('help|h' => \$help_flag,
'est_method=s' => \$est_method,
'cross_sample_norm=s' => \$cross_sample_norm,
'name_sample_by_basedir' => \$name_sample_by_basedir,
'out_prefix=s' => \$out_prefix,
'basedir_index=i' => \$basedir_index,
'quant_files=s' => \$quant_files,
);
unless ($est_method && (@ARGV || $quant_files)) {
die $usage;
}
unless ($est_method =~ /^(featureCounts|RSEM|eXpress|kallisto|salmon)/i) {
die "Error, dont recognize --est_method $est_method ";
}
unless ($cross_sample_norm =~ /^(TMM|UpperQuartile|none)$/i) {
die "Error, dont recognize --cross_sample_norm $cross_sample_norm ";
}
my @files;
if ($quant_files) {
# allow for a file listing the various files.
@files = `cat $quant_files`;
chomp @files;
}
elsif (@ARGV) {
@files = @ARGV;
}
else {
die $usage;
}
=data_formats
## featureCounts
0 gene_id
1 counts
2 fpkm
3 tpm
## RSEM:
0 transcript_id
1 gene_id
2 length
3 effective_length
4 expected_count
5 TPM
6 FPKM
7 IsoPct
## eXpress v1.5:
1 target_id
2 length
3 eff_length
4 tot_counts
5 uniq_counts
6 est_counts
7 eff_counts
8 ambig_distr_alpha
9 ambig_distr_beta
10 fpkm
11 fpkm_conf_low
12 fpkm_conf_high
13 solvable
14 tpm
## kallisto:
0 target_id
1 length
2 eff_length
3 est_counts
4 tpm
## salmon:
0 Name
1 Length
2 EffectiveLength
3 TPM
4 NumReads
=cut
;
my ($acc_field, $counts_field, $fpkm_field, $tpm_field);
if ($est_method =~ /^rsem$/i) {
$acc_field = 0;
$counts_field = "expected_count";
$fpkm_field = "FPKM";
$tpm_field = "TPM";
}
elsif ($est_method =~ /^express$/i) { # as of v1.5
$acc_field = "target_id";
$counts_field = "eff_counts";
$fpkm_field = "fpkm";
$tpm_field = "tpm";
}
elsif ($est_method =~ /^kallisto$/i) {
$acc_field = "target_id";
$counts_field = "est_counts";
$fpkm_field = "tpm";
$tpm_field = "tpm";
}
elsif ($est_method =~ /^salmon/) {
$acc_field = "Name";
$counts_field = "NumReads";
$fpkm_field = "TPM";
$tpm_field = "TPM";
}
elsif ($est_method =~ /^featureCounts/) {
$acc_field = "gene_id";
$counts_field = "counts";
$fpkm_field = "fpkm";
$tpm_field = "tpm";
}
else {
die "Error, dont recognize --est_method [$est_method] ";
}
main: {
my %data;
foreach my $file (@files) {
print STDERR "-reading file: $file\n";
open (my $fh, $file) or die "Error, cannot open file $file";
my $header = <$fh>;
chomp $header;
my %fields = &parse_field_positions($header);
#use Data::Dumper; print STDERR Dumper(\%fields);
while (<$fh>) {
chomp;
my @x = split(/\t/);
my $acc = $x[ $fields{$acc_field} ];
my $count = $x[ $fields{$counts_field} ];
my $fpkm = $x[ $fields{$fpkm_field} ];
my $tpm = $x[ $fields{$tpm_field} ];
$data{$acc}->{$file}->{count} = $count;
$data{$acc}->{$file}->{FPKM} = $fpkm;
$data{$acc}->{$file}->{TPM} = $tpm;
}
close $fh;
}
my @filenames = @files;
foreach my $file (@filenames) {
if ($name_sample_by_basedir) {
my @path = split(m|/|, $file);
$file = $path[$basedir_index];
}
else {
$file = basename($file);
}
}
print STDERR "\n\n* Outputting combined matrix.\n\n";
my $counts_matrix_file = "$out_prefix.counts.matrix";
my $TPM_matrix_file = "$out_prefix.TPM.not_cross_norm";
open (my $ofh_counts, ">$counts_matrix_file") or die "Error, cannot write file $counts_matrix_file";
open (my $ofh_TPM, ">$TPM_matrix_file") or die "Error, cannot write file $TPM_matrix_file";
{ # check to see if they're unique
my %filename_map = map { + $_ => 1 } @filenames;
if (scalar keys %filename_map != scalar @filenames) {
die "Error, the column headings: @filenames are not unique. Should you consider using the --name_sample_by_basedir parameter?";
}
}
脚本用法
perl merge_to_matrix.pl \
--est_method featureCounts \#定量使用的软件
--quant_files genes.quant_files.txt \#定量的样本
--out_prefix genes #输出文件前缀
输入:每个样本的定量结果
输出:
reads count 矩阵: genes.counts.matrix #原始矩阵
表达量矩阵:genes.TMM.EXPR.matrix #标准化后的矩阵
网友评论