在之前的基础上:TCGA clinical_data.json中临床信息的提取,对之前的perl脚本进行改进。
#! /usr/bin/perl -w
# Collect "key: value," lines from a pretty-printed clinical JSON file and
# write one tab-separated row per key, with the values in file order.
#
# Usage: perl script.pl <clinical.json> <fileout.txt>
#
# The input is scanned line by line with a regex rather than parsed as JSON,
# so only lines of the form `key: value,` (with a trailing comma) are picked
# up; lines without a trailing comma are ignored.
use strict;
use warnings;

unless (@ARGV == 2) {
    # No system call has failed at this point, so $! is not reported here.
    die "usage: perl $0 <clinical.json> <fileout.txt>\n";
}
my ($file1, $file2) = @ARGV;

# Maps each key (as written in the file, including its quotes) to a string
# of tab-prefixed values accumulated in the order they were read.
my %hash;

# Three-argument open with low-precedence `or`, so that a failed open
# actually reaches die (with `||` the die was attached to the filename).
open my $in,  '<', $file1 or die "cannot open file : $!";
open my $out, '>', $file2 or die "cannot write file $!";

while (my $line = <$in>) {
    chomp $line;
    if ($line =~ /(\S+)\:\s(.*)\,/) {
        # Copy the captures before running another match, which would
        # overwrite $1/$2 on success.
        my ($key, $value) = ($1, $2);
        if ($value =~ /\"(\S+)(\_diagnosis)\"$/) {
            # A quoted value ending in "_diagnosis": store the part before
            # the suffix under the synthetic key "new_id".
            $hash{"new_id"} .= "\t$1";
        }
        else {
            $hash{$key} .= "\t$value";
        }
    }
}
close $in;

foreach my $k (sort keys %hash) {
    print {$out} "$k \t $hash{$k}\n";
}
# Buffered write errors only surface at close, so check it.
close $out or die "cannot write file $!";
这个是处理之后的脚本,相比之前,清爽了许多
1 "age_at_diagnosis" 28714 22792 25300 22883 27506 28037 28919 29107 29441 27594 30178 1831
2 "age_at_index" 78 62 69 62 75 76 79 79 80 75 82 50 72
3 "ajcc_pathologic_m" "M0" "M0" "M0" "M0" "M0" "M0" "M0" "MX" "M0" "M0" "M0" "M1"
4 "ajcc_pathologic_n" "N1" "N0" "N0" "N3a" "N2" "N0" "N0" "N3" "N1" "N0" "N0" "N1"
5 "ajcc_pathologic_stage" "Stage IIIA" "Stage IB" "Stage II" "Stage IIIB" "Stage IIIB" "Sta
6 "ajcc_staging_system_edition" "6th" "7th" "7th" "7th" "7th" "7th" "7th" "7th" "6th" "7th" "7th
7 "alcohol_history" "Not Reported" "Not Reported" "Not Reported" "Not Reported" "Not Reported" "Not Reporte
8 "alcohol_intensity" null null null null null null null null null null null null
9 "bmi" null null null null null null null null null null null null null null
10 "case_id" "f72a26e8-7f96-4d86-b37b-7dc35f681133" "6e03b415-84a1-4b91-8717-1a41edd4a255" "9ef7582b-d4c1-4036-
11 "cigarettes_per_day" null null null null null null null null null null null null
12 "classification_of_tumor" "not reported" "not reported" "not reported" "not reported" "not reported" "not
13 "created_datetime" null "2019-04-28T15:49:21.905058-05:00" null null null null null "201
14 "days_to_birth" -28714 -22792 -25300 -22883 -27506 -28037 -28919 -29107 -29441 -27594 -30178 -183
15 "days_to_death" 113 359 661 24 284 476 439 300 52 570 356 300
16 "days_to_diagnosis" 0 0 0 0 0 0 0 0 0 0 0 0
17 "days_to_last_follow_up" null 356 1072 11 838 754 null null 0 23 694
18 "days_to_last_known_disease_status" null null null null null null null null null null
19 "days_to_recurrence" null null null null null null null null null null null null
20 "days_to_treatment_end" null null null null null null null null null null null
网友评论