美文网首页Perl学习笔记
clinical.json 数据中注释信息提取

clinical.json 数据中注释信息提取

作者: dming1024 | 来源:发表于2019-10-06 20:57 被阅读0次

    在之前的基础上:TCGA clinical_data.json中临床信息的提取,对之前的perl脚本进行改进。

    #! /usr/bin/perl -w
    # Collects `key: value,` lines from a pretty-printed clinical.json file and
    # writes one row per key, with every value seen for that key appended in
    # file order, separated by tabs.
    #
    # Usage: perl script.pl <clinical.json> <fileout.txt>
    use strict;
    use warnings;

    unless (@ARGV == 2) {
        # No system call has failed here, so $! carries no useful information.
        die "usage: perl $0 <clinical.json> <fileout.txt>\n";
    }

    my ($file1, $file2) = @ARGV;

    # key => tab-prefixed concatenation of every value seen for that key
    my %hash;

    # Three-argument open with low-precedence `or`: with `||` the die would bind
    # to the (always true) filename string and a failed open would go unnoticed.
    open my $in,  '<', $file1 or die "cannot open file $file1: $!";
    open my $out, '>', $file2 or die "cannot write file $file2: $!";

    while (my $line = <$in>) {
        chomp $line;

        # Only lines of the form `<key>: <value>,` are considered; a line with
        # no comma after the value does not match and is skipped.
        next unless $line =~ /(\S+)\:\s(.*)\,/;

        # Copy the captures before running another match that would reset them.
        my ($key, $value) = ($1, $2);

        if ($value =~ /\"(\S+)(\_diagnosis)\"$/) {
            # A quoted value ending in `_diagnosis` is stored under the
            # synthetic key "new_id", with that suffix stripped.
            $hash{"new_id"} .= "\t$1";
        }
        else {
            $hash{$key} .= "\t$value";
        }
    }

    close $in;

    foreach my $k (sort keys %hash) {
        print {$out} "$k \t $hash{$k}\n";
    }

    # Buffered write errors only surface at close time, so check it.
    close $out or die "cannot close file $file2: $!";
    

    这个是处理之后的脚本,相比之前,清爽了许多

          1 "age_at_diagnosis"              28714   22792   25300   22883   27506   28037   28919   29107   29441   27594   30178   1831
          2 "age_at_index"          78      62      69      62      75      76      79      79      80      75      82      50      72
          3 "ajcc_pathologic_m"             "M0"    "M0"    "M0"    "M0"    "M0"    "M0"    "M0"    "MX"    "M0"    "M0"    "M0"    "M1"
          4 "ajcc_pathologic_n"             "N1"    "N0"    "N0"    "N3a"   "N2"    "N0"    "N0"    "N3"    "N1"    "N0"    "N0"    "N1"
          5 "ajcc_pathologic_stage"                 "Stage IIIA"    "Stage IB"      "Stage II"      "Stage IIIB"    "Stage IIIB"    "Sta
          6 "ajcc_staging_system_edition"           "6th"   "7th"   "7th"   "7th"   "7th"   "7th"   "7th"   "7th"   "6th"   "7th"   "7th
          7 "alcohol_history"               "Not Reported"  "Not Reported"  "Not Reported"  "Not Reported"  "Not Reported"  "Not Reporte
          8 "alcohol_intensity"             null    null    null    null    null    null    null    null    null    null    null    null
          9 "bmi"           null    null    null    null    null    null    null    null    null    null    null    null    null    null
         10 "case_id"               "f72a26e8-7f96-4d86-b37b-7dc35f681133"  "6e03b415-84a1-4b91-8717-1a41edd4a255"  "9ef7582b-d4c1-4036-
         11 "cigarettes_per_day"            null    null    null    null    null    null    null    null    null    null    null    null
         12 "classification_of_tumor"               "not reported"  "not reported"  "not reported"  "not reported"  "not reported"  "not
         13 "created_datetime"              null    "2019-04-28T15:49:21.905058-05:00"      null    null    null    null    null    "201
         14 "days_to_birth"                 -28714  -22792  -25300  -22883  -27506  -28037  -28919  -29107  -29441  -27594  -30178  -183
         15 "days_to_death"                 113     359     661     24      284     476     439     300     52      570     356     300
         16 "days_to_diagnosis"             0       0       0       0       0       0       0       0       0       0       0       0
         17 "days_to_last_follow_up"                null    356     1072    11      838     754     null    null    0       23      694
         18 "days_to_last_known_disease_status"             null    null    null    null    null    null    null    null    null    null
         19 "days_to_recurrence"            null    null    null    null    null    null    null    null    null    null    null    null
         20 "days_to_treatment_end"                 null    null    null    null    null    null    null    null    null    null    null
    

    相关文章

      网友评论

        本文标题:clinical.json 数据中注释信息提取

        本文链接:https://www.haomeiwen.com/subject/awgupctx.html