美文网首页Perl学习笔记TCGA数据分析
TCGA clinical_data.json中临床信息的提取

TCGA clinical_data.json中临床信息的提取

作者: dming1024 | 来源:发表于2019-09-29 21:47 被阅读0次

    上一次讲解了如何使用perl脚本提取metadata中的分组信息;更详细的临床病例信息则需要从clinical_data.json中提取后再进行分析。当然,用R语言也可以,晚些我会把R的代码也加进去。

    1.先来看下clinical_data.json数据结构
    是一个array,hash相互嵌套的数据结构,我们要提取的有:
    year_of_diagnosis
    classification_of_tumor
    last_known_disease_status
    primary_diagnosis
    等所有这些信息。

    1 [{
          2   "diagnoses": [
          3     {
          4       "year_of_diagnosis": 2007,
          5       "classification_of_tumor": "not reported",
          6       "last_known_disease_status": "not reported",
          7       "updated_datetime": "2019-08-08T17:35:29.350497-05:00",
          8       "primary_diagnosis": "Carcinoma, diffuse type",
          9       "submitter_id": "TCGA-MX-A5UG_diagnosis",
         10       "tumor_stage": "stage iiia",
         11       "age_at_diagnosis": 28714,
         12       "morphology": "8145/3",
         13       "days_to_last_known_disease_status": null,
         14       "created_datetime": null,
         15       "prior_treatment": "No",
         16       "ajcc_pathologic_n": "N1",
         17       "ajcc_pathologic_m": "M0",
         18       "state": "released",
         19       "days_to_last_follow_up": null,
         20       "days_to_recurrence": null,
         21       "diagnosis_id": "b566c2ed-3445-57f1-a432-97b53cda1733",
         22       "tumor_grade": "not reported",
         23       "treatments": [
         24         {
         25           "days_to_treatment_start": null,
         26           "updated_datetime": "2019-08-01T00:08:31.012165-05:00",
         27           "treatment_effect": null,
         28           "initial_disease_status": null,
         29           "treatment_type": "Pharmaceutical Therapy, NOS",
         30           "submitter_id": "TCGA-MX-A5UG_treatment_1",
         31           "treatment_id": "545fce87-ad86-5437-9b12-507644c3e28d",
         32           "created_datetime": "2019-04-28T15:49:21.905058-05:00",
         33           "state": "released",
         34           "therapeutic_agents": null,
         35           "regimen_or_line_of_therapy": null,
         36           "treatment_intent_type": null,
         37           "treatment_anatomic_site": null,
         38           "treatment_outcome": null,
         39           "days_to_treatment_end": null,
         40           "treatment_or_therapy": "no"
         41         },
         42         {
         43           "updated_datetime": "2019-08-01T00:08:31.012165-05:00",
         44           "created_datetime": null,
         45           "treatment_type": "Radiation Therapy, NOS",
         46           "submitter_id": "TCGA-MX-A5UG_treatment",
         47           "treatment_id": "f179fb0a-397e-53a7-a6b5-884b177402db",
         48           "state": "released",
         49           "therapeutic_agents": null,
         50           "treatment_intent_type": null,
    

    2. 利用Perl提取信息
    Perl语言具有强大的文本处理功能,对于上面这种json文件,提取代码如下:

    #! /usr/bin/perl -w
    # Extract clinical fields from a pretty-printed TCGA clinical JSON file.
    #
    # Usage: perl annotation2.pl <clinical.json> <output.txt>
    #
    # The input is scanned line by line (it is NOT parsed as JSON), so it must
    # be pretty-printed with one `"key": value` pair per line.  Every value seen
    # for a key is collected, and the output file gets one line per key, sorted
    # by key: the key, a space, then the values joined with commas.  Values of
    # the form "<id>_diagnosis" are special-cased: the <id> part is stored under
    # the key new_id instead of under the original key.
    use strict;
    use warnings;

    my ($file1, $file2) = @ARGV;
    die "usage: $0 <clinical.json> <output.txt>\n"
        unless defined $file1 && defined $file2;

    # Three-argument open with `or`: with `||` the die was attached to the
    # filename string (always true), so a failed open went unnoticed.
    open my $in,  '<', $file1 or die "cannot open file $file1: $!";
    open my $out, '>', $file2 or die "cannot write file $file2: $!";

    my %values_for;    # key => array ref of values, in file order
    while (my $line = <$in>) {
        $line =~ s/\r?\n\z//;    # tolerate CRLF line endings

        # Capture the whole value up to the end of the line, so values that
        # contain spaces ("not reported", "Stage IIIA") are kept intact.  The
        # trailing comma is dropped here because the last pair of an object
        # has none; separators are added back uniformly when printing.
        next unless $line =~ m{ \A \s* (?<key> "[^"]+" ) : \s+ (?<value> .+? ) ,? \s* \z }x;
        my ($key, $value) = ($+{key}, $+{value});

        # An opening bracket only starts a nested array/object; it is not a value.
        next if $value eq '[' || $value eq '{';

        if ($value =~ m{ \A " (?<id> \S+ ) _diagnosis " \z }x) {
            push @{ $values_for{new_id} }, $+{id};
        }
        else {
            push @{ $values_for{$key} }, $value;
        }
    }
    close $in;

    for my $key (sort keys %values_for) {
        print {$out} "$key ", join(',', @{ $values_for{$key} }), "\n";
    }

    # Buffered write errors only surface at close, so check it.
    close $out or die "cannot write file $file2: $!";
    

    3. 结果
    获得所有的临床信息后,根据需要进一步筛选即可。

     perl annotation2.pl clinical.cart.2019-09-28.json x3.txt
     cat x3.txt | less -SN
          1 "age_at_diagnosis"       28714,22792,25300,22883,27506,28037,28919,29107,29441,27594,30178,18318,26474,27549,32024,25506,28791,21936,25913,26290,25637,20700,21753,21244,2849
          2 "age_at_index"   78,62,69,62,75,76,79,79,80,75,82,50,72,75,87,69,78,60,70,71,70,56,59,58,78,66,70,64,66,78,64,44,58,63,55,74,59,62,66,54,77,63,45,72,83,60,86,86,57,43,69,84,
          3 "ajcc_pathologic_m"      "M0","M0","M0","M0","M0","M0","M0","MX","M0","M0","M0","M1","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0
          4 "ajcc_pathologic_n"      "N1","N0","N0","N3a","N2","N0","N0","N3","N1","N0","N0","N1","N1","N2","NX","N3a","N1","N1","N2","NX","N0","N1","N3","N1","N0","N2","N2","N1","N0","
          5 "ajcc_pathologic_stage"          "Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"S
          6 "ajcc_pathologic_t"      "T3""T2""T3""T3""T4a""T2""T3""T4a""TX""T1b""T3""T4""T4""T4a""TX""T4b""T3""T3""T4b""T2""T3""T3""T4""T4""T3""T2""T4b""T2""T3""TX""T1b""T4""T3""T4""T3"
          7 "ajcc_staging_system_edition"    "6th","7th","7th","7th","7th","7th","7th","7th","6th","7th","7th","6th","7th","7th","6th","7th","6th","7th","7th","6th","7th","7th","6th","5
          8 "alcohol_history"        "Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not
          9 "alcohol_intensity"      null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,nul
         10 "bmi"    null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null
         11 "case_id"        "f72a26e8-7f96-4d86-b37b-7dc35f681133","6e03b415-84a1-4b91-8717-1a41edd4a255","9ef7582b-d4c1-4036-a1ed-ef65aa46fc60","4020b1b1-576d-4869-9ff5-552e3afb3ab5",
         12 "cigarettes_per_day"     null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,nul
         13 "classification_of_tumor"        "not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not
         14 "created_datetime"       null,"2019-04-28T15:49:21.905058-05:00",null,null,null,null,null,"2019-04-28T15:42:39.023165-05:00",null,null,null,null,"2019-04-28T15:52:49.326357-
    

    转自“医学统计园”微信公众号,使用代码或转载请注明出处。

    相关文章

      网友评论

        本文标题:TCGA clinical_data.json中临床信息的提取

        本文链接:https://www.haomeiwen.com/subject/acstpctx.html