上一次讲解了如何使用Perl脚本提取metadata中的分组信息;对于更详细的临床病例信息,还需要从clinical_data.json中提取后再进行分析。当然,用R语言也可以,晚些我会把R的代码也加进去。
1.先来看下clinical_data.json数据结构
这是一个array与hash相互嵌套的数据结构,我们要提取的有:
year_of_diagnosis
classification_of_tumor
last_known_disease_status
primary_diagnosis
等所有的这些信息,
1 [{
2 "diagnoses": [
3 {
4 "year_of_diagnosis": 2007,
5 "classification_of_tumor": "not reported",
6 "last_known_disease_status": "not reported",
7 "updated_datetime": "2019-08-08T17:35:29.350497-05:00",
8 "primary_diagnosis": "Carcinoma, diffuse type",
9 "submitter_id": "TCGA-MX-A5UG_diagnosis",
10 "tumor_stage": "stage iiia",
11 "age_at_diagnosis": 28714,
12 "morphology": "8145/3",
13 "days_to_last_known_disease_status": null,
14 "created_datetime": null,
15 "prior_treatment": "No",
16 "ajcc_pathologic_n": "N1",
17 "ajcc_pathologic_m": "M0",
18 "state": "released",
19 "days_to_last_follow_up": null,
20 "days_to_recurrence": null,
21 "diagnosis_id": "b566c2ed-3445-57f1-a432-97b53cda1733",
22 "tumor_grade": "not reported",
23 "treatments": [
24 {
25 "days_to_treatment_start": null,
26 "updated_datetime": "2019-08-01T00:08:31.012165-05:00",
27 "treatment_effect": null,
28 "initial_disease_status": null,
29 "treatment_type": "Pharmaceutical Therapy, NOS",
30 "submitter_id": "TCGA-MX-A5UG_treatment_1",
31 "treatment_id": "545fce87-ad86-5437-9b12-507644c3e28d",
32 "created_datetime": "2019-04-28T15:49:21.905058-05:00",
33 "state": "released",
34 "therapeutic_agents": null,
35 "regimen_or_line_of_therapy": null,
36 "treatment_intent_type": null,
37 "treatment_anatomic_site": null,
38 "treatment_outcome": null,
39 "days_to_treatment_end": null,
40 "treatment_or_therapy": "no"
41 },
42 {
43 "updated_datetime": "2019-08-01T00:08:31.012165-05:00",
44 "created_datetime": null,
45 "treatment_type": "Radiation Therapy, NOS",
46 "submitter_id": "TCGA-MX-A5UG_treatment",
47 "treatment_id": "f179fb0a-397e-53a7-a6b5-884b177402db",
48 "state": "released",
49 "therapeutic_agents": null,
50 "treatment_intent_type": null,
2. 利用Perl提取信息
Perl语言具有强大的文本处理功能,对于该JSON文件,提取代码如下:
#! /usr/bin/perl -w
#
# Usage: perl annotation2.pl <clinical.json> <output.txt>
#
# Scans the input file line by line, collects every `key: value` pair it
# finds, and writes one line per distinct key to the output file in the form
# "<key> <all values seen for that key, concatenated in input order>".
#
# The input is not parsed as JSON; each line is matched with a regex, so the
# script relies on every pair sitting on its own line.
use strict;
use warnings;

my ($file1, $file2) = @ARGV;
die "usage: $0 <clinical.json> <output.txt>\n"
    unless defined $file1 && defined $file2;

# key (as written in the file, including its double quotes) => concatenated values
my %hash;

# Three-argument open with low-precedence `or`, so a failed open really dies.
# (`open F, "$file" || die ...` never dies: `||` binds to the string operand.)
open my $in,  '<', $file1 or die "cannot open file $file1: $!";
open my $out, '>', $file2 or die "cannot write file $file2: $!";

while (my $line = <$in>) {
    chomp $line;

    # Key is the non-space run before ": "; value is the non-space run after it.
    # NOTE: because the value is captured with \S+, a multi-word value such as
    # "not reported" is cut at its first space, and any trailing comma is kept.
    next unless $line =~ /(\S+)\:\s(\S+)/;
    my ($key, $value) = ($1, $2);

    if ($value =~ /\"(\S+)(\_diagnosis\"\,)$/) {
        # A value ending in `_diagnosis",` is stored under the synthetic key
        # new_id with the surrounding quote and `_diagnosis` suffix removed.
        # NOTE: successive ids are appended without any separator.
        $hash{new_id} .= $1;
    }
    else {
        $hash{$key} .= $value;
    }
}
close $in;

foreach my $k (sort keys %hash) {
    print {$out} "$k $hash{$k}\n";
}

# Buffered write errors only surface at close, so check it.
close $out or die "cannot close file $file2: $!";
3. 结果
获得所有的临床信息之后,就可以根据需要进一步筛选了
perl annotation2.pl clinical.cart.2019-09-28.json x3.txt
cat x3.txt | less -SN
1 "age_at_diagnosis" 28714,22792,25300,22883,27506,28037,28919,29107,29441,27594,30178,18318,26474,27549,32024,25506,28791,21936,25913,26290,25637,20700,21753,21244,2849
2 "age_at_index" 78,62,69,62,75,76,79,79,80,75,82,50,72,75,87,69,78,60,70,71,70,56,59,58,78,66,70,64,66,78,64,44,58,63,55,74,59,62,66,54,77,63,45,72,83,60,86,86,57,43,69,84,
3 "ajcc_pathologic_m" "M0","M0","M0","M0","M0","M0","M0","MX","M0","M0","M0","M1","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0
4 "ajcc_pathologic_n" "N1","N0","N0","N3a","N2","N0","N0","N3","N1","N0","N0","N1","N1","N2","NX","N3a","N1","N1","N2","NX","N0","N1","N3","N1","N0","N2","N2","N1","N0","
5 "ajcc_pathologic_stage" "Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"S
6 "ajcc_pathologic_t" "T3""T2""T3""T3""T4a""T2""T3""T4a""TX""T1b""T3""T4""T4""T4a""TX""T4b""T3""T3""T4b""T2""T3""T3""T4""T4""T3""T2""T4b""T2""T3""TX""T1b""T4""T3""T4""T3"
7 "ajcc_staging_system_edition" "6th","7th","7th","7th","7th","7th","7th","7th","6th","7th","7th","6th","7th","7th","6th","7th","6th","7th","7th","6th","7th","7th","6th","5
8 "alcohol_history" "Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not
9 "alcohol_intensity" null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,nul
10 "bmi" null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null
11 "case_id" "f72a26e8-7f96-4d86-b37b-7dc35f681133","6e03b415-84a1-4b91-8717-1a41edd4a255","9ef7582b-d4c1-4036-a1ed-ef65aa46fc60","4020b1b1-576d-4869-9ff5-552e3afb3ab5",
12 "cigarettes_per_day" null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,nul
13 "classification_of_tumor" "not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not
14 "created_datetime" null,"2019-04-28T15:49:21.905058-05:00",null,null,null,null,null,"2019-04-28T15:42:39.023165-05:00",null,null,null,null,"2019-04-28T15:52:49.326357-
转自“医学统计园”微信公众号,使用代码或转载请注明出处。
网友评论