之前已介绍过如何使用perl、R处理从TCGA下载的metadata.json文件,现在尝试采用python脚本来处理,总有一款适合你。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Oct 27 10:42:07 2019
@author: qiangqiangfan
"""
import re

import pandas as pd

# Matches one `"key": value,` line of the pretty-printed JSON.  The key and
# any string value keep their surrounding double quotes.  A trailing comma is
# required, so the last member of an object (which has none) is not captured.
KEY_VALUE_PATTERN = re.compile(r'(\S+):\s(.*),')

# Used with .match(), so it accepts any value that contains "gz".  It is not
# anchored at the end because the captured values still carry a closing quote.
GZ_NAME_PATTERN = re.compile(r'.*gz')


def collect_fields(lines):
    """Group the values of every `"key": value,` line by key.

    Args:
        lines: iterable of text lines from the metadata file.

    Returns:
        A dict mapping each key (double quotes included) to the list of its
        values in order of appearance.  Each value is stored whole, also on
        the first occurrence of a key, so values containing spaces are not
        broken into fragments.
    """
    fields = {}
    for line in lines:
        match = KEY_VALUE_PATTERN.match(line.strip())
        if match:
            fields.setdefault(match.group(1), []).append(match.group(2))
    return fields


def build_table(fields):
    """Build the file_name / submitter_id table from the collected fields.

    Args:
        fields: dict as returned by `collect_fields`.

    Returns:
        A DataFrame with one row per position and the columns `file_name`
        and `submitter_id`.  The two columns are paired purely by position.

    Raises:
        KeyError: if `"file_name"` or `"entity_submitter_id"` was not found.
    """
    # Keep only the file names that match GZ_NAME_PATTERN.
    file_name = [
        name for name in fields['"file_name"'] if GZ_NAME_PATTERN.match(name)
    ]
    entity_submitter_id = fields['"entity_submitter_id"']
    df_result = pd.DataFrame(
        [file_name, entity_submitter_id],
        index=['file_name', 'submitter_id'],
    )
    return df_result.T


def main(metadata_path="metadata.json", output_path="result.csv"):
    """Read the metadata file and write the file-name/submitter-id CSV.

    Args:
        metadata_path: path of the metadata JSON file to read.
        output_path: path of the CSV file to write.
    """
    # The context manager closes the file even if parsing raises.
    with open(metadata_path, "r", encoding="utf-8") as data:
        result = collect_fields(data)
    build_table(result).to_csv(output_path)


if __name__ == "__main__":
    main()
python脚本运行完成后,会在当前目录下产生一个result.csv文件,打开之后是这个样子:
head -10 result.csv
,file_name,submitter_id
0,"""b7abd283-60c7-415c-97da-61473d81e4ad.htseq.counts.gz""","""TCGA-DD-AAD0-01A-11R-A41C-07"""
1,"""c7fd15b5-1f15-4fe9-b3ce-77bf232bd110.htseq.counts.gz""","""TCGA-DD-A114-01A-11R-A131-07"""
2,"""d11a42d8-6486-4288-bf5c-864448bb1fd6.htseq.counts.gz""","""TCGA-CC-A9FV-01A-11R-A37K-07"""
3,"""28ef84e7-f566-40a7-b239-26d3e23c878a.htseq.counts.gz""","""TCGA-DD-A1EC-01A-21R-A131-07"""
4,"""eb7813d6-77f1-41e4-a2c2-a4f4ec42d594.htseq.counts.gz""","""TCGA-EP-A3RK-01A-11R-A22L-07"""
5,"""491e50ed-21d3-4dc6-ae6c-e56f94e22ca4.htseq.counts.gz""","""TCGA-DD-A1EE-11A-11R-A131-07"""
6,"""52d8680b-0df6-4605-bca4-3539f3e0e4f0.htseq.counts.gz""","""TCGA-G3-AAV7-01A-11R-A38B-07"""
7,"""16bbd77f-b39b-4f19-9d6b-58a21dde3e84.htseq.counts.gz""","""TCGA-DD-AAE7-01A-11R-A41C-07"""
8,"""94425c86-3918-4de4-a821-3912a27f098d.htseq.counts.gz""","""TCGA-CC-A7IE-01A-21R-A38B-07"""
网友评论