美文网首页Tcga生信TCGA
TCGA转录组数据及临床数据下载及整理

TCGA转录组数据及临床数据下载及整理

作者: 萍智医信 | 来源:发表于2021-07-26 16:53 被阅读0次

    一、TCGA转录组数据下载及整理

    ①进入网站后先查看cart是否为空,如果不为空,要先清空。

    ②下载转录组数据

    20210726163909.png

    基因表达量包括LncRNA和mRNA,用校正后的FPKM
    下载3个文件


    20210726164909.png

    ③下载数据处理

    下载的压缩包先以该压缩包名称解压,然后用perl将各个文件夹中的压缩包汇总到一个文件夹内,perl代码如下:

    use strict;
    use warnings;

    use File::Copy;

    # Collect every *.gz file that sits one directory level below the current
    # directory into a single folder named "files" (created if missing).
    # Run this script from the directory where the download was unpacked.

    my $newDir = "files";
    unless (-d $newDir) {
        mkdir $newDir or die "Cannot create $newDir: $!";
    }

    # Snapshot the top-level entries first so the scan is not affected by
    # anything created while copying.
    opendir(my $rootDh, ".") or die "Cannot open current directory: $!";
    my @allFiles = readdir($rootDh);
    closedir($rootDh);

    foreach my $subDir (@allFiles) {
        next if $subDir eq '.' or $subDir eq '..';
        next unless -d $subDir;
        # Never scan the destination folder itself.
        next if $subDir eq $newDir;

        opendir(my $subDh, "./$subDir") or die "Cannot open $subDir: $!";
        # Explicit defined() so an entry literally named "0" cannot end the loop.
        while (defined(my $file = readdir($subDh))) {
            next unless $file =~ /\.gz$/;
            copy("$subDir/$file", "$newDir") or die "Copy failed: $!";
        }
        # A directory handle must be released with closedir, not close.
        closedir($subDh);
    }
    

    然后再将文件中的压缩包解压,结果如下

    每个样品基因表达量.png

    将所有文件合并,正常样本在前面,肿瘤样品在后面

    操作方法:将metadata.cart文件和perl脚本放到上图所示、存放每个样品基因表达量文件的文件夹中,以metadata.cart文件作为参数运行perl脚本,并用txt文件记录脚本输出的正常和肿瘤样品数目

    9[WXJLT]X2F@SWXZ4FZZOO.png
    perl代码如下

    use strict;
    use warnings;

    use JSON;

    # Merge per-sample expression files into one tab-separated matrix.
    #
    # Usage: perl <this script> <metadata JSON file>
    #
    # The metadata file is expected to decode to an array of records, each with
    # a 'file_name' and an 'associated_entities' list whose first element holds
    # the 'entity_submitter_id' (the sample barcode).  Only records whose
    # decompressed file exists in the current directory are used.
    #
    # Output: mRNAmatrix.txt with one row per feature id; normal samples come
    # first, tumor samples after them.  The two sample counts go to STDOUT.

    my $file = $ARGV[0];
    die "Usage: perl $0 <metadata.json>\n" unless defined $file;

    my $json = JSON->new;

    my %hash = ();
    my @normalSamples = ();
    my @tumorSamples = ();

    # Slurp the whole metadata file; a failed open is reported instead of
    # surfacing later as a confusing JSON decode error.
    open(my $jsonFh, '<', $file) or die "Cannot open $file: $!";
    my $js = do { local $/; <$jsonFh> };
    close($jsonFh);
    my $obj = $json->decode($js);

    for my $i (@{$obj}) {
        my $file_name = $i->{'file_name'};
        my $entity_submitter_id = $i->{'associated_entities'}->[0]->{'entity_submitter_id'};
        next unless defined $file_name && defined $entity_submitter_id;

        # Strip only the trailing ".gz"; the file on disk is already unpacked.
        $file_name =~ s/\.gz$//;
        next unless -f $file_name;

        # The 4th dash-separated field of the barcode starts with the sample
        # type; a leading "0" is treated as tumor, everything else as normal.
        my @idArr = split(/\-/, $entity_submitter_id);
        if (defined $idArr[3] && $idArr[3] =~ /^0/) {
            push(@tumorSamples, $entity_submitter_id);
        }
        else {
            push(@normalSamples, $entity_submitter_id);
        }

        open(my $expFh, '<', $file_name) or die "Cannot open $file_name: $!";
        while (my $line = <$expFh>) {
            next if ($line =~ /^\n/);    # blank line
            next if ($line =~ /^\_/);    # lines starting with "_" are not features
            chomp($line);
            my @arr = split(/\t/, $line);
            $hash{$arr[0]}{$entity_submitter_id} = $arr[1];
        }
        close($expFh);
    }

    open(my $outFh, '>', "mRNAmatrix.txt") or die "Cannot write mRNAmatrix.txt: $!";
    my $normalCount = scalar @normalSamples;
    my $tumorCount = scalar @tumorSamples;

    # One join builds the header, so an empty normal or tumor group cannot
    # leave a stray tab that misaligns the header with the data rows.
    my @orderedSamples = (@normalSamples, @tumorSamples);
    print $outFh join("\t", "id", @orderedSamples) . "\n";

    # Sorted keys make the output reproducible between runs.
    foreach my $key (sort keys %hash) {
        my @values = map { defined $hash{$key}{$_} ? $hash{$key}{$_} : "" } @orderedSamples;
        print $outFh join("\t", $key, @values) . "\n";
    }
    # Buffered write errors only show up at close, so check it.
    close($outFh) or die "Error closing mRNAmatrix.txt: $!";

    print "normal count: $normalCount\n";
    print "tumor count: $tumorCount\n";
    
    合并结果.png

    ④ID转换,转换成基因名

    输入文件

    输入文件.png

    perl 代码如下

    use strict;
    use warnings;

    # Replace the feature ids in the first column of mRNAmatrix.txt with gene
    # names taken from a GTF annotation, writing the result to symbol.txt.
    # Rows whose id has no gene name in the GTF are dropped.

    my $gtfFile = "human.gtf";
    my $expFile = "mRNAmatrix.txt";
    my $outFile = "symbol.txt";

    # gene_id => gene_name, taken from every GTF line that carries gene_id,
    # gene_name and gene_biotype attributes (in that order).
    my %hash = ();
    open(my $gtfFh, '<', $gtfFile) or die "Cannot open $gtfFile: $!";
    while (my $line = <$gtfFh>) {
        chomp($line);
        if ($line =~ /gene_id \"(.+?)\"\;.+gene_name "(.+?)"\;.+gene_biotype \"(.+?)\"\;/) {
            $hash{$1} = $2;
        }
    }
    close($gtfFh);

    open(my $expFh, '<', $expFile) or die "Cannot open $expFile: $!";
    open(my $outFh, '>', $outFile) or die "Cannot write $outFile: $!";

    # The first line is the sample header and is copied through unchanged.
    my $header = <$expFh>;
    print $outFh $header if defined $header;

    while (my $line = <$expFh>) {
        chomp($line);
        my @arr = split(/\t/, $line);
        next unless @arr;    # blank line: nothing to convert

        # Drop everything from the last "." on (the id's version suffix) so
        # the id matches the gene_id form captured from the GTF.
        $arr[0] =~ s/(.+)\..+/$1/g;
        if (exists $hash{$arr[0]}) {
            $arr[0] = $hash{$arr[0]};
            print $outFh join("\t", @arr) . "\n";
        }
    }
    # Buffered write errors only show up at close, so check it.
    close($outFh) or die "Error closing $outFile: $!";
    close($expFh);
    
    转换结果.png

    二、TCGA临床数据下载及整理

    ①先清空cart

    Cases.png
    Files.png

    ②添加到cart,下载临床数据

    20210729205432.png

    ③提取临床信息

    生存状态0为存活,1为死亡

    结果图.png

    perl代码如下

    use strict;
    use warnings;
    # Missing clinical fields are routine in these XML files; undefined and
    # empty values are deliberately compared and printed as-is below.
    no warnings qw(uninitialized numeric);

    use XML::Simple;

    # Scan every sub-directory of the current directory for *.xml clinical
    # files and write one row per file to clinical.xls (tab-separated):
    #   Id, futime, fustat (0 = alive, 1 = dead), Age, Gender, Grade,
    #   pathologic Stage, T, M, N.
    # futime/fustat come from whichever record (patient level or any
    # follow-up) reports the largest number of days.

    # Return the survival record ("days\tstatus") to keep: the candidate built
    # from the given vital status when its day count is strictly greater than
    # the one in $current, otherwise $current unchanged.
    sub later_survival {
        my ($current, $vital_status, $followup, $death) = @_;
        my @survivalArr = split(/\t/, $current);
        if ($vital_status eq 'Alive') {
            return "$followup\t0" if $followup > $survivalArr[0];
        }
        else {
            return "$death\t1" if $death > $survivalArr[0];
        }
        return $current;
    }

    opendir(my $topDh, ".") or die $!;
    my @dirs = readdir($topDh);
    closedir($topDh);

    open(my $outFh, '>', "clinical.xls") or die $!;
    print $outFh "Id\tfutime\tfustat\tAge\tGender\tGrade\tStage\tT\tM\tN\n";

    # One parser object is enough; it is reused for every file.
    my $userxs = XML::Simple->new(KeyAttr => "name");

    foreach my $dir (@dirs) {
        next if $dir eq '.' or $dir eq '..';
        next unless -d $dir;

        opendir(my $subDh, "$dir") or die $!;
        while (defined(my $xmlfile = readdir($subDh))) {
            next unless $xmlfile =~ /\.xml$/;

            # Fall back to a backslash-separated path when the forward-slash
            # form is not a plain file.  The backslash is doubled so that
            # $xmlfile is still interpolated.
            my $xmlPath = (-f "$dir/$xmlfile") ? "$dir/$xmlfile" : "$dir\\$xmlfile";
            my $userxml = $userxs->XMLin($xmlPath);

            # The disease code selects the namespace prefix of the patient
            # element, e.g. "ucec:patient".
            my $disease_code = $userxml->{'admin:admin'}{'admin:disease_code'}{'content'};
            my $disease_code_lc = lc($disease_code);
            my $patient_key = $disease_code_lc . ':patient';
            my $follow_key = $disease_code_lc . ':follow_ups';
            my $patient = $userxml->{$patient_key};

            my $patient_barcode = $patient->{'shared:bcr_patient_barcode'}{'content'};                  # e.g. TCGA-AX-A1CJ
            my $gender = $patient->{'shared:gender'}{'content'};                                        # male/female
            my $age = $patient->{'clin_shared:age_at_initial_pathologic_diagnosis'}{'content'};
            my $grade = $patient->{'shared:neoplasm_histologic_grade'}{'content'};                      # G1/G2/G3

            my $stage_event = $patient->{'shared_stage:stage_event'};
            my $pathologic_stage = $stage_event->{'shared_stage:pathologic_stage'}{'content'};          # e.g. stage I
            my $pathologic_categories = $stage_event->{'shared_stage:tnm_categories'}{'shared_stage:pathologic_categories'};
            my $pathologic_T = $pathologic_categories->{'shared_stage:pathologic_T'}{'content'};
            my $pathologic_M = $pathologic_categories->{'shared_stage:pathologic_M'}{'content'};
            my $pathologic_N = $pathologic_categories->{'shared_stage:pathologic_N'}{'content'};

            # Absent fields are reported with the literal placeholder "unknow".
            for my $field ($gender, $age, $grade, $pathologic_stage,
                           $pathologic_T, $pathologic_M, $pathologic_N) {
                $field = "unknow" unless defined $field;
            }

            # Start from the patient-level record ...
            my $vital_status = $patient->{'clin_shared:vital_status'}{'content'};
            my $followup = $patient->{'clin_shared:days_to_last_followup'}{'content'};
            my $death = $patient->{'clin_shared:days_to_death'}{'content'};
            my $survivalTime = ($vital_status eq 'Alive') ? "$followup\t0" : "$death\t1";

            # ... then let any follow-up reporting more days replace it.  A
            # follow-up entry is a hash when there is one record and an array
            # of hashes when there are several.
            for my $i (keys %{ $patient->{$follow_key} }) {
                my $entry = $patient->{$follow_key}{$i};
                my @records = ref($entry) eq 'ARRAY' ? @{$entry}
                            : ref($entry) eq 'HASH'  ? ($entry)
                            :                          ();
                my $is_list = ref($entry) eq 'ARRAY';

                for my $record (@records) {
                    next unless ref($record) eq 'HASH';
                    my $followup_for = $record->{'clin_shared:days_to_last_followup'}{'content'};
                    my $vital_status_for = $record->{'clin_shared:vital_status'}{'content'};
                    my $death_for = $record->{'clin_shared:days_to_death'}{'content'};

                    # In a list of follow-ups, ignore records with no day count.
                    if ($is_list) {
                        next unless ($followup_for =~ /\d+/) || ($death_for =~ /\d+/);
                    }
                    $survivalTime = later_survival($survivalTime, $vital_status_for, $followup_for, $death_for);
                }
            }

            print $outFh "$patient_barcode\t$survivalTime\t$age\t$gender\t$grade\t$pathologic_stage\t$pathologic_T\t$pathologic_M\t$pathologic_N\n";
        }
        # A directory handle must be released with closedir, not close.
        closedir($subDh);
    }
    close($outFh) or die $!;
    

    相关文章

      网友评论

        本文标题:TCGA转录组数据及临床数据下载及整理

        本文链接:https://www.haomeiwen.com/subject/xoqgmltx.html