美文网首页
TMB_一点分析

TMB_一点分析

作者: 蚂蚁爱吃饭 | 来源:发表于2021-05-31 18:52 被阅读0次

    要分析TMB标准品数据,下载了MC3 maf文件,按照说明文档进行样本/位点的过滤,反正就是滤不出文档里最后滤出的数,我保留的要多一些。

    #!/usr/bin/perl
    use strict;
    use warnings;
    
    my $file=shift;my %ha;my %delete;my %keep;my %arti;
    
    my $bed="/project/baowenjuan/bed_files/652Genes_Panel/V0/gene_panel.bed";
    my %cds;my %panel;my %keep_items;
    open IN,"<$bed";
    while (<IN>) {
        chomp;
        my @a=split;
        for (my $i=$a[1]-5;$i<$a[2]+5;$i++) {
            $cds{$a[0]}{$i}=1;
        }
    }
    close IN;
    
    open O1,">filtered_maf.stat";
    open O2,">filtered_maf";
    open IN,"<$file";
    while (<IN>) {
        chomp;
        my $line=$_;
        next if (/^Hugo_Symbol/);
        my @a=split "\t",$line;
        my $id_t=(split "-",$a[15])[2];
        my $id_c=(split "-",$a[16])[2];
        next unless ($id_t eq $id_c);
        if ($a[-6]=~/native_wga_mix|wga|gapfiller|nonpreferredpair/){$delete{$id_t}=1;$arti{$id_t}++;$ha{$id_t}{'total_count'}++;next};
        if ($a[-6]=~/StrandBias|common_in_exac|oxog/){$arti{$id_t}++;$ha{$id_t}{'total_count'}++};
        unless ($a[-15] eq '.' || $a[-15] <0.01){$arti{$id_t}++;$ha{$id_t}{'total_count'}++;next};#ExAC_AF
        unless ($a[-12] eq '.' || $a[-12] <0.01){$arti{$id_t}++;$ha{$id_t}{'total_count'}++;next};#ExAC_AF_EAS
        if ($a[41]<3||$a[39]<25){$arti{$id_t}++;$ha{$id_t}{'total_count'}++;next};#DP >= 25  and AD >= 3
        if ($a[42]<8){$arti{$id_t}++;$ha{$id_t}{'total_count'}++;next};#n_depth>=8
        unless (($a[8] eq 'Frame_Shift_Ins') || ($a[8] eq 'Frame_Shift_Del') ||($a[8] eq 'In_Frame_Del') ||($a[8] eq 'In_Frame_Ins') ||($a[8] eq 'Missense_Mutation') ||($a[8] eq 'Nonsense_Mutation') ||($a[8] eq 'Nonstop_Mutation') ||($a[8] eq 'Silent') ||($a[8] eq 'Splice_Site')||($a[8] eq 'Translation_Start_Site')){$ha{$id_t}{'total_count'}++;next};
        my $alt_freq=$a[41]/$a[39];if ($alt_freq>0.1) {$keep{$id_t}=1}#MSAF >= 10%
        push (@{$ha{$id_t}{'freq'}},$alt_freq);
        $ha{$id_t}{'total_count'}++;
        $keep_items{$a[4]}{$a[5]}=1;
        my $chr="chr".$a[4];
        next unless (defined $cds{$chr}{$a[5]});
        push (@{$panel{$id_t}{'freq'}},$alt_freq);
    
    }
    close IN;
    
    open IN,"<$file";
    while (<IN>) {
        chomp;
        if (/Hugo_Symbol/){print O2 "$_\n";next};
        my @a=split "\t",$_;
        my $id_t=(split "-",$a[15])[2];
        next if (defined $arti{$id_t} && ($arti{$id_t}/$ha{$id_t}{'total_count'}>0.5));
        print O2 "$_\n" if (defined $keep_items{$a[4]}{$a[5]} && defined $keep{$id_t});
    }
    close O2;
    close IN;
    
    print O1 "sample_ID\t1\%\t3\%\t5\%\tMyPanel_1%\tMyPanel_3%\tMyPanel_5%\n";
    foreach my $k1 (sort keys %ha) {
        next if (defined $delete{$k1});
        next unless (defined $keep{$k1});
        next if (defined $arti{$k1} && ($arti{$k1}/$ha{$k1}{'total_count'}>0.5));
        my ($f01,$f03,$f05)=check_freq_distribution(\@{$ha{$k1}{'freq'}});
        print O1 "$k1\t$f01\t$f03\t$f05\t";
        ($f01,$f03,$f05)=check_freq_distribution(\@{$panel{$k1}{'freq'}});
        print O1 "$f01\t$f03\t$f05\n";
    }
    
    sub check_freq_distribution{
        my $freqs=$_[0];
        my ($sf01,$sf03,$sf05)=(0,0,0);
        my $c=0;
        while ($c<@$freqs) {
            if ($freqs->[$c] >0.01) {$sf01++}
            if ($freqs->[$c] >0.03) {$sf03++}
            if ($freqs->[$c] >0.05) {$sf05++}
            $c++;
        }
        return ($sf01,$sf03,$sf05);
    }
    

    先把各个样本的各个频率的突变个数统计输出来,以及我们自己的panel(652个基因)包含的突变:


    过滤后的统计文件

    用R的lm()分析下:

    d<-read.table("filtered_maf.stat",header=T)
    #这个是原始的突变个数,还要对panel的大小做一个每Mb的折算哈,涉及到panel的一些敏感信息,就不放代码了
    > head(d,2)
      sample_ID       S01       S03       S05     My01     My03     My05
    1      0003 1.3559683 1.3559683 1.3559683 3.939136 3.939136 3.939136
    2      0033 0.8698664 0.8698664 0.8698664 5.514790 5.514790 5.514790
    lm.model<-lm(d[,5]~d[,2])  #先看下1%的
    summary(lm.model)
    Call:
    lm(formula = d[, 5] ~ d[, 2])
    Residuals:
        Min      1Q  Median      3Q     Max
    -46.427  -1.500  -0.452   0.998  31.987
    Coefficients:
                Estimate Std. Error t value Pr(>|t|)
    (Intercept) 1.610484   0.036389   44.26   <2e-16 ***
    d[, 2]      1.121099   0.001431  783.55   <2e-16 ***
    ---
    Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
    Residual standard error: 3.194 on 8257 degrees of freedom
    Multiple R-squared:  0.9867,    Adjusted R-squared:  0.9867
    F-statistic: 6.139e+05 on 1 and 8257 DF,  p-value: < 2.2e-16
    > coefficients(lm.model)
    (Intercept)      d[, 2]
       1.610484    1.121099
    

    就是My01=1.121099*S01+1.610484

    换个python瞅瞅:

    import pandas as pd
    from sklearn import linear_model
    data=pd.read_table("/project/baowenjuan/Tumor_FFPE/project/TMB/raw/TMB.txt",header=0,sep=" ")
    def trainModel(trainData,features,labels):
        model=linear_model.LinearRegression()
        model.fit(trainData[feature],trainData[label])
        return model
    def evaluateModel(model,testData,features,labels):
        error=np.mean((model.predict(testData[features])-testData[labels])**2)
        score=model.score(testData[features],testData[labels])
        return error,score
    features=["My01"]
    labels=["S01"]
    traindata=data[:500]
    testdata=data[501:700]
    model=trainModel(traindata,features,labels)
    error,score=evaluateModel(model,testdata,features,labels)
    >>> error
    S01    6.058423
    dtype: float64
    >>> score
    0.9233057201839856
    

    这score还是蛮高的。用统计方法看看:

    import statsmodels.api as sm
    features=["S01"]                
    labels=["My01"]                   
    Y=traindata[labels]                   
    X=sm.add_constant(traindata[features])
    model=sm.OLS(Y,X)
    re=model.fit()
    re.summary()
                                 OLS Regression Results
    ==============================================================================
    Dep. Variable:                   My01   R-squared:                       0.971
    Model:                            OLS   Adj. R-squared:                  0.971
    Method:                 Least Squares   F-statistic:                 1.664e+04
    Date:                Mon, 31 May 2021   Prob (F-statistic):               0.00
    Time:                        18:48:33   Log-Likelihood:                -1266.1
    No. Observations:                 500   AIC:                             2536.
    Df Residuals:                     498   BIC:                             2545.
    Df Model:                           1
    Covariance Type:            nonrobust
    ==============================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
    ------------------------------------------------------------------------------
    const          1.2910      0.147      8.779      0.000       1.002       1.580
    S01            1.1216      0.009    128.984      0.000       1.104       1.139
    ==============================================================================
    Omnibus:                      201.628   Durbin-Watson:                   1.815
    Prob(Omnibus):                  0.000   Jarque-Bera (JB):             8355.478
    Skew:                           1.008   Prob(JB):                         0.00
    Kurtosis:                      22.925   Cond. No.                         18.2
    ==============================================================================
    
    Notes:
    [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
    

    My01=1.1216*S01 +1.2910
    跟前面有点点差异,可能前面用的整个d,后面用的traindata吧。换全部的数据试试:

     Y=data[labels]
     X=sm.add_constant(data[features])
     model=sm.OLS(Y,X)
     re=model.fit()
     re.summary()
                                OLS Regression Results
    ==============================================================================
    Dep. Variable:                   My01   R-squared:                       0.987
    Model:                            OLS   Adj. R-squared:                  0.987
    Method:                 Least Squares   F-statistic:                 6.139e+05
    Date:                Mon, 31 May 2021   Prob (F-statistic):               0.00
    Time:                        18:51:28   Log-Likelihood:                -21309.
    No. Observations:                8259   AIC:                         4.262e+04
    Df Residuals:                    8257   BIC:                         4.264e+04
    Df Model:                           1
    Covariance Type:            nonrobust
    ==============================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
    ------------------------------------------------------------------------------
    const          1.6105      0.036     44.258      0.000       1.539       1.682
    S01            1.1211      0.001    783.548      0.000       1.118       1.124
    ==============================================================================
    Omnibus:                     2309.128   Durbin-Watson:                   1.873
    Prob(Omnibus):                  0.000   Jarque-Bera (JB):           248732.705
    Skew:                           0.180   Prob(JB):                         0.00
    Kurtosis:                      29.882   Cond. No.                         26.3
    ==============================================================================
    

    就一样啦~

    对maf的一些统计

    >ma<-read.maf('/project/baowenjuan/Tumor_FFPE/project/TMB/raw/filtered_maf', clinicalData = NULL, removeDuplicatedVariants = TRUE, useAll = TRUE, gisticAllLesionsFile = NULL, gisticAmpGenesFile = NULL, gisticDelGenesFile = NULL, gisticScoresFile = NULL, cnLevel = "all", cnTable = NULL, isTCGA = TRUE, vc_nonSyn = NULL, verbose = TRUE)
    >plotmafSummary(maf = ma, rmOutlier = TRUE, addStat = 'median', dashboard = TRUE, titvRaw = FALSE)
    >oncoplot(f_maf, top = 10, genes = NULL, altered = FALSE,
     drawRowBar = TRUE,
    drawColBar = TRUE, includeColBarCN = TRUE, draw_titv = FALSE,
    logColBar = FALSE, clinicalFeatures = NULL, additionalFeature = NULL, additionalFeaturePch = 20,
    additionalFeatureCol = "white", additionalFeatureCex = 0.9,
    annotationDat = NULL, annotationColor = NULL, genesToIgnore = NULL,
    showTumorSampleBarcodes = FALSE, barcode_mar = 4, gene_mar = 5,
    removeNonMutated = TRUE, fill = TRUE, cohortSize = NULL,
    colors = NULL, sortByMutation = FALSE, sortByAnnotation = FALSE,
    numericAnnoCol = NULL, groupAnnotationBySize = TRUE,
    annotationOrder = NULL, keepGeneOrder = FALSE,
    GeneOrderSort = TRUE, sampleOrder = NULL, writeMatrix = FALSE,
    sepwd_genes = 0.5, sepwd_samples = 0.25, fontSize = 0.8,
    SampleNamefontSize = 1, showTitle = TRUE, titleFontSize = 1.5,
    legendFontSize = 1.2, annotationFontSize = 1.2, bgCol = "#CCCCCC",
    borderCol = "white", colbar_pathway = FALSE)
    >somaticInteractions(maf = f_maf, top = 15, pvalue = c(0.05, 0.1))
          gene1 gene2        pValue oddsRatio   00   11   01   10        Event
      1:  MUC16   TTN 1.160797e-210  5.767656 5235 1068 1531  633 Co_Occurence
      2:    TTN CSMD3 1.118665e-173  6.390472 5501  777  367 1822 Co_Occurence
      3:  MUC16 LRP1B 3.093874e-165  6.714557 6263  596  503 1105 Co_Occurence
      4:  LRP1B   TTN 3.359613e-162  6.166544 5511  742 1857  357 Co_Occurence
      5:   RYR2   TTN 8.179908e-154  6.078670 5527  709 1890  341 Co_Occurence
     ---
    101:  KMT2D  TP53  8.326753e-08  1.503389 4976  358 2693  440 Co_Occurence
    102: PIK3CA USH2A  8.639989e-08  1.644043 6574  181  761  951 Co_Occurence
    103: PIK3CA LRP1B  1.196528e-07  1.593170 6441  205  894  927 Co_Occurence
    104: PIK3CA CSMD3  1.766406e-06  1.517069 6397  206  938  926 Co_Occurence
    105: PIK3CA  TP53  5.060652e-01  1.045442 4702  418 2633  714 Co_Occurence
                  pair event_ratio
      1:    MUC16, TTN   1068/2164
      2:    CSMD3, TTN    777/2189
      3:  LRP1B, MUC16    596/1608
      4:    LRP1B, TTN    742/2214
      5:     RYR2, TTN    709/2231
     ---
    101:   KMT2D, TP53    358/3133
    102: PIK3CA, USH2A    181/1712
    103: LRP1B, PIK3CA    205/1821
    104: CSMD3, PIK3CA    206/1864
    105:  PIK3CA, TP53    418/3347
    
    > library(BSgenome.Hsapiens.UCSC.hg19, quietly = TRUE) 
    >maf.tnm <- trinucleotideMatrix(maf=f_maf,ref_genome="BSgenome.Hsapiens.UCSC.hg19",useSyn=TRUE,prefix="chr")
    ref_genome=“BSgenome.Hsapiens.UCSC.hg19” :为刚才导入的hg19包
    prefix=“chr”:从MAF文件中添加或移除这个字符。因为UCSC包以及输入的maf文件都有chr,故此处不设置
    add=TRUE:针对上面的prefix,决定是添加(TRUE)还是删除(FALSE)
    ignoreChr=c(“chrM”):忽略某染色体
    useSyn=TRUE:是否包含同义突变
    fn="" 生成的APOBEC结果文件前缀,默认NULL不生成该文件
    
    >maf.sign = estimateSignatures(mat = maf.tnm, nTry = 6)  #根据共遗传相关矩阵估计signature数, 这一步超级慢,跑几个小时
    
    -Running NMF for 6 ranks
    Compute NMF rank= 2  ...
    + measures ... OK
    Compute NMF rank= 3  ... + measures ... OK
    Compute NMF rank= 4  ... + measures ... OK
    Compute NMF rank= 5  ... + measures ... OK
    Compute NMF rank= 6  ... + measures ... OK
    -Finished in 03:03:22 elapsed (00:05:36 cpu)
    
    > plotCophenetic(res = maf.sign)
    > maf.sig = extractSignatures(mat = maf.tnm, n = 4)
    -Running NMF for factorization rank: 4
    -Finished in00:15:19 elapsed (00:12:31 cpu)
    >plotSignatures(nmfRes = maf.sig, title_size = 2)
    
    > plotApobecDiff(tnm=maf.tnm, maf=f_maf)
    --Possible FLAGS among top ten genes:
      TTN
      MUC16
      SYNE1
      FLG
    -Processing clinical data
    --Possible FLAGS among top ten genes:
      TTN
      MUC16
      SYNE1
      FLG
    -Processing clinical data
    Showing top 10 of 2189 differentially mutated genes
    $results
           Hugo_Symbol Enriched nonEnriched         pval        or     ci.up
        1:      PIK3CA      363         769 2.647811e-32 2.3935692 2.7568263
        2:        IDH1       23         481 2.363612e-21 0.1980938 0.3021205
        3:        TP53      696        2355 7.926231e-14 1.5335509 1.7167154
        4:       KDM6A      101         167 8.269990e-14 2.7672310 3.5883079
        5:      NFE2L2       89         144 1.070473e-12 2.8145700 3.7149450
       ---
    19300:       ZNF92       11          52 1.000000e+00 0.9277923 1.8050449
    19301:       ZNRD1        1           7 1.000000e+00 0.6266738 4.8834059
    19302:     ZSCAN25       10          45 1.000000e+00 0.9749828 1.9678563
    19303:      ZSWIM7        1           7 1.000000e+00 0.6266738 4.8834059
    19304:       ZUFSP       12          57 1.000000e+00 0.9232667 1.7454259
               ci.low      adjPval
        1: 2.07614044 5.111334e-28
        2: 0.12393799 2.281358e-17
        3: 1.36956569 3.991097e-10
        4: 2.12556645 3.991097e-10
        5: 2.12305145 4.132883e-09
       ---
    19300: 0.43547407 1.000000e+00
    19301: 0.01389871 1.000000e+00
    19302: 0.43712841 1.000000e+00
    19303: 0.01389871 1.000000e+00
    19304: 0.44973887 1.000000e+00
    
    $SampleSummary
            Cohort SampleSize    Mean Median
    1:    Enriched       1569 162.347    103
    2: nonEnriched       6885 207.521     48
    
    > library("ggpmisc")
    > library("ggplot2")
    > library("ggpubr")
    > d<-read.table("filtered_maf.stat",header=T)
    >  d[,c(2,3,4)] <-d[,c(2,3,4)]/39086460*1000000
    > d[,c(5,6,7)] <-d[,c(5,6,7)]/1331534*1000000
    > colnames(d) <-c("sampleID","a","b","c","d","e","f")  #因为%不识别
    >  head(d,3)
      sampleID         a         b         c        d        e        f
    1     0003 1.3559683 1.3559683 1.3559683 3.939136 3.939136 3.939136
    2     0033 0.8698664 0.8698664 0.8698664 5.514790 5.514790 5.514790
    3     0047 1.9444073 1.9444073 1.8164858 3.151309 3.151309 3.151309
    > ggplot(d,aes(x=c,y=f))+geom_point(shape=17)+geom_smooth(method="lm",color="black",fill="lightgrey")+stat_cor(method="pearson",label.x=3,label.y=30)+stat_poly_eq(aes(label=..eq.label..),formula="y~x",parse=TRUE,geom="text",label.x = 150,label.y = 200, hjust = 0)+ stat_fit_tb(tb.type = 'fit.anova')+xlab("Maf_Panel")+ylab("652GenePanel")
    > ggplot(d,aes(x=c,y=f))+geom_point(shape=17)+geom_smooth(method="lm",color="black",fill="lightgrey")+stat_cor(method="pearson",label.x=40,label.y=370)+stat_poly_eq(aes(label=..eq.label..),formula="y~x",parse=TRUE,geom="text",label.x = 40,label.y = 350, hjust = 0)+xlab("Maf_Panel")+ylab("652GenePanel")+theme(axis.text.x=element_text(angle=45,hjust = 1,colour="black",family="Times",size=10),panel.grid.major = element_blank(),panel.grid.minor = element_blank(),panel.background = element_blank(),axis.line = element_line(colour = "black"),plot.margin = unit(c(2,3,3,4),"cm"))
    
    TMB分布
    #把sampleID加上
    > sample<-read.table("/project/baowenjuan/Tumor_FFPE/project/TMB/raw/Panel652-1/Gene_270_455728_P1.txt",header=TRUE)
    > rownames(mer11)<-sample[,1]
    > head(mer11,2)
         90       70       690      670       650      630       610 590     570
    0003  0 0.000000 0.8171377 2.527024 0.8724953 1.782318 0.9383425   0 1.04236
    0033  0 7.054425 0.8171377 1.684683 0.8724953 0.000000 1.8766849   1 0.00000
    
    #读入原始maf统计的TMB值,这里还没对TMB做panel小大折算的
    > mar<-read.table("/project/baowenjuan/Tumor_FFPE/project/TMB/raw/filtered_maf.stat",header=TRUE)
    > colnames(mar)<-c("sample_ID","p1","p3","p5","mp1","mp3","mp5")
    > mar[,2:4]<-mar[,2:4]/39086460*1000000    #39086460为全外panel大小
    > mar[,5:7]<-mar[,5:7]/1331534*1000000   #1331534是652panel的大小
    > head(mar,3)
      sample_ID        p1        p3        p5      mp1      mp3      mp5
    1      0003 1.3559683 1.3559683 1.3559683 3.755067 3.755067 3.755067
    2      0033 0.8698664 0.8698664 0.8698664 5.257094 5.257094 5.257094
    3      0047 1.9444073 1.9444073 1.8164858 3.004054 3.004054 3.004054
    
    > rownames(mar)<-mar[,1]
    > mar<-mar[-1]
    > head(mar,3)
                p1        p3        p5      mp1      mp3      mp5
    0003 1.3559683 1.3559683 1.3559683 3.755067 3.755067 3.755067
    0033 0.8698664 0.8698664 0.8698664 5.257094 5.257094 5.257094
    0047 1.9444073 1.9444073 1.8164858 3.004054 3.004054 3.004054
    
    #将两个矩阵按照样本名合并为一个
    > cm<-merge(mer11,mar,by="row.names",all=TRUE)
    > head(cm,3)
      Row.names 90       70       690      670       650      630       610 590
    1      0003  0 0.000000 0.8171377 2.527024 0.8724953 1.782318 0.9383425   0
    2      0033  0 7.054425 0.8171377 1.684683 0.8724953 0.000000 1.8766849   1
    3      0047  0 0.000000 2.4514130 3.369366 0.8724953 4.455796 4.6917123   3
           570      550      530      510 50      490      470      450      430
    1 1.042360 1.161071 3.177354 0.000000  0 3.486058 1.215446 2.428767 1.380171
    2 0.000000 1.161071 1.059118 2.176499  0 3.486058 0.000000 2.428767 0.000000
    3 3.127081 0.000000 2.118236 5.441247  0 4.648077 3.646339 2.428767 1.380171
           410      390      370 350      330      310      290      270      250
    1 1.474194 0.000000 0.000000   0 1.652977 2.054021 1.952057 4.388583 2.164957
    2 1.474194 1.408377 1.639110   0 1.652977 0.000000 1.952057 0.000000 2.164957
    3 4.422581 1.408377 3.278221   0 1.652977 0.000000 1.952057 4.388583 0.000000
           230      210 190 170      150    130      110        p1        p3
    1 0.000000 2.894423   0   0 8.029226 8.5164 4.929071 1.3559683 1.3559683
    2 2.627976 0.000000   0   0 4.014613 4.2582 0.000000 0.8698664 0.8698664
    3 5.255952 0.000000   0   0 4.014613 0.0000 0.000000 1.9444073 1.9444073
             p5      mp1      mp3      mp5
    1 1.3559683 3.755067 3.755067 3.755067
    2 0.8698664 5.257094 5.257094 5.257094
    3 1.8164858 3.004054 3.004054 3.004054
    
    #合并后有的行里面可能有NA,去掉这些行
    > cm1<-na.omit(cm)
    #只选择200行
    > cm2<-cm1[1:200,]
    > cm2<-cm1[-1]
    > rownames(cm2)<-cm1[,1]
    > head(cm2,2)
         90       70       690      670       650      630       610 590     570
    0003  0 0.000000 0.8171377 2.527024 0.8724953 1.782318 0.9383425   0 1.04236
    0033  0 7.054425 0.8171377 1.684683 0.8724953 0.000000 1.8766849   1 0.00000
    
    #计算每一列(1:33列)与34列的相关性系数
    > co={}
    > for(i in 1:33){ne_co=cor(cm2[,i],cm2[,34]){co<-cbind(co,ne_co)}
    
    #循环跑50次筛选
    > co={}
    
    for (i in 1:50){
    choose<-sample(seq(1,7949,1),200)
    cm2<-cm1[choose,]
    for (j in 2:34){
    ne_co=cor(cm2[,j],cm2[,35])
    ne_co<-as.vector(ne_co)
    co<-cbind(co,ne_co)
    }
    }
    #奇怪,输出的不是矩阵,都bind到一行了,手动改为矩阵
    > d<-matrix(co,nrow=33,ncol=50)
    > colnames(d)<-paste("S_",seq(1,50,1,sep=""))
    > rownames(d)<-colnames(cm)[2:34]
    > head(d,3)
              S_1       S_2       S_3       S_4       S_5       S_6       S_7
    90  0.9915502 0.9948556 0.9036831 0.8072887 0.9066141 0.8463588 0.8704763
    70  0.9827493 0.9849512 0.7761056 0.7818305 0.7954177 0.7378713 0.8088688
    690 0.9987495 0.9986940 0.9849712 0.9676668 0.9768251 0.9773340 0.9801521
    
    > d1<-reshape2::melt(d)
    > head(d1,3)
      Var1 Var2     value
    1   90  S_1 0.9915502
    2   70  S_1 0.9827493
    3  690  S_1 0.9987495
    
    #画图
    > ggboxplot(d1,x='Var1',y='value',group='Var2',color="black",add="jitter",xlab="Gene Size",ylab="Cor",title="TMB(>1%) correlation with diff gene size",ggtheme=theme_pubr())+theme(axis.text.x=element_text(angle=45,hjust = 1,colour="black",family="Times",size=10),panel.grid.major = element_blank(),panel.grid.minor = element_blank(),panel.background = element_blank(),axis.line = element_line(colour = "black"),plot.margin = unit(c(2,2,2,2),"cm"))
    
    TMB_GeneNumber
    #查看成对样本采用WES和652panel的TMB值:
    ds<- d[sample(seq(1,dim(d)[1],1),50),]   #随机选择50行
    ds1<-ds1<-ds[,c(1,2,5)]   #选择1%频率的
    > ds1
         sampleID          a          d
    2986     A14P  2.1746661  3.0040540
    1842     7147  9.7220367 11.2652024
    3371     A1N2  3.5818030  4.5060810
    4761     A4A9  5.7053005  9.0121619
    > colnames(ds1)<-c("sampleID","WES","652GenePanel")
    > ggpaired(ds1,cond1='WES',cond2='652GenePanel',color='sampleID',width=0.1,line.size=0.5,line.color="grey",repel=T,title="WES vs 652GenePanel (50 samples show)",ylab="TMB value",xlab="")+theme(plot.margin = unit(c(2,2,2,2),"cm"),legend.position = "none")
    
    #随机采集200行,做一致性分析
    > dim(d)
    [1] 8411    7
    
    > co={}
    for (i in 1:50){
    ds<- d[sample(seq(1,dim(d)[1],1),200),]
    new_co<-cor(ds[,2],ds[,5])
    co<-cbind(co,new_co)
    }
    
    50个样本WES和652Panel-TMB比较

    相关文章

      网友评论

          本文标题:TMB_一点分析

          本文链接:https://www.haomeiwen.com/subject/veibsltx.html