要分析TMB标准品数据,下载了MC3 maf文件,按照说明文档进行样本/位点的过滤,反正就是滤不出文档里最后滤出的数,我保留的要多一些。
#!/usr/bin/perl
use strict;
use warnings;
my $file=shift;my %ha;my %delete;my %keep;my %arti;
my $bed="/project/baowenjuan/bed_files/652Genes_Panel/V0/gene_panel.bed";
my %cds;my %panel;my %keep_items;
open IN,"<$bed";
while (<IN>) {
chomp;
my @a=split;
for (my $i=$a[1]-5;$i<$a[2]+5;$i++) {
$cds{$a[0]}{$i}=1;
}
}
close IN;
open O1,">filtered_maf.stat";
open O2,">filtered_maf";
open IN,"<$file";
while (<IN>) {
chomp;
my $line=$_;
next if (/^Hugo_Symbol/);
my @a=split "\t",$line;
my $id_t=(split "-",$a[15])[2];
my $id_c=(split "-",$a[16])[2];
next unless ($id_t eq $id_c);
if ($a[-6]=~/native_wga_mix|wga|gapfiller|nonpreferredpair/){$delete{$id_t}=1;$arti{$id_t}++;$ha{$id_t}{'total_count'}++;next};
if ($a[-6]=~/StrandBias|common_in_exac|oxog/){$arti{$id_t}++;$ha{$id_t}{'total_count'}++};
unless ($a[-15] eq '.' || $a[-15] <0.01){$arti{$id_t}++;$ha{$id_t}{'total_count'}++;next};#ExAC_AF
unless ($a[-12] eq '.' || $a[-12] <0.01){$arti{$id_t}++;$ha{$id_t}{'total_count'}++;next};#ExAC_AF_EAS
if ($a[41]<3||$a[39]<25){$arti{$id_t}++;$ha{$id_t}{'total_count'}++;next};#DP >= 25 and AD >= 3
if ($a[42]<8){$arti{$id_t}++;$ha{$id_t}{'total_count'}++;next};#n_depth>=8
unless (($a[8] eq 'Frame_Shift_Ins') || ($a[8] eq 'Frame_Shift_Del') ||($a[8] eq 'In_Frame_Del') ||($a[8] eq 'In_Frame_Ins') ||($a[8] eq 'Missense_Mutation') ||($a[8] eq 'Nonsense_Mutation') ||($a[8] eq 'Nonstop_Mutation') ||($a[8] eq 'Silent') ||($a[8] eq 'Splice_Site')||($a[8] eq 'Translation_Start_Site')){$ha{$id_t}{'total_count'}++;next};
my $alt_freq=$a[41]/$a[39];if ($alt_freq>0.1) {$keep{$id_t}=1}#MSAF >= 10%
push (@{$ha{$id_t}{'freq'}},$alt_freq);
$ha{$id_t}{'total_count'}++;
$keep_items{$a[4]}{$a[5]}=1;
my $chr="chr".$a[4];
next unless (defined $cds{$chr}{$a[5]});
push (@{$panel{$id_t}{'freq'}},$alt_freq);
}
close IN;
open IN,"<$file";
while (<IN>) {
chomp;
if (/Hugo_Symbol/){print O2 "$_\n";next};
my @a=split "\t",$_;
my $id_t=(split "-",$a[15])[2];
next if (defined $arti{$id_t} && ($arti{$id_t}/$ha{$id_t}{'total_count'}>0.5));
print O2 "$_\n" if (defined $keep_items{$a[4]}{$a[5]} && defined $keep{$id_t});
}
close O2;
close IN;
print O1 "sample_ID\t1\%\t3\%\t5\%\tMyPanel_1%\tMyPanel_3%\tMyPanel_5%\n";
foreach my $k1 (sort keys %ha) {
next if (defined $delete{$k1});
next unless (defined $keep{$k1});
next if (defined $arti{$k1} && ($arti{$k1}/$ha{$k1}{'total_count'}>0.5));
my ($f01,$f03,$f05)=check_freq_distribution(\@{$ha{$k1}{'freq'}});
print O1 "$k1\t$f01\t$f03\t$f05\t";
($f01,$f03,$f05)=check_freq_distribution(\@{$panel{$k1}{'freq'}});
print O1 "$f01\t$f03\t$f05\n";
}
sub check_freq_distribution{
my $freqs=$_[0];
my ($sf01,$sf03,$sf05)=(0,0,0);
my $c=0;
while ($c<@$freqs) {
if ($freqs->[$c] >0.01) {$sf01++}
if ($freqs->[$c] >0.03) {$sf03++}
if ($freqs->[$c] >0.05) {$sf05++}
$c++;
}
return ($sf01,$sf03,$sf05);
}
先把各个样本的各个频率的突变个数统计输出来,以及我们自己的panel(652个基因)包含的突变:
过滤后的统计文件
用R的lm()分析下:
d<-read.table("filtered_maf.stat",header=T)
#这个是原始的突变个数,还要对panel的大小做一个每Mb的折算哈,涉及到panel的一些敏感信息,就不放代码了
> head(d,2)
sample_ID S01 S03 S05 My01 My03 My05
1 0003 1.3559683 1.3559683 1.3559683 3.939136 3.939136 3.939136
2 0033 0.8698664 0.8698664 0.8698664 5.514790 5.514790 5.514790
lm.model<-lm(d[,5]~d[,2]) #先看下1%的
summary(lm.model)
Call:
lm(formula = d[, 5] ~ d[, 2])
Residuals:
Min 1Q Median 3Q Max
-46.427 -1.500 -0.452 0.998 31.987
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.610484 0.036389 44.26 <2e-16 ***
d[, 2] 1.121099 0.001431 783.55 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.194 on 8257 degrees of freedom
Multiple R-squared: 0.9867, Adjusted R-squared: 0.9867
F-statistic: 6.139e+05 on 1 and 8257 DF, p-value: < 2.2e-16
> coefficients(lm.model)
(Intercept) d[, 2]
1.610484 1.121099
就是My01=1.121099*S01+1.610484
换个python瞅瞅:
import pandas as pd
from sklearn import linear_model
data=pd.read_table("/project/baowenjuan/Tumor_FFPE/project/TMB/raw/TMB.txt",header=0,sep=" ")
def trainModel(trainData,features,labels):
model=linear_model.LinearRegression()
model.fit(trainData[feature],trainData[label])
return model
def evaluateModel(model,testData,features,labels):
error=np.mean((model.predict(testData[features])-testData[labels])**2)
score=model.score(testData[features],testData[labels])
return error,score
features=["My01"]
labels=["S01"]
traindata=data[:500]
testdata=data[501:700]
model=trainModel(traindata,features,labels)
error,score=evaluateModel(model,testdata,features,labels)
>>> error
S01 6.058423
dtype: float64
>>> score
0.9233057201839856
这score还是蛮高的。用统计方法看看:
import statsmodels.api as sm
features=["S01"]
labels=["My01"]
Y=traindata[labels]
X=sm.add_constant(traindata[features])
model=sm.OLS(Y,X)
re=model.fit()
re.summary()
OLS Regression Results
==============================================================================
Dep. Variable: My01 R-squared: 0.971
Model: OLS Adj. R-squared: 0.971
Method: Least Squares F-statistic: 1.664e+04
Date: Mon, 31 May 2021 Prob (F-statistic): 0.00
Time: 18:48:33 Log-Likelihood: -1266.1
No. Observations: 500 AIC: 2536.
Df Residuals: 498 BIC: 2545.
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 1.2910 0.147 8.779 0.000 1.002 1.580
S01 1.1216 0.009 128.984 0.000 1.104 1.139
==============================================================================
Omnibus: 201.628 Durbin-Watson: 1.815
Prob(Omnibus): 0.000 Jarque-Bera (JB): 8355.478
Skew: 1.008 Prob(JB): 0.00
Kurtosis: 22.925 Cond. No. 18.2
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
My01=1.1216*S01 +1.2910
跟前面有点点差异,可能前面用的整个d,后面用的traindata吧。换全部的数据试试:
Y=data[labels]
X=sm.add_constant(data[features])
model=sm.OLS(Y,X)
re=model.fit()
re.summary()
OLS Regression Results
==============================================================================
Dep. Variable: My01 R-squared: 0.987
Model: OLS Adj. R-squared: 0.987
Method: Least Squares F-statistic: 6.139e+05
Date: Mon, 31 May 2021 Prob (F-statistic): 0.00
Time: 18:51:28 Log-Likelihood: -21309.
No. Observations: 8259 AIC: 4.262e+04
Df Residuals: 8257 BIC: 4.264e+04
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 1.6105 0.036 44.258 0.000 1.539 1.682
S01 1.1211 0.001 783.548 0.000 1.118 1.124
==============================================================================
Omnibus: 2309.128 Durbin-Watson: 1.873
Prob(Omnibus): 0.000 Jarque-Bera (JB): 248732.705
Skew: 0.180 Prob(JB): 0.00
Kurtosis: 29.882 Cond. No. 26.3
==============================================================================
就一样啦~
对maf的一些统计
>ma<-read.maf('/project/baowenjuan/Tumor_FFPE/project/TMB/raw/filtered_maf', clinicalData = NULL, removeDuplicatedVariants = TRUE, useAll = TRUE, gisticAllLesionsFile = NULL, gisticAmpGenesFile = NULL, gisticDelGenesFile = NULL, gisticScoresFile = NULL, cnLevel = "all", cnTable = NULL, isTCGA = TRUE, vc_nonSyn = NULL, verbose = TRUE)
>plotmafSummary(maf = ma, rmOutlier = TRUE, addStat = 'median', dashboard = TRUE, titvRaw = FALSE)
>oncoplot(f_maf, top = 10, genes = NULL, altered = FALSE,
drawRowBar = TRUE,
drawColBar = TRUE, includeColBarCN = TRUE, draw_titv = FALSE,
logColBar = FALSE, clinicalFeatures = NULL, additionalFeature = NULL, additionalFeaturePch = 20,
additionalFeatureCol = "white", additionalFeatureCex = 0.9,
annotationDat = NULL, annotationColor = NULL, genesToIgnore = NULL,
showTumorSampleBarcodes = FALSE, barcode_mar = 4, gene_mar = 5,
removeNonMutated = TRUE, fill = TRUE, cohortSize = NULL,
colors = NULL, sortByMutation = FALSE, sortByAnnotation = FALSE,
numericAnnoCol = NULL, groupAnnotationBySize = TRUE,
annotationOrder = NULL, keepGeneOrder = FALSE,
GeneOrderSort = TRUE, sampleOrder = NULL, writeMatrix = FALSE,
sepwd_genes = 0.5, sepwd_samples = 0.25, fontSize = 0.8,
SampleNamefontSize = 1, showTitle = TRUE, titleFontSize = 1.5,
legendFontSize = 1.2, annotationFontSize = 1.2, bgCol = "#CCCCCC",
borderCol = "white", colbar_pathway = FALSE)
>somaticInteractions(maf = f_maf, top = 15, pvalue = c(0.05, 0.1))
gene1 gene2 pValue oddsRatio 00 11 01 10 Event
1: MUC16 TTN 1.160797e-210 5.767656 5235 1068 1531 633 Co_Occurence
2: TTN CSMD3 1.118665e-173 6.390472 5501 777 367 1822 Co_Occurence
3: MUC16 LRP1B 3.093874e-165 6.714557 6263 596 503 1105 Co_Occurence
4: LRP1B TTN 3.359613e-162 6.166544 5511 742 1857 357 Co_Occurence
5: RYR2 TTN 8.179908e-154 6.078670 5527 709 1890 341 Co_Occurence
---
101: KMT2D TP53 8.326753e-08 1.503389 4976 358 2693 440 Co_Occurence
102: PIK3CA USH2A 8.639989e-08 1.644043 6574 181 761 951 Co_Occurence
103: PIK3CA LRP1B 1.196528e-07 1.593170 6441 205 894 927 Co_Occurence
104: PIK3CA CSMD3 1.766406e-06 1.517069 6397 206 938 926 Co_Occurence
105: PIK3CA TP53 5.060652e-01 1.045442 4702 418 2633 714 Co_Occurence
pair event_ratio
1: MUC16, TTN 1068/2164
2: CSMD3, TTN 777/2189
3: LRP1B, MUC16 596/1608
4: LRP1B, TTN 742/2214
5: RYR2, TTN 709/2231
---
101: KMT2D, TP53 358/3133
102: PIK3CA, USH2A 181/1712
103: LRP1B, PIK3CA 205/1821
104: CSMD3, PIK3CA 206/1864
105: PIK3CA, TP53 418/3347
> library(BSgenome.Hsapiens.UCSC.hg19, quietly = TRUE)
>maf.tnm <- trinucleotideMatrix(maf=f_maf,ref_genome="BSgenome.Hsapiens.UCSC.hg19",useSyn=TRUE,prefix="chr")
ref_genome=“BSgenome.Hsapiens.UCSC.hg19” :为刚才导入的hg19包
prefix=“chr”:从MAF文件中添加或移除这个字符。因为UCSC包以及输入的maf文件都有chr,故此处不设置
add=TRUE:针对上面的prefix,决定是添加(TRUE)还是删除(FALSE)
ignoreChr=c(“chrM”):忽略某染色体
useSyn=TRUE:是否包含同义突变
fn="" 生成的APOBEC结果文件前缀,默认NULL不生成该文件
>maf.sign = estimateSignatures(mat = maf.tnm, nTry = 6) #根据共遗传相关矩阵估计signature数, 这一步超级慢,跑几个小时
-Running NMF for 6 ranks
Compute NMF rank= 2 ...
+ measures ... OK
Compute NMF rank= 3 ... + measures ... OK
Compute NMF rank= 4 ... + measures ... OK
Compute NMF rank= 5 ... + measures ... OK
Compute NMF rank= 6 ... + measures ... OK
-Finished in 03:03:22 elapsed (00:05:36 cpu)
> plotCophenetic(res = maf.sign)
> maf.sig = extractSignatures(mat = maf.tnm, n = 4)
-Running NMF for factorization rank: 4
-Finished in00:15:19 elapsed (00:12:31 cpu)
>plotSignatures(nmfRes = maf.sig, title_size = 2)
> plotApobecDiff(tnm=maf.tnm, maf=f_maf)
--Possible FLAGS among top ten genes:
TTN
MUC16
SYNE1
FLG
-Processing clinical data
--Possible FLAGS among top ten genes:
TTN
MUC16
SYNE1
FLG
-Processing clinical data
Showing top 10 of 2189 differentially mutated genes
$results
Hugo_Symbol Enriched nonEnriched pval or ci.up
1: PIK3CA 363 769 2.647811e-32 2.3935692 2.7568263
2: IDH1 23 481 2.363612e-21 0.1980938 0.3021205
3: TP53 696 2355 7.926231e-14 1.5335509 1.7167154
4: KDM6A 101 167 8.269990e-14 2.7672310 3.5883079
5: NFE2L2 89 144 1.070473e-12 2.8145700 3.7149450
---
19300: ZNF92 11 52 1.000000e+00 0.9277923 1.8050449
19301: ZNRD1 1 7 1.000000e+00 0.6266738 4.8834059
19302: ZSCAN25 10 45 1.000000e+00 0.9749828 1.9678563
19303: ZSWIM7 1 7 1.000000e+00 0.6266738 4.8834059
19304: ZUFSP 12 57 1.000000e+00 0.9232667 1.7454259
ci.low adjPval
1: 2.07614044 5.111334e-28
2: 0.12393799 2.281358e-17
3: 1.36956569 3.991097e-10
4: 2.12556645 3.991097e-10
5: 2.12305145 4.132883e-09
---
19300: 0.43547407 1.000000e+00
19301: 0.01389871 1.000000e+00
19302: 0.43712841 1.000000e+00
19303: 0.01389871 1.000000e+00
19304: 0.44973887 1.000000e+00
$SampleSummary
Cohort SampleSize Mean Median
1: Enriched 1569 162.347 103
2: nonEnriched 6885 207.521 48
> library("ggpmisc")
> library("ggplot2")
> library("ggpubr")
> d<-read.table("filtered_maf.stat",header=T)
> d[,c(2,3,4)] <-d[,c(2,3,4)]/39086460*1000000
> d[,c(5,6,7)] <-d[,c(5,6,7)]/1331534*1000000
> colnames(d) <-c("sampleID","a","b","c","d","e","f") #因为%不识别
> head(d,3)
sampleID a b c d e f
1 0003 1.3559683 1.3559683 1.3559683 3.939136 3.939136 3.939136
2 0033 0.8698664 0.8698664 0.8698664 5.514790 5.514790 5.514790
3 0047 1.9444073 1.9444073 1.8164858 3.151309 3.151309 3.151309
> ggplot(d,aes(x=c,y=f))+geom_point(shape=17)+geom_smooth(method="lm",color="black",fill="lightgrey")+stat_cor(method="pearson",label.x=3,label.y=30)+stat_poly_eq(aes(label=..eq.label..),formula="y~x",parse=TRUE,geom="text",label.x = 150,label.y = 200, hjust = 0)+ stat_fit_tb(tb.type = 'fit.anova')+xlab("Maf_Panel")+ylab("652GenePanel")
> ggplot(d,aes(x=c,y=f))+geom_point(shape=17)+geom_smooth(method="lm",color="black",fill="lightgrey")+stat_cor(method="pearson",label.x=40,label.y=370)+stat_poly_eq(aes(label=..eq.label..),formula="y~x",parse=TRUE,geom="text",label.x = 40,label.y = 350, hjust = 0)+xlab("Maf_Panel")+ylab("652GenePanel")+theme(axis.text.x=element_text(angle=45,hjust = 1,colour="black",family="Times",size=10),panel.grid.major = element_blank(),panel.grid.minor = element_blank(),panel.background = element_blank(),axis.line = element_line(colour = "black"),plot.margin = unit(c(2,3,3,4),"cm"))
TMB分布
#把sampleID加上
> sample<-read.table("/project/baowenjuan/Tumor_FFPE/project/TMB/raw/Panel652-1/Gene_270_455728_P1.txt",header=TRUE)
> rownames(mer11)<-sample[,1]
> head(mer11,2)
90 70 690 670 650 630 610 590 570
0003 0 0.000000 0.8171377 2.527024 0.8724953 1.782318 0.9383425 0 1.04236
0033 0 7.054425 0.8171377 1.684683 0.8724953 0.000000 1.8766849 1 0.00000
#读入原始maf统计的TMB值,这里还没对TMB做panel小大折算的
> mar<-read.table("/project/baowenjuan/Tumor_FFPE/project/TMB/raw/filtered_maf.stat",header=TRUE)
> colnames(mar)<-c("sample_ID","p1","p3","p5","mp1","mp3","mp5")
> mar[,2:4]<-mar[,2:4]/39086460*1000000 #39086460为全外panel大小
> mar[,5:7]<-mar[,5:7]/1331534*1000000 #1331534是652panel的大小
> head(mar,3)
sample_ID p1 p3 p5 mp1 mp3 mp5
1 0003 1.3559683 1.3559683 1.3559683 3.755067 3.755067 3.755067
2 0033 0.8698664 0.8698664 0.8698664 5.257094 5.257094 5.257094
3 0047 1.9444073 1.9444073 1.8164858 3.004054 3.004054 3.004054
> rownames(mar)<-mar[,1]
> mar<-mar[-1]
> head(mar,3)
p1 p3 p5 mp1 mp3 mp5
0003 1.3559683 1.3559683 1.3559683 3.755067 3.755067 3.755067
0033 0.8698664 0.8698664 0.8698664 5.257094 5.257094 5.257094
0047 1.9444073 1.9444073 1.8164858 3.004054 3.004054 3.004054
#将两个矩阵按照样本名合并为一个
> cm<-merge(mer11,mar,by="row.names",all=TRUE)
> head(cm,3)
Row.names 90 70 690 670 650 630 610 590
1 0003 0 0.000000 0.8171377 2.527024 0.8724953 1.782318 0.9383425 0
2 0033 0 7.054425 0.8171377 1.684683 0.8724953 0.000000 1.8766849 1
3 0047 0 0.000000 2.4514130 3.369366 0.8724953 4.455796 4.6917123 3
570 550 530 510 50 490 470 450 430
1 1.042360 1.161071 3.177354 0.000000 0 3.486058 1.215446 2.428767 1.380171
2 0.000000 1.161071 1.059118 2.176499 0 3.486058 0.000000 2.428767 0.000000
3 3.127081 0.000000 2.118236 5.441247 0 4.648077 3.646339 2.428767 1.380171
410 390 370 350 330 310 290 270 250
1 1.474194 0.000000 0.000000 0 1.652977 2.054021 1.952057 4.388583 2.164957
2 1.474194 1.408377 1.639110 0 1.652977 0.000000 1.952057 0.000000 2.164957
3 4.422581 1.408377 3.278221 0 1.652977 0.000000 1.952057 4.388583 0.000000
230 210 190 170 150 130 110 p1 p3
1 0.000000 2.894423 0 0 8.029226 8.5164 4.929071 1.3559683 1.3559683
2 2.627976 0.000000 0 0 4.014613 4.2582 0.000000 0.8698664 0.8698664
3 5.255952 0.000000 0 0 4.014613 0.0000 0.000000 1.9444073 1.9444073
p5 mp1 mp3 mp5
1 1.3559683 3.755067 3.755067 3.755067
2 0.8698664 5.257094 5.257094 5.257094
3 1.8164858 3.004054 3.004054 3.004054
#合并后有的行里面可能有NA,去掉这些行
> cm1<-na.omit(cm)
#只选择200行
> cm2<-cm1[1:200,]
> cm2<-cm1[-1]
> rownames(cm2)<-cm1[,1]
> head(cm2,2)
90 70 690 670 650 630 610 590 570
0003 0 0.000000 0.8171377 2.527024 0.8724953 1.782318 0.9383425 0 1.04236
0033 0 7.054425 0.8171377 1.684683 0.8724953 0.000000 1.8766849 1 0.00000
#计算每一列(1:33列)与34列的相关性系数
> co={}
> for(i in 1:33){ne_co=cor(cm2[,i],cm2[,34]){co<-cbind(co,ne_co)}
#循环跑50次筛选
> co={}
for (i in 1:50){
choose<-sample(seq(1,7949,1),200)
cm2<-cm1[choose,]
for (j in 2:34){
ne_co=cor(cm2[,j],cm2[,35])
ne_co<-as.vector(ne_co)
co<-cbind(co,ne_co)
}
}
#奇怪,输出的不是矩阵,都bind到一行了,手动改为矩阵
> d<-matrix(co,nrow=33,ncol=50)
> colnames(d)<-paste("S_",seq(1,50,1,sep=""))
> rownames(d)<-colnames(cm)[2:34]
> head(d,3)
S_1 S_2 S_3 S_4 S_5 S_6 S_7
90 0.9915502 0.9948556 0.9036831 0.8072887 0.9066141 0.8463588 0.8704763
70 0.9827493 0.9849512 0.7761056 0.7818305 0.7954177 0.7378713 0.8088688
690 0.9987495 0.9986940 0.9849712 0.9676668 0.9768251 0.9773340 0.9801521
> d1<-reshape2::melt(d)
> head(d1,3)
Var1 Var2 value
1 90 S_1 0.9915502
2 70 S_1 0.9827493
3 690 S_1 0.9987495
#画图
> ggboxplot(d1,x='Var1',y='value',group='Var2',color="black",add="jitter",xlab="Gene Size",ylab="Cor",title="TMB(>1%) correlation with diff gene size",ggtheme=theme_pubr())+theme(axis.text.x=element_text(angle=45,hjust = 1,colour="black",family="Times",size=10),panel.grid.major = element_blank(),panel.grid.minor = element_blank(),panel.background = element_blank(),axis.line = element_line(colour = "black"),plot.margin = unit(c(2,2,2,2),"cm"))
TMB_GeneNumber
#查看成对样本采用WES和652panel的TMB值:
ds<- d[sample(seq(1,dim(d)[1],1),50),] #随机选择50行
ds1<-ds1<-ds[,c(1,2,5)] #选择1%频率的
> ds1
sampleID a d
2986 A14P 2.1746661 3.0040540
1842 7147 9.7220367 11.2652024
3371 A1N2 3.5818030 4.5060810
4761 A4A9 5.7053005 9.0121619
> colnames(ds1)<-c("sampleID","WES","652GenePanel")
> ggpaired(ds1,cond1='WES',cond2='652GenePanel',color='sampleID',width=0.1,line.size=0.5,line.color="grey",repel=T,title="WES vs 652GenePanel (50 samples show)",ylab="TMB value",xlab="")+theme(plot.margin = unit(c(2,2,2,2),"cm"),legend.position = "none")
#随机采集200行,做一致性分析
> dim(d)
[1] 8411 7
> co={}
for (i in 1:50){
ds<- d[sample(seq(1,dim(d)[1],1),200),]
new_co<-cor(ds[,2],ds[,5])
co<-cbind(co,new_co)
}
50个样本WES和652Panel-TMB比较
网友评论