新年最适合干什么?当然是复盘总结!!
先从代码开始复盘。
linux处理bed文件
#对输入的文件进行处理,使得其满足methylkit的输入文件格式
for file in /data/home/lt/data/brain_adrenal/*
do
var="$file"
tmp=$(echo ${var##*/})
a=$(echo ${tmp%.*})
echo $a #在屏幕上显示即将生成的文件名
gawk '{print $1"."$3,$1,$3,$6,$10,$11,100-$11}' $file | \
gawk '{if($4=="+"){$4="F"}else{$4="R"}print $0}' | \
sed '1d' | sed '1i\chrBase chr base strand coverage freqC freqT' > $a #
done
用linux不多,还是一开始尝试上游分析(测序数据的质控、比对)的时候用的比较多。linux的优点是比较快。
methylkit R包上游处理
setwd("D:/DMP_mk_")
library(methylKit)
#创建file list
#多个对象
if(T)
{
file.list <- list(
"adrenal_1.txt",
"adrenal_2.txt",
"brain_1.txt",
"brain_2.txt",
"Breast_1.txt",
"Breast_2.txt",
"Kidney_1.txt",
"Kidney_2.txt",
"Left_Ventricle_1.txt",
"Left_Ventricle_2.txt",
"Liver_1.txt",
"Liver_2.txt",
"Lung_1.txt",
"Lung_2.txt",
"Pancreas_1.txt",
"Pancreas_2.txt",
"SkeletalMuscle_1.txt",
"SkeletalMuscle_2.txt",
"SkeletalMuscle_3.txt",
"SkeletalMuscle_4.txt",
"Skin_1.txt",
"Skin_2.txt",
"Stomach_1.txt",
"Stomach_2.txt",
"Testis_1.txt",
"Testis_2.txt",
"Uterus_1.txt",
"Uterus_2.txt"
)
}
# read the files to a methylRawList object: myobj
#多个分组对象读取
if(T)
{
myobj <- methRead(
file.list,
sample.id=list(
"adrenal_1.txt",
"adrenal_2.txt",
"brain_1.txt",
"brain_2.txt",
"Breast_1.txt",
"Breast_2.txt",
"Kidney_1.txt",
"Kidney_2.txt",
"Left_Ventricle_1",
"Left_Ventricle_2",
"Liver_1.txt",
"Liver_2.txt",
"Lung_1.txt",
"Lung_2.txt",
"Pancreas_1.txt",
"Pancreas_2.txt",
"SkeletalMuscle_1.txt",
"SkeletalMuscle_2.txt",
"SkeletalMuscle_3.txt",
"SkeletalMuscle_4.txt",
"Skin_1.txt",
"Skin_2.txt",
"Stomach_1.txt",
"Stomach_2.txt",
"Testis_1.txt",
"Testis_2.txt",
"Uterus_1.txt",
"Uterus_2.txt"),
assembly="hg19",
sep = " ",
treatment=c(0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,8,8,9,9,10,10,11,11,12,12),
context="CpG",
mincov = 10
)
}
#去除覆盖率较低的reads(count<10)
myobj=filterByCoverage(myobj,lo.count=10,lo.perc=NULL,hi.count=NULL,hi.perc=99.9)
#归一化
myobj <- normalizeCoverage(myobj)
#合并所有样本中共有的位点
meth=unite(myobj, destrand=FALSE)
save(meth,file = "united_data.Rdata")
#样品聚类信息
pdf(file = "correlation_meth_.pdf",width = 7,height = 20)#在pdf编辑器中进行了编辑
clusterSamples(meth, dist="correlation", method="ward", plot=TRUE)
dev.off()
methylKit是一个用来分析RRBS数据及其变体的R包,利用 filterByCoverage 函数可以对RRBS数据进行初步的过滤以及归一化,消除PCR过度扩增以及在此位点上覆盖的reads过小所带来的偏差。
测序深度和覆盖度
在这里的coverage个人认为是覆盖了某个base的reads数目。hi.perc=99.9表示去掉最大的0.1%。
normalizeCoverage 函数使用从样本覆盖率分布的均值或中位数之间的差异派生的比例因子来标准化样本之间的覆盖率值。这样可以消除不同样本的测序总reads数目所带来的差异。(个人感觉如果最后计算的是甲基化的百分比,标准化的作用不大。)
没有过滤之前:
> getMethylationStats(myobj[[1]],both.strands=FALSE)
methylation statistics per base
summary:
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 3.226 22.795 33.333 100.000
percentiles:
0% 10% 20% 30% 40% 50% 60% 70%
0.000000 0.000000 0.000000 0.000000 1.612903 3.225806 6.086957 15.789474
80% 90% 95% 99% 99.5% 99.9% 100%
60.869565 90.909091 96.551724 100.000000 100.000000 100.000000 100.000000
> getCoverageStats(myobj[[1]],both.strands=FALSE)
read coverage statistics per base
summary:
Min. 1st Qu. Median Mean 3rd Qu. Max.
10.00 24.00 42.00 58.41 69.00 46296.00
percentiles:
0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5% 99.9%
10 15 21 28 35 42 51 62 77 106 150 241 271 336
100%
46296
过滤之后:
> getMethylationStats(myobj[[1]],both.strands=FALSE)
methylation statistics per base
summary:
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 3.226 22.799 33.333 100.000
percentiles:
0% 10% 20% 30% 40% 50% 60% 70%
0.000000 0.000000 0.000000 0.000000 1.612903 3.225806 6.060606 15.789474
80% 90% 95% 99% 99.5% 99.9% 100%
60.869565 90.909091 96.551724 100.000000 100.000000 100.000000 100.000000
> getCoverageStats(myobj[[1]],both.strands=FALSE)
read coverage statistics per base
summary:
Min. 1st Qu. Median Mean 3rd Qu. Max.
10.00 24.00 42.00 54.65 68.00 335.00
percentiles:
0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5% 99.9%
10 15 21 28 35 42 51 62 77 105 148 237 263 306
100%
335
归一化之后:
> getMethylationStats(myobj[[1]],both.strands=FALSE)
methylation statistics per base
summary:
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 3.125 22.760 33.333 100.000
percentiles:
0% 10% 20% 30% 40% 50% 60% 70%
0.000000 0.000000 0.000000 0.000000 1.351351 3.125000 6.000000 15.789474
80% 90% 95% 99% 99.5% 99.9% 100%
61.016949 91.176471 96.774194 100.000000 100.000000 100.000000 100.000000
> getCoverageStats(myobj[[1]],both.strands=FALSE)
read coverage statistics per base
summary:
Min. 1st Qu. Median Mean 3rd Qu. Max.
13.00 30.00 53.00 68.96 86.00 423.00
percentiles:
0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 99% 99.5% 99.9%
13 19 26 35 44 53 64 78 97 132 187 299 332 386
100%
423
生成聚类图:
> clusterSamples(meth, dist="correlation", method="ward", plot=TRUE)
The "ward" method has been renamed to "ward.D"; note new "ward.D2"
Call:
hclust(d = d, method = HCLUST.METHODS[hclust.method])
Cluster method : ward.D
Distance : pearson
Number of objects: 28
在这一步中我们得到了united_data.Rdata文件。
另外一种unite选择
if(F){
meth=unite(myobj,destrand=FALSE,min.per.group = 1L)
perc.meth=percMethylation(meth)
tmp <- data.frame(perc.meth)
for (i in 1:715765)
{
for (j in seq(1,25,2))
{
if(is.na(tmp[i,j]))
tmp[i,j] = tmp[i,j+1]
}
}
rm(i,j)
for (i in 1:715765)
{
if(is.na(tmp[i,27]))
{tmp[i,27] = tmp[i,28]}
}
rm(i)
tmp_1 <- tmp
for (i in 1:715765)
{
for (j in seq(2,28,2))
{
if(is.na(tmp_1[i,j]))
tmp_1[i,j] = tmp_1[i,j-1]
}
}
rm(i,j)
tmp_2 <- tmp_1
for (i in 1:715765)
{
if(is.na(tmp_2[i,17]))
tmp_2[i,17] = tmp_2[i,19]
if(is.na(tmp_2[i,18]))
tmp_2[i,18] = tmp_2[i,20]
if(is.na(tmp_2[i,19]))
tmp_2[i,19] = tmp_2[i,17]
if(is.na(tmp_2[i,20]))
tmp_2[i,20] = tmp_2[i,18]
}
meth_1 <- meth
perc.meth_1 <- tmp_2
rm(meth,perc.meth)
save(meth_1,perc.meth_1,file = "united_data_1.Rdata")
}
这里我们就可以得到同组至少包含一个样本的unite文件。我们将它另存为united_data_1.Rdata文件。但是上述代码运行的时间在一个小时以上,R语言的for循环速度实在太慢了,暂时还没有找到好的解决方法。
网友评论