vcfR

作者: Thinkando | 来源:发表于2019-04-04 23:09 被阅读78次
  • vcfR is a package intended to help visualize, manipulate and quality filter VCF data.

1. addID

data(vcfR_test)
head(vcfR_test)
vcfR_test <- addID(vcfR_test)
head(vcfR_test)
  • 把vcf 缺失的ID补上

2. 频数计算

  • AD 每个等位基因测序的次数
  • 对于二倍体的纯合位置,观察到的频率为1或0,对于杂合位置,观察到接近0.5的频率位置。与此期望的偏差可能表明等位基因不平衡或倍性差异。
set.seed(999)
x1 <- round(rnorm(n=9, mean=10, sd=2))
x2 <- round(rnorm(n=9, mean=20, sd=2))
ad <- matrix(paste(x1, x2, sep=","), nrow=3, ncol=3)
# Sample_1,Sample_2,Sample_3
colnames(ad) <- paste('Sample', 1:3, sep="_")
rownames(ad) <- paste('Variant', 1:3, sep="_")
ad[1,1] <- "9,23,12"
AD_frequency(ad=ad)
image.png

3. 质控

library(vcfR)
data(vcfR_example)
chrom <- create.chromR('sc50', seq=dna, vcf=vcf, ann=gff)
head(chrom)
chrom
plot(chrom)
chrom <- masker(chrom, min_QUAL = 1, min_DP = 300, max_DP = 700, min_MQ = 59, max_MQ = 61)
chrom <- proc.chromR(chrom, win.size=1000)
plot(chrom)
chromoqc(chrom)
  • 这图可以出一个质控报告了


    image.png

4. 删除indel

data(vcfR_test)
gt <- extract.gt(vcfR_test) # 返回0|0,1|0
gt <- extract.gt(vcfR_test, return.alleles = TRUE) # 返回A|A,A|T
data(vcfR_test)
getFIX(vcfR_test) # 查看正常的vcf 文件
vcf <- extract.indels(vcfR_test) # 排除indels
getFIX(vcf)
vcf@fix[nrow(vcf@fix),'ALT'] <- ".,A" # @是从R的类实例里面读取数据,把NA变成".,A"
vcf <- extract.indels(vcf)
getFIX(vcf)
data(vcfR_test)
vcfR_test@fix[1,'ALT'] <- "<NON_REF>" 
vcf <- extract.indels(vcfR_test)
getFIX(vcf)
data(vcfR_test)
extract.haps(vcfR_test, unphased_as_NA = FALSE) # unphased 无偏的
extract.haps(vcfR_test)

allel frequence count

# An empty plot.
freq_peak_plot(pos=1:40)
data(vcfR_example)
gt <- extract.gt(vcf)
hets <- is_het(gt) # 查看是不是杂合
# Censor non-heterozygous positions.
is.na(vcf@gt[,-1][!hets]) <- TRUE
# Extract allele depths.
ad <- extract.gt(vcf, element = "AD")
ad1 <- masplit(ad, record = 1) #major allele
ad2 <- masplit(ad, record = 2) #second allele
freq1 <- ad1/(ad1+ad2)
freq2 <- ad2/(ad1+ad2)
myPeaks1 <- freq_peak(freq1, getPOS(vcf))
is.na(myPeaks1$peaks[myPeaks1$counts < 20]) <- TRUE
myPeaks2 <- freq_peak(freq2, getPOS(vcf), lhs = FALSE)
is.na(myPeaks2$peaks[myPeaks2$counts < 20]) <- TRUE
freq_peak_plot(pos = getPOS(vcf), ab1 = freq1, ab2 = freq2, fp1 = myPeaks1, fp2=myPeaks2)
image.png

参考文献

  1. https://cran.r-project.org/web/packages/vcfR/vcfR.pdf

相关文章

网友评论

    本文标题:vcfR

    本文链接:https://www.haomeiwen.com/subject/ljeqiqtx.html