这里只考虑正向的,如果有反向的可以先转为正向。
# Summarizing interval coverage across genomic features
bedfile <- valr_example("genes.hg19.chr22.bed.gz")
genomefile <- valr_example("hg19.chrom.sizes.gz")
bgfile <- valr_example("hela.h3k4.chip.bg.gz")
genes <- read_bed(bedfile, n_fields = 6)
genome <- read_genome(genomefile)
y <- read_bedgraph(bgfile)
# generate 1 bp TSS intervals, "+" strand only
tss <- genes %>%
filter(strand == "+") %>%
mutate(end = start + 1)
# 1000 bp up and downstream
region_size <- 2000
# 50 bp windows
win_size <- 100
bin_number <- region_size *2 /win_size
# add slop to the TSS, break into windows and add a group
# bed_slop函数扩展区域空间,延长区间上下游,这里延长上下游1KB
# bed_makewindows 扩展后的bed区间以50bp为一个bin进行划分
x <- tss %>%
bed_slop(genome, both = region_size) %>%
bed_makewindows(genome, win_size)
# map signals to TSS regions and calculate summary statistics.
# 使用bed_map函数统计每一个bin中的总reads数目
res <- bed_map(x, y, win_sum = sum(value, na.rm = TRUE)) %>%
group_by(.win_id) %>%
summarize(win_mean = mean(win_sum, na.rm = TRUE),
win_sd = sd(win_sum, na.rm = TRUE))
library(ggplot2)
x_labels <- seq(-region_size, region_size, by = win_size * 5)
x_breaks <- seq(1, bin_number + 1, by = 5)
sd_limits <- aes(ymax = win_mean + win_sd, ymin = win_mean - win_sd)
p <- ggplot(res, aes(x = .win_id, y = win_mean)) +
geom_point(size = 0.25) + geom_pointrange(sd_limits, size = 0.1) +
scale_x_continuous(labels = x_labels, breaks = x_breaks) +
xlab("Position (bp from TSS)") + ylab("Signal") +
theme_classic()
TSS_Plot <- ggplot(res, aes(x = .win_id, y = win_mean)) +
geom_point(size = 0.25) + geom_line(size = 1) +
scale_x_continuous(labels = c("-2Kb","TSS", "2Kb"), breaks = seq(0,40, by = bin_number/2)) +
xlab("Position (bp from TSS)") + ylab("Signal") +
theme_bw() +
theme(legend.key=element_rect(linetype='dashed',color="white"),
axis.text.y = element_text(size=20),axis.text.x = element_text(size=20),
legend.title = element_text(size=15),legend.text = element_text(size=15),
legend.key.height=unit(1.2,'cm'))
## 绘制TSS + GeneBody + TES
## 生成genebody的counts数
gene_body <- genes %>%
filter(strand == "+")
gene_body_x <- gene_body %>%
bed_makewindows(genome, num_win = bin_number/2)
gene_body_res <- bed_map(gene_body_x, y, win_sum = sum(value, na.rm = TRUE)) %>%
group_by(.win_id) %>%
summarize(win_mean = mean(win_sum, na.rm = TRUE),
win_sd = sd(win_sum, na.rm = TRUE))
gene_body_res[,1] <- c((bin_number/2 + 1):bin_number)
up_stream <- res[1:bin_number/2,]
# down_stream <- matrix(data = NA, ncol = 3, nrow = 20)
down_stream <- res[(bin_number/2 + 1):bin_number,]
down_stream[, 1] <- c((bin_number + 1):(bin_number/2 + bin_number))
new_data <- rbind(up_stream , gene_body_res, down_stream)
## ggplot2进行可视化
# 1000 bp up and downstream
region_size <- 2000
# 50 bp windows
win_size <- 100
ggplot(new_data, aes(x = .win_id, y = win_mean)) +
geom_point(size = 0.25) + geom_line(size = 1) +
scale_x_continuous(labels = c("-2Kb","TSS", "TES", "2Kb"), breaks = seq(0,bin_number*3/2, by = bin_number/2)) +
theme_bw() + xlab("") + ylab("Signal") +
theme(legend.key=element_rect(linetype='dashed',color="white"),
axis.text.y = element_text(size=20),axis.text.x = element_text(size=20),
legend.title = element_text(size=15),legend.text = element_text(size=15),
legend.key.height=unit(1.2,'cm'))
image.png
从上图明显可以看出此信号明显的富集在TSS区右侧
网友评论