deeptools 结果重新绘图

Original JunJunLab 老俊俊的生信笔记 2022-08-15

收录于合集 #Chip-seq笔记 1个

随缘

1引言

之前就有这个想法,把 deeptools computeMatrix 输出的结果尝试自己能不能 重新个性化绘制 以及 是否方便 。但一直没有施以行动,此篇推文即是关于这样一个分享。

最重要的是去理解 computeMatrix 输出的结果文件的内容,才能准确的绘图!

有关 deeptools 的介绍见:

Deeptools 神器之 bamCoverage
神器之 computeMatrix + 绘图

这里从网上下载了 5 个 Chip-seq 的结果文件进行可视化,当作测试数据。

2deeptools 可视化

首先我们使用 deeptools 的功能去可视化,去看看效果怎样。

计算在基因 TSS 及 TES 的信号分布,reference-point模式:

computeMatrix reference-point -p 10 \
              --binSize 50 \
              --referencePoint TSS \
              -a 3000 -b 3000 \
              -R hg19.ncbiRefSeq.gtf \
              -S Input.bigWig H3K4me1.bigWig H3K4me3.bigWig H3K27ac.bigWig H3K9ac.bigWig \
              --skipZeros \
              -o combine-refPoint-data.gz

scale-regions 模式:

computeMatrix scale-regions -p 10 \
              --binSize 50 \
              --regionBodyLength 5000 \
              -a 3000 -b 3000 \
              -R hg19.ncbiRefSeq.gtf \
              -S Input.bigWig H3K4me1.bigWig H3K4me3.bigWig H3K27ac.bigWig H3K9ac.bigWig \
              --skipZeros \
              -o combine-scaleRegion-data.gz

然后使用 plotHeatmap 函数进行可视化绘制热图:

plotHeatmap -m combine-refPoint-data.gz \
            --missingDataColor 1 \
            --colorList 'white,#339933' \
            --heatmapHeight 12 \
            -o refp-heatmap.pdf

plotHeatmap -m combine-scaleRegion-data.gz \
            --missingDataColor 1 \
            --colorList 'white,#0066CC' \
            --heatmapHeight 12 \
            -o scaleRegion-heatmap.pdf

TSS:

TSS-TES:

感觉还是很不错的!

3R 里可视化

reference-point 数据

首先需要对输出的压缩文件进行解压,这里读入的是解压后的文件:

library(ggplot2)
library(tidyverse)
library(reshape2)
library(ggsci)
library(Rmisc)
library(data.table)

# load data
refp <- fread('combine-data',header = F)

# check
head(refp[1:3,1:8])
#      V1       V2       V3      V4 V5 V6    V7     V8
# 1: chrY 59001389 59002517 CTBP2P1  .  + 1.193 0.9338
# 2: chrY 28772469 28773515  CCNQP2  .  - 0.105 0.2480
# 3: chrY 28740814 28780802 PARP4P1  .  - 0.000 0.0000

dim(refp)
# [1] 92857   606

# 间距除以50 binsize得120,再乘以5个样本得600列,加上前6列基因信息为606列
(3000 + 3000)/50*5
# [1] 600

到这里基本上知道了数据的基本内容了,六列基因的信息加上bin 列的值。后面绘图需要再处理一下格式:

# 保留基因名
clean_refp <- refp %>% select(c(-1:-3,-5,-6))

# check
head(clean_refp[1:3,1:8])
#         V4    V7     V8     V9    V10    V11    V12   V13
# 1: CTBP2P1 1.193 0.9338 1.0962 1.6124 2.3094 2.5792 2.727
# 2:  CCNQP2 0.105 0.2480 0.3950 0.4400 0.4400 0.4400 0.335
# 3: PARP4P1 0.000 0.0000 0.0000 0.0000 0.0000 0.0000 0.000

# wide to long
long_refp <- melt(clean_refp,id.vars = 'V4',value.name = 'signal') %>%
  select(-variable)

# add sample name
long_refp$sample <- rep(c("Input","H3K4me1","H3K4me3","H3K27ac","H3K9ac"),
                        each = nrow(refp)*120)

# add x position
long_refp$pos <- rep(c(1:120),each = nrow(refp),times = 5)

Profile 图为每个位置处所有基因的一个均值作为代表,这里也计算了 sd和95% 置信区间,也可以作为阴影区间可视化:

# calculate means
filnal_refp <- long_refp %>%
  # remove na
  tidyr::drop_na() %>%
  dplyr::group_by(sample,pos) %>%
  # mean and 95% interval confidence
  dplyr::summarise(mean_signal = mean(signal),
                   sd = sd(signal),
                   upper = CI(signal,ci = 0.95)[1],
                   lower = CI(signal,ci = 0.95)[3])

# check
head(filnal_refp)
# # A tibble: 6 x 6
# # Groups:   sample [1]
#    sample    pos mean_signal    sd upper lower
#     <chr>   <int>       <dbl> <dbl> <dbl> <dbl>
# 1 H3K27ac     1        2.28  7.13  2.33  2.24
# 2 H3K27ac     2        2.30  7.18  2.34  2.25
# 3 H3K27ac     3        2.31  7.21  2.35  2.26
# 4 H3K27ac     4        2.31  7.21  2.36  2.27
# 5 H3K27ac     5        2.32  7.20  2.37  2.28
# 6 H3K27ac     6        2.33  7.20  2.38  2.28

可视化:

# plot
p <- ggplot(filnal_refp,aes(x = pos,y = mean_signal)) +
  geom_line(aes(color = sample),size = 1) +
  theme_classic(base_size = 14) +
  scale_color_d3(name = '') +
  # x label
  scale_x_continuous(breaks = c(0,60,120),
                     labels = c('-3 kb','TSS','+3 kb')) +
  xlab('') + ylab('Normalized signal') +
  theme(aspect.ratio = 0.8)

p

分面:

# facet
p + facet_wrap(~sample,scales = 'free_x',ncol = 5) +
  theme(strip.background = element_rect(color = NA,fill = 'grey'))

添加置信区间,是有阴影区间的,可能看不太出来:

# add CI
ggplot(filnal_refp,aes(x = pos,y = mean_signal)) +
  # add 0.95 interval
  geom_ribbon(aes(ymin = lower,
                  ymax = upper,
                  fill = sample),
              alpha = 0.5) +
  geom_line(aes(color = sample),size = 1) +
  theme_classic(base_size = 16) +
  scale_color_d3(name = '') +
  scale_fill_d3(name = '') +
  # x label
  scale_x_continuous(breaks = c(0,60,120),
                     labels = c('-3 kb','TSS','+3 kb')) +
  xlab('') + ylab('Normalized signal') +
  theme(aspect.ratio = 0.8)

scale-regions 数据

这个数据与上面不同的是多了中间的 regionBodyLength 的参数,所以 bin 的数量也包含了这个区间的长度:

####################################################
# load data
scaler <- fread('combine-scaleRegion-data',header = F)

# check
head(scaler[1:3,1:8])
#      V1       V2       V3      V4 V5 V6    V7     V8
# 1: chrY 59001389 59002517 CTBP2P1  .  + 1.193 0.9338
# 2: chrY 28772469 28773515  CCNQP2  .  - 0.105 0.2480
# 3: chrY 28740814 28780802 PARP4P1  .  - 0.000 0.0000

dim(scaler)
# [1] 93070  1106

# 间距除以50 binsize得220,再乘以5个样本得1100列,加上前6列基因信息为1106列
(3000 + 5000 + 3000)/50*5
# [1] 1100

到这里可以明白了数据的内容构成。

继续处理:

# 保留基因名
clean_scaler <- scaler %>% select(c(-1:-3,-5,-6))

# check
head(clean_scaler[1:3,1:8])
#         V4    V7     V8     V9    V10    V11    V12   V13
# 1: CTBP2P1 1.193 0.9338 1.0962 1.6124 2.3094 2.5792 2.727
# 2:  CCNQP2 0.105 0.2480 0.3950 0.4400 0.4400 0.4400 0.335
# 3: PARP4P1 0.000 0.0000 0.0000 0.0000 0.0000 0.0000 0.000

# wide to long
long_scaler <- melt(clean_scaler,id.vars = 'V4',value.name = 'signal') %>%
  select(-variable)

# add sample name
long_scaler$sample <- rep(c("Input","H3K4me1","H3K4me3","H3K27ac","H3K9ac"),
                        each = nrow(scaler)*220)

# add x position
long_scaler$pos <- rep(c(1:220),each = nrow(scaler),times = 5)

# calculate means
filnal_scaler <- long_scaler %>%
  # remove na
  tidyr::drop_na() %>%
  dplyr::group_by(sample,pos) %>%
  # mean and 95% interval confidence
  dplyr::summarise(mean_signal = mean(signal),
                   sd = sd(signal),
                   upper = CI(signal,ci = 0.95)[1],
                   lower = CI(signal,ci = 0.95)[3])

# check
head(filnal_scaler)
# # A tibble: 6 x 6
# # Groups:   sample [1]
#    sample    pos mean_signal    sd upper lower
#     <chr>   <int>       <dbl> <dbl> <dbl> <dbl>
# 1 H3K27ac     1        2.28  7.12  2.32  2.23
# 2 H3K27ac     2        2.29  7.17  2.34  2.24
# 3 H3K27ac     3        2.30  7.20  2.35  2.25
# 4 H3K27ac     4        2.31  7.20  2.35  2.26
# 5 H3K27ac     5        2.32  7.20  2.36  2.27
# 6 H3K27ac     6        2.32  7.19  2.37  2.28

最后可视化:

# plot
p <- ggplot(filnal_scaler,aes(x = pos,y = mean_signal)) +
  geom_line(aes(color = sample),size = 1) +
  theme_classic(base_size = 14) +
  scale_color_d3(name = '') +
  # x label
  scale_x_continuous(breaks = c(0,60,160,220),
                     labels = c('-3 kb','TSS','TES','+3 kb')) +
  xlab('') + ylab('Normalized signal') +
  theme(aspect.ratio = 0.8)

p

分面:

# facet
p + facet_wrap(~sample,scales = 'free_x',ncol = 5) +
  theme(strip.background = element_rect(color = NA,fill = 'grey'),
        axis.text.x = element_text(angle = 45,hjust = 1))

heatmap

这里也尝试了绘制热图,但是数据太大,R 里画的话就非常非常慢, 此外需要对数据进行排序, 这里提供一个代码:

####################################################
# heatmap

head(long_refp,3)
#        V4 signal sample pos
# 1 CTBP2P1  1.193  Input   1
# 2  CCNQP2  0.105  Input   1
# 3 PARP4P1  0.000  Input   1

# plot (slowly)
ggplot(long_refp %>% filter(sample == 'H3K27ac'),
       aes(x = pos,y = V4)) +
  geom_tile(aes(fill = signal)) +
  theme_bw() +
  # coord_cartesian(expand = 0) +
  scale_x_continuous(breaks = c(0,60,120),
                     labels = c('-3 kb','TSS','+3 kb')) +
  scale_fill_gradient(low = 'white',high = 'red') +
  ylab('') + xlab('') +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        panel.grid = element_blank(),
        aspect.ratio = 1.5)