其他
Molecular Cell 文章主图结果复现
没关注?伸出手指点这里---
1引言
今天复现一篇 Molecular Cell 文章里的部分结果。
文章标题:
eIF3 Associates with 80S Ribosomes to Promote Translation Elongation, Mitochondrial Homeostasis, and Muscle Health
复现图:
复现 A 图和 C 图。
文章方法:
我这里不是完全按照文章的方法,比如比对软件我用的是 hisat2。
2数据下载
GSE 号:
GSE131074
3质控
# 1.qc raw-data
mkdir -p 2.QC-data/raw-qc 2.QC-data/clean-qc
fastqc -t 12 -q -o 2.QC-data/raw-qc/ 1.raw-data/*.gz
multiqc 2.QC-data/raw-qc/* -o 2.QC-data/raw-qc/
4去除接头
# 2.trim adaptors
mkdir 3.trim-data
for i in SRR90471{90..95}
do
cutadapt -j 10 -f fastq -a CTGTAGGCACCATCAAT \
-O 6 -m 25 -M 35 \
-o 3.trim-data/${i}.trimmed.fq.gz \
1.raw-data/${i}.fastq.gz
done
fastqc -t 12 -q -o 2.QC-data/clean-qc/ 3.trim-data/*.gz
multiqc 2.QC-data/clean-qc/* -o 2.QC-data/clean-qc/
5建立 rRNA 索引并去除 rRNA 序列污染
# 3.build index for rRNA/tRNA and genome
mkdir rRNA-index
bowtie2-build human_rRNA.fasta rRNA-index/human_rRNA
# 4.rm rtRNA contamination from fastq files
mkdir 4.rmrRNA-data
for i in SRR90471{90..95}
do
bowtie2 -p 20 -x ./index/rRNA-index/human_rRNA \
--un-gz 4.rmrRNA-data/${i}.rmrRNA.fq.gz \
-U 3.trim-data/${i}.trimmed.fq.gz \
-S 4.rmrRNA-data/null
done
6准备转录组序列
# 5.map to trancriptome
GetCDSLongestFromGTF --database ensembl \
--gtffile Homo_sapiens.GRCh38.103.gtf.gz \
--genome Homo_sapiens.GRCh38.dna.primary_assembly.fa \
--outfile longest_cds_trans.fa
7比对到转录组上
## make index
mkdir trans-index
hisat2-build longest_cds_trans.fa trans-index/longest_cds_trans
mkdir 5.map-data
for i in SRR90471{90..95}
do
hisat2 -p 20 -x ./index/trans-index/longest_cds_trans \
-k 1 --summary-file 5.map-data/${i}.mapinfo.txt \
-U 4.rmrRNA-data/${i}.rmrRNA.fq.gz \
-S 5.map-data/${i}.sam
done
8计算 Ribo density
mkdir("./3.density-data")
sample = ["../5.map-data/SRR90471$i.sam" for i in range(90,95)]
outputname = ["siEif3e-rep1","siEif3e-rep2","siEif3e-rep3","siControl-rep1","siControl-rep2","siControl-rep3"]
# batch run
for i in range(1,6)
CalculateCWRibosomeDensity(sample[i],"./3.density-data/"*outputname[i]*".denisty.txt")
end
9metagene analysis
import os
# make folder
os.mkdir('./4.metagene-data')
# relative to start codon
outputname = ["siEif3e-rep1","siEif3e-rep2","siEif3e-rep3","siControl-rep1","siControl-rep2","siControl-rep3"]
# batch run
for i in range(0,6):
MetageneAnalysis(inputFile=''.join(["./3.density-data/",outputname[i],".denisty.txt"]),
outputFile=''.join(["./4.metagene-data/",outputname[i],".metageneST.txt"]),
mode="st",cdslength=900,expression=50)
# relative to stop codon
outputname = ["siEif3e-rep1","siEif3e-rep2","siEif3e-rep3","siControl-rep1","siControl-rep2","siControl-rep3"]
# batch run
for i in range(0,6):
MetageneAnalysis(inputFile=''.join(["./3.density-data/",outputname[i],".denisty.txt"]),
outputFile=''.join(["./4.metagene-data/",outputname[i],".metageneSP.txt"]),
mode="sp",cdslength=900,expression=50)
10C 图可视化
数据预处理:
library(ggplot2)
library(data.table)
library(tidyverse)
library(ggsci)
# load data
name <- list.files('3.density-data/','.txt')
map_df(1:length(name),function(x){
tmp <- fread(paste('3.density-data/',name[x],sep = ''),sep = '\t') %>%
select(-V3)
colnames(tmp) <- c('id','pos','rpm')
# add sample
tmp$sample <- sapply(strsplit(name[x],split = '-'),'[',1)
# add replicate
tmp$rep <- sapply(strsplit(name[x],split = '\\.'),'[',1)
return(tmp)
}) -> df_density
# add gene name
df_density[, c("gene_name") := tstrsplit(id, "|", fixed=TRUE)[1]]
# filter gene
df <- df_density %>% filter(gene_name == 'SDHA')
# start and end
start <- sapply(strsplit(df$id[1],split = '\\|'),'[',4) %>% as.numeric()
end <- sapply(strsplit(df$id[1],split = '\\|'),'[',5) %>% as.numeric()
# filter CDS region
cds_df <- df %>% filter(pos >= start & pos <= end) %>%
mutate(nt_pos = pos - start)
# transform to codon pos
sq <- seq(1,(end - start + 1),3);rg <- c(1:length(sq))
map_df(1:length(sq),function(z){
tmp2 = cds_df[nt_pos >= sq[z] & nt_pos <= sq[z] + 2
][,.(mean_rpm = mean(rpm)),by = .(gene_name,sample,rep)
][,`:=`(codon_pos = rg[z])]
return(tmp2)
}) -> codon_res
# mean for replicates
merge_rep <- codon_res %>%
dplyr::group_by(gene_name,sample,codon_pos) %>%
dplyr::summarise(mean_rep_rpm = mean(mean_rpm))
绘图:
# plot
ggplot(merge_rep,aes(x = codon_pos,y = mean_rep_rpm)) +
geom_rect(aes(xmin = 25,xmax = 75,ymin = -Inf,ymax = Inf),
color = NA,fill = 'yellow',alpha = 0.005) +
geom_col(aes(fill = sample),show.legend = F) +
theme_classic(base_size = 16) +
scale_fill_d3(name = '') +
scale_x_continuous(breaks = c(0,51,200,400,600)) +
theme(legend.background = element_blank(),
aspect.ratio = 0.25,
strip.background = element_rect(color = NA,fill = 'grey')) +
ylab('Ralative footprint density(RPM)') +
xlab('Ribosome position (codons/amino acids)') +
facet_wrap(~sample,scales = 'free_x',ncol = 1) +
ggtitle('gene: SDHA')
11A 图可视化
数据预处理:
library(ggplot2)
library(tidyverse)
library(ggsci)
library(Rmisc)
library(data.table)
###################################################################### start codon
name <- list.files(path = '4.metagene-data/',pattern = '.metageneST.txt')
map_df(1:length(name), function(x){
tmp = read.table(paste('4.metagene-data/',name[x],sep = ''))
colnames(tmp) <- c('pos','density')
tmp$sample <- sapply(strsplit(name[x],split = '-'),'[',1)
tmp$rep <- sapply(strsplit(name[x],split = '\\.'),'[',1)
return(tmp)
}) %>% data.table() -> df_st
########################## codon position transform
sq <- seq(-51,1500,3);rg <- c(-17:500)
map_df(unique(df_st$rep),function(x){
tmp = df_st[rep %in% x] %>% arrange(pos)
map_df(1:length(sq),function(z){
tmp1 = tmp[pos >= sq[z] & pos <= sq[z] + 2
][,.(mean_norm_density = mean(density)),by = .(rep,sample)
][,`:=`(codon_pos = rg[z])]
return(tmp1)
}) -> res
}) -> codon_meta_st
# calculate replicates means
mean_sdst <- codon_meta_st %>%
dplyr::group_by(sample,codon_pos) %>%
dplyr::summarise(mean_density = mean(mean_norm_density),
sd = sd(mean_norm_density))
绘图:
# mean sd
pst <- ggplot(mean_sdst,aes(x = codon_pos,y = mean_density)) +
geom_ribbon(aes(ymin = mean_density - sd,
ymax = mean_density + sd,
fill = sample),
alpha = 0.25) +
geom_line(aes(color = sample),size = 1) +
# geom_hline(yintercept = 1,lty = 'dashed',size = 1,color = 'grey50') +
theme_classic(base_size = 16) +
scale_color_lancet(name = '') +
scale_fill_lancet(name = '') +
scale_x_continuous(expand = c(0,0),limits = c(0,500)) +
theme(legend.position = c(0.9,0.9),
legend.background = element_blank(),
aspect.ratio = 0.6) +
ylab('Mean read density [AU]') +
xlab('Disatnce from start codon(codons/amino acids)') +
ylim(0,2)
pst
对于 A 图第二个小图类似:
###################################################################### stop codon
name <- list.files(path = '4.metagene-data/',pattern = '.metageneSP.txt')
map_df(1:length(name), function(x){
tmp = read.table(paste('4.metagene-data/',name[x],sep = ''))
colnames(tmp) <- c('pos','density')
tmp$sample <- sapply(strsplit(name[x],split = '-'),'[',1)
tmp$rep <- sapply(strsplit(name[x],split = '\\.'),'[',1)
return(tmp)
}) %>% data.table() -> df_sp
########################## codon position transform
sq <- seq(-1501,50,3);rg <- c(-500:17)
map_df(unique(df_sp$rep),function(x){
tmp = df_sp[rep %in% x] %>% arrange(pos)
map_df(1:length(sq),function(z){
tmp1 = tmp[pos >= sq[z] & pos <= sq[z] + 2
][,.(mean_norm_density = mean(density)),by = .(rep,sample)
][,`:=`(codon_pos = rg[z])]
return(tmp1)
}) -> res
}) -> codon_meta_sp
# calculate replicates means
mean_sdsp <- codon_meta_sp %>%
dplyr::group_by(sample,codon_pos) %>%
dplyr::summarise(mean_density = mean(mean_norm_density),
sd = sd(mean_norm_density))
# mean sd
psp <- ggplot(mean_sdsp,aes(x = codon_pos,y = mean_density)) +
geom_ribbon(aes(ymin = mean_density - sd,
ymax = mean_density + sd,
fill = sample),
alpha = 0.25) +
geom_line(aes(color = sample),size = 1) +
# geom_hline(yintercept = 1,lty = 'dashed',size = 1,color = 'grey50') +
theme_classic(base_size = 16) +
scale_color_lancet(name = '') +
scale_fill_lancet(name = '') +
scale_x_continuous(expand = c(0,0),limits = c(-500,0)) +
# coord_cartesian(expand = 0) +
theme(legend.position = c(0.9,0.9),
legend.background = element_blank(),
aspect.ratio = 0.6) +
ylab('Mean read density [AU]') +
xlab('Disatnce from start codon(codons/amino acids)') +
ylim(0,2)
psp
拼图:
# combine
library(patchwork)
pst / psp
可以看到和原文还是有较大差别的。
12结尾
这篇文章 rRNA 的比对率在十几二十几左右的,感觉做 Ribo-seq 还是提前去除 rRNA 是比较好的,不然大部分数据量都白测了。
欢迎加入生信交流群。加我微信我也拉你进 微信群聊 老俊俊生信交流群
哦,数据代码已上传至QQ群,欢迎加入下载。
群二维码:
老俊俊微信:
知识星球:
所以今天你学习了吗?
今天的分享就到这里了,敬请期待下一篇!
最后欢迎大家分享转发,您的点赞是对我的鼓励和肯定!
如果觉得对您帮助很大,赏杯快乐水喝喝吧!
往期回顾
◀...