示例数据:GSE178911→GSM5400792→ SRR14915863~SRR14915866
- https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE178911
- wget 下载到linux服务器
https://sra-download.ncbi.nlm.nih.gov/traces/sra7/SRZ/014915/SRR14915863/201008_D0_scRNA_fastq_S1_L001_R1_001.fastq.gz
https://sra-download.ncbi.nlm.nih.gov/traces/sra7/SRZ/014915/SRR14915863/201008_D0_scRNA_fastq_S1_L001_R2_001.fastq.gz
https://sra-download.ncbi.nlm.nih.gov/traces/sra30/SRZ/014915/SRR14915864/201008_D0_scRNA_fastq_S1_L002_R1_001.fastq.gz
https://sra-download.ncbi.nlm.nih.gov/traces/sra30/SRZ/014915/SRR14915864/201008_D0_scRNA_fastq_S1_L002_R2_001.fastq.gz
https://sra-download.ncbi.nlm.nih.gov/traces/sra51/SRZ/014915/SRR14915865/201008_D0_scRNA_fastq_S1_L003_R1_001.fastq.gz
https://sra-download.ncbi.nlm.nih.gov/traces/sra51/SRZ/014915/SRR14915865/201008_D0_scRNA_fastq_S1_L003_R2_001.fastq.gz
https://sra-download.ncbi.nlm.nih.gov/traces/sra67/SRZ/014915/SRR14915866/201008_D0_scRNA_fastq_S1_L004_R1_001.fastq.gz
https://sra-download.ncbi.nlm.nih.gov/traces/sra67/SRZ/014915/SRR14915866/201008_D0_scRNA_fastq_S1_L004_R2_001.fastq.gz
1、cellranger比对(linux)
- 首先下载安装cellranger
- https://support.10xgenomics.com/single-cell-gene-expression/software/downloads/latest
- 解压即可用~
-
其次下载该软件团队提供的配套的参考基因组文件,有人/鼠的,按需下载、解压、备用。
-
开始比对~~~
bin=~/biosoft/cellranger-6.0.2/bin/cellranger
db=./refdata-gex-GRCh38-2020-A
fq_dir=./fastq/GSM5400792
$bin count --id=201008_D0_scRNA_fastq \
--localcores=10 \
--transcriptome=$db \
--fastqs=$fq_dir \
--sample=201008_D0_scRNA_fastq \
--expect-cells=3000 \
--nosecondary
#比对时间视使用线程数而定
- 查看比对结果
out
文件夹
2、velocyto生成loom文件(linux)
- https://mp.weixin.qq.com/s/TJsdj7qesZGlvmEkNLAy0A
- 使用conda 构建一个velocyto软件分析环境
conda create -n velocyto
conda activate velocyto
conda install samtools
conda install numpy scipy cython numba matplotlib scikit-learn h5py click
pip install pysam
# `PyPI`安装
pip install velocyto
- 下载基因组gtf注释文件
需要两个,一个是标准的基因组gtf注释文件,在上一步下载的refdata-gex-GRCh38-2020-A
文件夹里已有。
另一个是对repeat region注释的gtf文件
http://velocyto.org/velocyto.py/tutorial/cli.html
You might want to mask expressed repetitive elements, since those count could constitute a confounding factor in the downstream analysis. To do so you would need to download an appropriate expressed repeat annotation (for example from UCSC genome browser and make sure to select GTF as output format).
https://genome.ucsc.edu/cgi-bin/hgTables?hgsid=611454127_NtvlaW6xBSIRYJEBI0iRDEWisITa&clade=mammal&org=Human&db=0&hgta_group=allTracks&hgta_track=rmsk&hgta_table=rmsk&hgta_regionType=genome&position=&hgta_outputType=gff&hgta_outFileName=mm10_rmsk.gtf
- 开始分析
cellranger_outDir=201008_D0_scRNA_fastq
cellranger_gtf=refdata-gex-GRCh38-2020-A/genes/genes.gtf
rmsk_gtf=rmsk.gtf
ls -lh $rmsk_gtf $cellranger_outDir $cellranger_gtf
velocyto run10x -m $rmsk_gtf $cellranger_outDir $cellranger_gtf
#比较耗时
- 结果储存在cellranger_outDir文件夹里的
velocyto/
子文件夹里
3、Seurat标准流程分析(Rstudio)
- 使用Seurat包对
filtered_feature_bc_matrix
文件夹的三个文件(feature/barcode/matrix)进行标准流程分析 - 其中必须包括tSNE/uMAP降维,以及cluster/celltype分群注释信息。
- 可参看以前笔记,就不多做介绍了~
4 、velocyto.R速率分析、可视化(linux环境的R)
-
首先需要安装velocyto.R包,比较有难度,好像不可以安装在windows环境。可参考
https://blog.csdn.net/weixin_39568781/article/details/111249461 -
然后把之前得到loom文件以及seurat对象都准备到当前工作目录下
library(Seurat)
library(velocyto.R)
library(tidyverse)
# sce_all = readRDS("sce.rds")
# sce_all = SplitObject(sce_all, split.by = "orig.ident")
# sce_1 = sce_all[[1]]
sce_1 = readRDS("sce.rds")
sce_1@assays$RNA@counts[1:4,1:4]
velo_1 <- read.loom.matrices("201008_D0_scRNA_fastq.loom")
velo_1$spliced[1:4,1:4]
#以Seurat对象为标准,统一loom文件里的细胞名和数量
colnames(velo_1$spliced)<-gsub("x","-1",colnames(velo_1$spliced))
colnames(velo_1$spliced)<-gsub("201008_D0_scRNA_fastq:","1_",colnames(velo_1$spliced))
colnames(velo_1$unspliced)<-colnames(velo_1$spliced)
colnames(velo_1$ambiguous)<-colnames(velo_1$spliced)
velo_1$spliced<-velo_1$spliced[,rownames(sce_1@meta.data)]
velo_1$unspliced<-velo_1$unspliced[,rownames(sce_1@meta.data)]
velo_1$ambiguous<-velo_1$ambiguous[,rownames(sce_1@meta.data)]
#速率分析
sp <- velo_1$spliced
unsp <- velo_1$unspliced
sce_1_umap <- sce_1@reductions$umap@cell.embeddings
ident.colors <- (scales::hue_pal())(n = length(x = levels(x = sce_1)))
names(x = ident.colors) <- levels(x = sce_1)
cell.colors <- ident.colors[Idents(object = sce_1)]
names(x = cell.colors) <- colnames(x = sce_1)
fit.quantile = 0.02
cell.dist <- as.dist(1-armaCor(t(sce_1@reductions$pca@cell.embeddings)))
rvel.cd <- gene.relative.velocity.estimates(sp,unsp,deltaT=2, kCells=10,
cell.dist=cell.dist,fit.quantile=fit.quantile,n.cores=24)
#可视化
Idents(sce_1) = "fine"
gg <- UMAPPlot(sce_1)
colors <- as.list(ggplot_build(gg)$data[[1]]$colour)
names(colors) <- rownames(sce_1_umap)
show.velocity.on.embedding.cor(sce_1_umap,rvel.cd,n=30,scale='sqrt',cell.colors=ac(colors,alpha=0.5),cex=0.8,arrow.scale=2,show.grid.flow=T,min.grid.cell.mass=1.0,grid.n=50,arrow.lwd=1,do.par=F,cell.border.alpha =0.1,USE_OPENMP=1,n.cores=24,main="Cell Velocity")
#比较耗时~~
在加载Seurat对象时,可以仅提取感兴趣的亚群,进行分析~
- 关于RNA速率分析的背景、结果解读之后再学习一下~。但从上面的流程来收看,还是比较复杂的。
网友评论