下载安装
# Install the aspera-cli package from the hcc conda channel
conda install -c hcc aspera-cli
# Print the path of the ascp executable to confirm it is on PATH
which ascp
下载链接获取
-
已知文章中所用数据的BioProject number: PRJNAxxxxx
-
搜索项目文件信息
-
展开项目文件
-
选择链接类型
-
下载链接文件
解析链接文件
#' Extract per-file download links from an ENA file report.
#'
#' Reads the TSV report, splits the semicolon-separated `fastq_aspera` field
#' of every run into individual links and writes them, one per line, to
#' `<file_path><name>_links.txt`, where `<name>` is `ena_tsv` with the
#' `filereport_read_run_` and `_tsv.txt` parts removed.
#'
#' @param file_path Directory containing the report. It is pasted directly in
#'   front of the file name, so it must end with a path separator.
#' @param ena_tsv File name of the report, e.g.
#'   `filereport_read_run_PRJNAxxxxx_tsv.txt`.
#' @return `NULL`, invisibly; called for the side effect of writing the file.
setlinks <- function(file_path, ena_tsv) {
  # Read everything as text so link fields are never type-converted.
  raw_data <- read.delim(paste0(file_path, ena_tsv), header = TRUE,
                         colClasses = "character")
  # To build FTP links instead, export `fastq_ftp` and read that column here
  # (prefix the links with "ftp://" if the downloader needs a scheme).
  if (!"fastq_aspera" %in% names(raw_data)) {
    stop("Column 'fastq_aspera' not found in ", ena_tsv, call. = FALSE)
  }
  links <- unlist(strsplit(raw_data$fastq_aspera, ";", fixed = TRUE),
                  use.names = FALSE)
  # Runs without any file would otherwise produce blank or "NA" lines.
  links <- links[!is.na(links) & nzchar(links)]

  # Match the file-name parts literally; as a regex "." matches anything.
  temp <- gsub("_tsv.txt", "", ena_tsv, fixed = TRUE)
  file_name <- gsub("filereport_read_run_", "", temp, fixed = TRUE)
  writeLines(links, paste0(file_path, file_name, "_links.txt"))
}
setlinks(file_path, ena_tsv)
下载
# Locate the ascp executable and the private key used for authentication
which ascp
find . -name asperaweb_id_dsa.openssh
# Download: replace path_to/ with the key location found above and the remote
# path with an entry from the links file; the last argument is the local
# destination directory. The trailing backslash keeps this a single command.
ascp -i path_to/asperaweb_id_dsa.openssh --overwrite=diff -P33001 \
  -T -l 30m era-fasp@fasp.sra.ebi.ac.uk:/vol1/fastq/-XXXX ./
针对ftp链接
# Strip trailing carriage returns (Windows line endings) from the link files
# written by setlinks(), then start one background wget per link.
sed 's/\r$//' *_links.txt | while IFS= read -r url; do
  # Skip blank lines so wget is never started without a URL
  [ -n "$url" ] || continue
  wget -b "$url"
done
# Count the log lines reporting 100% progress (finished downloads)
cat wget-log* | grep '100%' | wc -l
# Extract the links wget gave up on; the trailing backslash continues the
# pipeline onto the next line
cat wget-log* | grep "Giving up" -B3 | grep "SRR" \
  | sed "s/ (try:20) => ‘//" | sed "s/’//"
md5 检查
md5检查可以校验文件完整性;在“下载链接获取”的 “5. 选择链接类型”中有fastq_md5,同样下载TSV格式并构建md5检查文件。
#' Build an `md5sum -c` checksum file from an ENA file report.
#'
#' Reads the TSV report, splits the semicolon-separated `fastq_md5` field of
#' every run and pairs each checksum with its FASTQ file name. The result is
#' written to `<file_path><name>_md5.txt`, where `<name>` is `md5_tsv` with
#' the `filereport_read_run_` and `_tsv.txt` parts removed.
#'
#' File names are taken from the `fastq_ftp` / `fastq_aspera` column when the
#' report has one with a matching number of entries. Otherwise a run with one
#' checksum is named `<run>.fastq.gz` and a run with two checksums
#' `<run>_1.fastq.gz` / `<run>_2.fastq.gz`.
#'
#' @param file_path Directory containing the report. It is pasted directly in
#'   front of the file name, so it must end with a path separator.
#' @param md5_tsv File name of the report, e.g.
#'   `filereport_read_run_PRJNAxxxxx_tsv.txt`.
#' @return `NULL`, invisibly; called for the side effect of writing the file.
setmd5 <- function(file_path, md5_tsv) {
  # Read everything as text: an all-digit checksum must not become a number.
  raw_data <- read.delim(paste0(file_path, md5_tsv), header = TRUE,
                         colClasses = "character")
  missing_cols <- setdiff(c("run_accession", "fastq_md5"), names(raw_data))
  if (length(missing_cols) > 0) {
    stop("Column(s) not found in ", md5_tsv, ": ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
  }

  # Split a ";"-separated column into one character vector per run,
  # dropping empty and missing entries.
  split_field <- function(x) {
    parts <- strsplit(x, ";", fixed = TRUE)
    lapply(parts, function(p) p[!is.na(p) & nzchar(p)])
  }

  md5_by_run <- split_field(raw_data$fastq_md5)
  link_col <- intersect(c("fastq_ftp", "fastq_aspera"), names(raw_data))
  links_by_run <- if (length(link_col) > 0) {
    split_field(raw_data[[link_col[1]]])
  } else {
    vector("list", nrow(raw_data))
  }

  # Name the files of one run; the number of checksums decides the layout
  # instead of assuming every run is paired-end.
  name_files <- function(run, n_files, links) {
    if (n_files == 0L) {
      return(character(0))
    }
    if (length(links) == n_files) {
      return(basename(links))
    }
    if (n_files == 1L) {
      return(paste0(run, ".fastq.gz"))
    }
    if (n_files == 2L) {
      return(paste0(run, c("_1.fastq.gz", "_2.fastq.gz")))
    }
    stop("Run ", run, " has ", n_files, " checksums; add the fastq_ftp or ",
         "fastq_aspera column to the report so file names can be resolved",
         call. = FALSE)
  }
  files_by_run <- Map(name_files, raw_data$run_accession,
                      lengths(md5_by_run), links_by_run)

  # set file name
  temp <- gsub("_tsv.txt", "", md5_tsv, fixed = TRUE)
  file_name <- gsub("filereport_read_run_", "", temp, fixed = TRUE)

  # "<md5>  <file>" with two spaces is the format md5sum itself prints.
  md5_lines <- paste(unlist(md5_by_run, use.names = FALSE),
                     unlist(files_by_run, use.names = FALSE),
                     sep = "  ")
  writeLines(md5_lines, paste0(file_path, file_name, "_md5.txt"))
}
setmd5(file_path, md5_tsv)
md5sum -c _md5.txt
网友评论