##### Base-frequency plot in R (method 1: Biostrings) #####
library(Biostrings)

# Load all reads of the FASTQ file as a DNAStringSet.
fastq <- readDNAStringSet("Sg2-30-filter-reverse-R2.fastq", format = "fastq")

## Base frequency at each read position.
# baseOnly = TRUE keeps A/C/G/T and pools every other letter into one "other"
# row; as.prob = TRUE returns proportions instead of raw counts.
afmc <- consensusMatrix(fastq, baseOnly = TRUE, as.prob = TRUE)

# Transpose so that rows are positions and columns are letters: matplot()
# draws one line per column.
tafmc <- t(afmc)

# Select the four bases by name rather than dropping column 5 by index, and
# pass the same col/lty vectors to both calls so the legend always matches
# the plotted lines.
bases <- c("A", "C", "G", "T")
line_style <- seq_along(bases)
matplot(
  tafmc[, bases],
  main = "Sg2-30-R2",
  type = "l",
  col = line_style,
  lty = line_style,
  lwd = 2,
  xlab = "Read Length",
  ylab = "Base frequency at each position",
  cex.lab = 1.1
)
legend("top", legend = bases, col = line_style, lty = line_style, lwd = 2)
#### Method 2: seqTools ####
library(seqTools)

# Read the gzipped FASTQ file with fastqq().
fq <- fastqq("Sg2-30_L2_P707505.R1.clean.fastq.gz")

# Plot the nucleotide frequencies. The second argument (1) is passed
# positionally; presumably it selects the first file in `fq` -- TODO confirm.
plotNucFreq(fq, 1)
####end######
## Trim the adapter from the 5' end (--front), using 15 cores and requiring
## an overlap of at least 3 bases; trimmed reads are written via stdout.
cutadapt --cores 15 --front ACGGCG --overlap 3 \
    Sg2-0_L2_P706504.R1.clean.fastq > Sg2-0-cutleft.fastq
## Trim the adapter from the 3' end (--adapter), using 15 cores and requiring
## an overlap of at least 3 bases; trimmed reads are written via stdout.
cutadapt --cores 15 --adapter AGATCGGAAGA --overlap 3 \
    Sg2-0-cutleft.fastq > Sg2-0-cut-R1.fastq
### Keep only reads whose sequence length is between 2 and 25 (inclusive) ###
# Each main-rule invocation consumes one 4-line FASTQ record: the current line
# is the header, and the next three lines are pulled in with getline.
awk '
  BEGIN { OFS = "\n" }
  {
    name = $0
    getline bases
    getline plus
    getline quals
    n = length(bases)
    if (n >= 2 && n <= 25) {
      print name, bases, plus, quals
    }
  }
' < your.fastq > filtered.fastq
## Reverse-complement every read (-r = reverse, -p = complement) ###
# The flags must be separate words: the original "…fastq-r" glued -r onto the
# input file name, so seqkit looked for a non-existent file and never reversed.
seqkit seq -r -p Sg2-30-filter-R2.fastq > Sg2-30-filter-reverse-R2.fastq
### Remove a fixed number of bases from each read: a positive --cut value
### removes from the start, a negative one from the end (here: the last 2) ####
cutadapt --cores 18 --cut=-2 \
    --output findmotif-sg2-30-R2.fastq \
    motif-sg2-30-R2.fastq
#### Convert a FASTQ file to FASTA #####
# -o must be its own word: the original "…fastq-o" glued it onto the input
# file name, so the intended output path was treated as a second input file.
seqkit fq2fa findmotif-sg2-5-R2.fastq -o findmotif-sg2-5-R2.fa
#### Intersection of two files (duplicates kept) #####
# Prints every line of file2.txt that contains any line of file1.txt as a
# fixed (non-regex) string. Note this is a substring match, not a whole-line
# comparison.
grep -F -f file1.txt file2.txt > overlap.txt

##### Remove duplicate lines ########
sort overlap.txt | uniq > result.txt
##### Count how many times a specific string occurs in a file ####
# grep -o prints each match on its own line, so wc -l counts matches
# (a line containing the string twice contributes 2).
grep -o TTCAGCCGCTACCCC findmotif-sg2-0-R1.fa | wc -l
##### Delete, in place, every line that contains the given string #####
sed -i '/TTCAGCCGCTACCCC/d' grep5-10-30.txt
##### Insert a new line "JJC" after every N-th line #####
# N is a placeholder: replace it with a number (0~4 = lines 4, 8, 12, ...).
# The first~step address form is a GNU sed extension.
sed '0~N s/$/\nJJC/' file.txt > result.txt
##### Append "JJC" to the end of every N-th line #####
# N is a placeholder: replace it with a number (0~4 = lines 4, 8, 12, ...).
# The first~step address form is a GNU sed extension.
sed '0~N s/$/JJC/' < inputfile > outputfile
##### Put a generated header line before every input line (typically used to
##### build a FASTA file). Headers are >ENLISH1000, >ENLISH1002, ... (+2 per line)
awk '
  BEGIN { OFS = "\n"; serial = 1000 }
  {
    print ">ENLISH" serial, $0
    serial += 2
  }
' chr.fa > chr.all.fa
##### Find motifs with MEME ######
# DNA alphabet, results in directory 5-10-30, 2 motifs, exactly one occurrence
# per sequence (oops), minimum motif width 4.
# -dna must be its own word: the original "5-10-30.fa-dna" glued it onto the
# input file name, so MEME was pointed at a non-existent file.
meme 5-10-30.fa -dna -o 5-10-30 -nmotifs 2 -mod oops -minw 4
#### Delete the last two characters of every line ###
# No file operand is given, so sed reads standard input and writes to standard
# output. Each "." matches one character; lines shorter than two characters
# are left untouched.
sed 's/..$//'
#### Delete the first two characters of every line (each "." is one character)
# -i edits a file in place, so a file operand is required; the original
# omitted it. file.txt is a placeholder: replace it with the real file name.
# The "g" flag was dropped because a pattern anchored with ^ can match only
# once per line.
sed -i 's/^..//' file.txt
## Reader comments (web-page footer left over from copying; not a command)