首先最重要的参考链接:
(原文此处为图片:image.png)如有侵犯版权,请私信我,我看到后会立即删除!
主要用来记录自己可能要用的一些知识点。(基本复制粘贴,建议直达链接)
欧剑虹老师BOOK学习记录:第一章 R/Bioconductor入门(1)
生物字符串
Biological strings
- 生物字符串的常见操作包括:求互补序列、反向序列、反向互补序列,以及翻译、转录、逆转录、碱基频率统计、序列比对等。
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("Biostrings")
library(Biostrings) ## 加载包
dna<-DNAString("TCTCCCAACCCTTGTACCAGTATAAATCGT")
# DNA反向序列
> reverse(dna)
30-letter "DNAString" instance
seq: TGCTAAATATGACCATGTTCCCAACCCTCT
# DNA转换成RNA
> RNAString(dna)
30-letter "RNAString" instance
seq: UCUCCCAACCCUUGUACCAGUAUAAAUCGU
# DNA的互补序列
> complement(dna)
30-letter "DNAString" instance
seq: AGAGGGTTGGGAACATGGTCATATTTAGCA
# DNA反向互补序列
> reverseComplement(dna)
30-letter "DNAString" instance
seq: ACGATTTATACTGGTACAAGGGTTGGGAGA
# DNA转录
> rna<-RNAString(complement(dna)) ## 转录,注意它与dna2rna的不同
> rna
30-letter "RNAString" instance
seq: AGAGGGUUGGGAACAUGGUCAUAUUUAGCA
# TCTCCCAACCCTTGTACCAGTATAAATCGT (为了方便对比,把DNA序列放在这里)
# UCUCCCAACCCUUGUACCAGUAUAAAUCGU (dna2rna)
# RNA转换成DNA
> DNAString(rna)
30-letter "DNAString" instance
seq: AGAGGGTTGGGAACATGGTCATATTTAGCA
# 逆转录为cDNA
> DNAString(complement(rna))
30-letter "DNAString" instance
seq: TCTCCCAACCCTTGTACCAGTATAAATCGT
# 将RNA转换成密码子
> codons(rna)
Views on a 30-letter RNAString subject
subject: AGAGGGUUGGGAACAUGGUCAUAUUUAGCA
views:
start end width
[1] 1 3 3 [AGA]
[2] 4 6 3 [GGG]
[3] 7 9 3 [UUG]
[4] 10 12 3 [GGA]
[5] 13 15 3 [ACA]
[6] 16 18 3 [UGG]
[7] 19 21 3 [UCA]
[8] 22 24 3 [UAU]
[9] 25 27 3 [UUA]
[10] 28 30 3 [GCA]
# 将密码子翻译成氨基酸
> translate(rna)
10-letter "AAString" instance
seq: RGLGTWSYLA
# 碱基频数的统计
> alphabetFrequency(dna)
A C G T M R W S Y K V H D B N - + .
8 10 3 9 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 查看是否只含有四种碱基
> hasOnlyBaseLetters(dna)
[1] TRUE
# 唯一的碱基
> uniqueLetters(dna)
[1] "A" "C" "G" "T"
# 自定义统计指定碱基的频数,如常见的 G、C 碱基总数(letters="CG" 表示把 C 和 G 合并计数,输出列名为 C|G)
> letterFrequency(dna, letters="CG")
C|G
13
# 计算GC频率
> GC_content <- letterFrequency(dna, letters="CG")/letterFrequency(dna, letters="ACGT"); GC_content
C|G
0.4333333
# 搜索自定义的碱基序列
> TATA <- "TATA"
> (mT <- matchPattern(TATA, dna))
Views on a 30-letter DNAString subject
subject: TCTCCCAACCCTTGTACCAGTATAAATCGT
views:
start end width
[1] 21 24 4 [TATA]
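# 进行序列比对(注意:自 Bioconductor 3.19 / Biostrings 2.72 起,pairwiseAlignment() 与 nucleotideSubstitutionMatrix() 已移至 pwalign 包,新版本需先 library(pwalign))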
## 蛋白质序列的比对
> aa1<-AAString("HXBLVYMGCHFDCXVBEHIKQZ")
> aa2<-AAString("QRNYMYCFQCISGNEYKQN")
# 全局比对
> pairwiseAlignment(aa1, aa2, substitutionMatrix="BLOSUM62", type="global")
Global PairwiseAlignmentsSingleSubject (1 of 1)
pattern: HXBLVYMGCHFDCXVBEHIKQZ
subject: QRN--YMYC-FQCISGNEYKQN
score: 9
# 局部比对
> pairwiseAlignment(aa1, aa2, substitutionMatrix="BLOSUM62", type="local")
Local PairwiseAlignmentsSingleSubject (1 of 1)
pattern: [6] YMGCHFDCXVBEHIKQ
subject: [4] YMYC-FQCISGNEYKQ
score: 24
## DNA序列的比对
> s1 <-
+ DNAString("ACTTCACCAGCTCCCTGGCGGTAAGTTGATCAAAGGAAACGCAAAGTTTTCAAG")
> s2 <-
+ DNAString("GTTTCACTACTTCCTTTCGGGTAAGTAAATATATAAATATATAAAAATATAATTTTCATC")
## 指定替换矩阵:匹配得分为 1,错配罚分为 -3
> mat <- nucleotideSubstitutionMatrix(match = 1, mismatch = -3, baseOnly = TRUE)
> mat
A C G T
A 1 -3 -3 -3
C -3 1 -3 -3
G -3 -3 1 -3
T -3 -3 -3 1
> pairwiseAlignment(s1, s2, substitutionMatrix = mat,
+ gapOpening = -5, gapExtension = -2)
Global PairwiseAlignmentsSingleSubject (1 of 1)
pattern: ACTTCACCAGCTCCCTGGCGGTAAGTTGATC---AAAGG---AAACGCAAAGTTTTCAAG
subject: GTTTCACTACTTCCTTTCGGGTAAGTAAATATATAAATATATAAAAATATAATTTTCATC
score: -52
> pairwiseAlignment(s1, s2, type = "local", substitutionMatrix = mat,
+ gapOpening = -5, gapExtension = -2)
Local PairwiseAlignmentsSingleSubject (1 of 1)
pattern: [20] GGTAAGT
subject: [20] GGTAAGT
score: 7
> pairwiseAlignment(s1, s2, type = "overlap", substitutionMatrix = mat,
+ gapOpening = -5, gapExtension = -2)
Overlap PairwiseAlignmentsSingleSubject (1 of 1)
pattern: [54] G
subject: [1] G
score: 1
网友评论