生物字符串
对于生物字符串的处理,基本操作与前文所述一致。因为生物字符串有其特殊性,比如碱基只有ACGTN五种可能(不包含摆动(dna wobble))。生物字符串的常见操作比如求互补序列,反向序列,反向互补序列,翻译,转录,逆转录,碱基频率统计,序列比对等。这些操作使用字符串的基本操作也可以完成。Bioconductor为人们提供了更好的解决方案,这些常见操作已经打包提供给大家使用,极大地方便了使用者。提供这些常操作的软件包是Biostrings。它有一个基础类,BString,下面有三个继承类:DNAString, RNAString以及AAString。比对的结果会放在BStringAlign类当中。
> library(Biostrings)
> dna <- DNAString("TCTCCCAACCCTTGTACCAGTATAAATCGT")
> rna <- RNAString(dna) ## DNA转RNA,只是将T换成U
## 30-letter "RNAString" instance
## seq: UCUCCCAACCCUUGUACCAGUAUAAAUCGU
> complement(dna) ## 互补
## 30-letter "DNAString" instance
## seq: AGAGGGTTGGGAACATGGTCATATTTAGCA
> reverseComplement(dna) ## 反向互补
## 30-letter "DNAString" instance
## seq: ACGATTTATACTGGTACAAGGGTTGGGAGA
> DNAString(rna) ## RNA转DNA,只是将U换成T
## 30-letter "DNAString" instance
## seq: TCTCCCAACCCTTGTACCAGTATAAATCGT
> codons(rna) ## 密码子
## Views on a 30-letter RNAString subject
subject: UCUCCCAACCCUUGUACCAGUAUAAAUCGU
views:
start end width
[1] 1 3 3 [UCU]
[2] 4 6 3 [CCC]
[3] 7 9 3 [AAC]
[4] 10 12 3 [CCU]
[5] 13 15 3 [UGU]
[6] 16 18 3 [ACC]
[7] 19 21 3 [AGU]
[8] 22 24 3 [AUA]
[9] 25 27 3 [AAU]
[10] 28 30 3 [CGU]
> translate(rna) ## 翻译
## 10-letter "AAString" instance
## seq: SPNPCTSINR
> reverse(dna) ## 反向
## 30-letter "DNAString" instance
## seq: TGCTAAATATGACCATGTTCCCAACCCTCT
> alphabetFrequency(dna)
## A C G T M R W S Y K V H D B N - + .
## 8 10 3 9 0 0 0 0 0 0 0 0 0 0 0 0 0 0
> hasOnlyBaseLetters(dna)
## [1] TRUE
> uniqueLetters(dna)
## [1] "A" "C" "G" "T"
> letterFrequency(dna, letters = "CG")
## C|G
## 13
> ## 计算GC比
> GC_content <- letterFrequency(dna, letters = "CG")/letterFrequency(dna, letters = "ACGT")
> GC_content
## C|G
## 0.4333
> TATA <- "TATA"
> ## 搜索
> mT <- matchPattern(TATA, dna)
> mT
## Views on a 30-letter DNAString subject
## subject: TCTCCCAACCCTTGTACCAGTATAAATCGT
## views:
## start end width
## [1] 21 24 4 [TATA]
> mTT <- pmatchPattern(TATA, c(dna, reverse(dna)))
> mTT
## Views on a 60-letter DNAString subject
## Subject: TCTCCCAACCCTTGTACCAGTATAAATCGTTGCTAAATATGACCATGTTCCCAACCCTCT
## views:
## start end width
## [1] 21 24 4 [TATA]
## Views on a 4-letter DNAString pattern
## Pattern: TATA
## views:
## start end width
## [1] 1 4 4 [TATA]
> ## 比对
> aa1 <- AAString("HXBLVYMGCHFDCXVBEHIKQZ")
> aa2 <- AAString("QRNYMYCFQCISGNEYKQN")
> pairwiseAlignment(aa1, aa2, substitutionMatrix = "BLOSUM62", type = "global")
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: [1] HXBLVYMGCHFDCXVBEHIKQZ
## subject: [1] QRN--YMYC-FQCISGNEYKQN
## score: 9
> pairwiseAlignment(aa1, aa2, substitutionMatrix = "BLOSUM62", type = "local")
## Local PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: [6] YMGCHFDCXVBEHIKQ
## subject: [4] YMYC-FQCISGNEYKQ
## score: 24
> s1 <- DNAString("ACTTCACCAGCTCCCTGGCGGTAAGTTGATCAAAGGAAACGCAAAGTTTTCAAG")
> s2 <- DNAString("GTTTCACTACTTCCTTTCGGGTAAGTAAATATATAAATATATAAAAATATAATTTTCATC")
> mat <- nucleotideSubstitutionMatrix(match = 1, mismatch = -3, baseOnly = TRUE)
> pairwiseAlignment(s1, s2, substitutionMatrix = mat, gapOpening = -5, gapExtension = -2)
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: [1] ACTTCACCAGCTCCCTGGCGGTAAGTTGATC---AAAGG---AAACGCAAAGTTTTCAAG
## subject: [1] GTTTCACTACTTCCTTTCGGGTAAGTAAATATATAAATATATAAAAATATAATTTTCATC
## score: -52
> pairwiseAlignment(s1, s2, type = "local", substitutionMatrix = mat, gapOpening = -5,
+ gapExtension = -2)
## Local PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: [20] GGTAAGT
## subject: [20] GGTAAGT
## score: 7
> pairwiseAlignment(s1, s2, type = "overlap", substitutionMatrix = mat, gapOpening = -5,
+ gapExtension = -2)
## Overlap PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: [54] G
## subject: [1] G
## score: 1