生物信息学生R入门教程–字符串处理

生物字符串

对于生物字符串的处理,基本操作与前文所述一致。因为生物字符串有其特殊性,比如碱基只有ACGTN五种可能(不包含摆动(dna wobble))。生物字符串的常见操作比如求互补序列,反向序列,反向互补序列,翻译,转录,逆转录,碱基频率统计,序列比对等。这些操作使用字符串的基本操作也可以完成。Bioconductor为人们提供了更好的解决方案,这些常见操作已经打包提供给大家使用,极大地方便了使用者。提供这些常操作的软件包是Biostrings。它有一个基础类,BString,下面有三个继承类:DNAString, RNAString以及AAString。比对的结果会放在BStringAlign类当中。

> library(Biostrings)
> dna <- DNAString("TCTCCCAACCCTTGTACCAGTATAAATCGT")
> rna <- RNAString(dna)  ## DNA转RNA,只是将T换成U
##   30-letter "RNAString" instance
## seq: UCUCCCAACCCUUGUACCAGUAUAAAUCGU
> complement(dna)  ## 互补
##   30-letter "DNAString" instance
## seq: AGAGGGTTGGGAACATGGTCATATTTAGCA
> reverseComplement(dna)  ## 反向互补
##   30-letter "DNAString" instance
## seq: ACGATTTATACTGGTACAAGGGTTGGGAGA
> DNAString(rna)  ## RNA转DNA,只是将U换成T
##   30-letter "DNAString" instance
## seq: TCTCCCAACCCTTGTACCAGTATAAATCGT
> codons(rna)  ## 密码子
##   Views on a 30-letter RNAString subject
subject: UCUCCCAACCCUUGUACCAGUAUAAAUCGU
views:
     start end width
 [1]     1   3     3 [UCU]
 [2]     4   6     3 [CCC]
 [3]     7   9     3 [AAC]
 [4]    10  12     3 [CCU]
 [5]    13  15     3 [UGU]
 [6]    16  18     3 [ACC]
 [7]    19  21     3 [AGU]
 [8]    22  24     3 [AUA]
 [9]    25  27     3 [AAU]
[10]    28  30     3 [CGU]
> translate(rna)  ## 翻译
##   10-letter "AAString" instance
## seq: SPNPCTSINR
> reverse(dna)  ## 反向
##   30-letter "DNAString" instance
## seq: TGCTAAATATGACCATGTTCCCAACCCTCT
> alphabetFrequency(dna)
##  A  C  G  T  M  R  W  S  Y  K  V  H  D  B  N  -  +  . 
##  8 10  3  9  0  0  0  0  0  0  0  0  0  0  0  0  0  0
> hasOnlyBaseLetters(dna)
## [1] TRUE
> uniqueLetters(dna)
## [1] "A" "C" "G" "T"
> letterFrequency(dna, letters = "CG")
## C|G 
##  13
> ## 计算GC比
> GC_content <- letterFrequency(dna, letters = "CG")/letterFrequency(dna, letters = "ACGT")
> GC_content
##    C|G 
## 0.4333
> TATA <- "TATA"
> ## 搜索
> mT <- matchPattern(TATA, dna)
> mT
##   Views on a 30-letter DNAString subject
## subject: TCTCCCAACCCTTGTACCAGTATAAATCGT
## views:
##     start end width
## [1]    21  24     4 [TATA]
> mTT <- pmatchPattern(TATA, c(dna, reverse(dna)))
> mTT
##   Views on a 60-letter DNAString subject
## Subject: TCTCCCAACCCTTGTACCAGTATAAATCGTTGCTAAATATGACCATGTTCCCAACCCTCT
## views:
##     start end width
## [1]    21  24     4 [TATA]
##   Views on a 4-letter DNAString pattern
## Pattern: TATA
## views:
##     start end width
## [1]     1   4     4 [TATA]
> ## 比对
> aa1 <- AAString("HXBLVYMGCHFDCXVBEHIKQZ")
> aa2 <- AAString("QRNYMYCFQCISGNEYKQN")
> pairwiseAlignment(aa1, aa2, substitutionMatrix = "BLOSUM62", type = "global")
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: [1] HXBLVYMGCHFDCXVBEHIKQZ 
## subject: [1] QRN--YMYC-FQCISGNEYKQN 
## score: 9
> pairwiseAlignment(aa1, aa2, substitutionMatrix = "BLOSUM62", type = "local")
## Local PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: [6] YMGCHFDCXVBEHIKQ 
## subject: [4] YMYC-FQCISGNEYKQ 
## score: 24
> s1 <- DNAString("ACTTCACCAGCTCCCTGGCGGTAAGTTGATCAAAGGAAACGCAAAGTTTTCAAG")
> s2 <- DNAString("GTTTCACTACTTCCTTTCGGGTAAGTAAATATATAAATATATAAAAATATAATTTTCATC")
> mat <- nucleotideSubstitutionMatrix(match = 1, mismatch = -3, baseOnly = TRUE)
> pairwiseAlignment(s1, s2, substitutionMatrix = mat, gapOpening = -5, gapExtension = -2)
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: [1] ACTTCACCAGCTCCCTGGCGGTAAGTTGATC---AAAGG---AAACGCAAAGTTTTCAAG 
## subject: [1] GTTTCACTACTTCCTTTCGGGTAAGTAAATATATAAATATATAAAAATATAATTTTCATC 
## score: -52
> pairwiseAlignment(s1, s2, type = "local", substitutionMatrix = mat, gapOpening = -5, 
+     gapExtension = -2)
## Local PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: [20] GGTAAGT 
## subject: [20] GGTAAGT 
## score: 7
> pairwiseAlignment(s1, s2, type = "overlap", substitutionMatrix = mat, gapOpening = -5, 
+     gapExtension = -2)
## Overlap PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: [54] G 
## subject:  [1] G 
## score: 1

发表评论

电子邮件地址不会被公开。 必填项已用*标注