本教程版权为《糗世界》所有,任何组织或个人不得未经书面许可转载。
在阅读本节前请自测,如果全部都知道正确答案,则可跳过本节。
- 如何连接几个字符串?
- 如何分割一个字符串?
- 如何使用正则搜索字符串?
- 如何比对两条序列?
字符串处理是生物信息学当中经常遇到的问题。常见的操作有:赋值,获取字符串长度,截取,替换,连接,分割,比较,排序,匹配,部分匹配,正则匹配,格式化,分行。对于生物字符串,基本操作有:互补,反向,反向互补,翻译,转录,逆转录,碱基频率统计,模板搜索,搜索回文结构以及间隔成对匹配,比对。下面笔者将逐一介绍在R中如何操作。
常规操作
> # 赋值
> mychar <- c("ACTACCACTAACCACT", "TCATCCATTCGTGGG", "GTTGTTCCATAG")
> # 获取字符串长度(nchar 返回每个字符串的字符数,length 返回向量的元素个数)
> nchar(mychar)
## [1] 16 15 12
> length(mychar)
## [1] 3
> # 截取
> substr(mychar, 2, 4)
## [1] "CTA" "CAT" "TTG"
> substr(mychar, 2, 4) <- "TTT"
> mychar
## [1] "ATTTCCACTAACCACT" "TTTTCCATTCGTGGG" "GTTTTTCCATAG"
> # 替换
> dna2rna <- function(inputStr) {
+ if (!is.character(inputStr))
+ stop("need character input")
+ is = toupper(inputStr)
+ chartr("T", "U", is)
+ }
> dna2rna(mychar)
## [1] "AUUUCCACUAACCACU" "UUUUCCAUUCGUGGG" "GUUUUUCCAUAG"
> # 连接
> paste("prefix", mychar, "postfix", sep = "-")
## [1] "prefix-ATTTCCACTAACCACT-postfix" "prefix-TTTTCCATTCGTGGG-postfix"
## [3] "prefix-GTTTTTCCATAG-postfix"
> paste(mychar, collapse = "--")
## [1] "ATTTCCACTAACCACT--TTTTCCATTCGTGGG--GTTTTTCCATAG"
> # 分割
> strsplit(mychar, "C")
## [[1]]
## [1] "ATTT" "" "A" "TAA" "" "A" "T"
##
## [[2]]
## [1] "TTTT" "" "ATT" "GTGGG"
##
## [[3]]
## [1] "GTTTTT" "" "ATAG"
> strsplit(mychar, "[CG]")
## [[1]]
## [1] "ATTT" "" "A" "TAA" "" "A" "T"
##
## [[2]]
## [1] "TTTT" "" "ATT" "" "T" "" ""
##
## [[3]]
## [1] "" "TTTTT" "" "ATA"
> strsplit(mychar, "")
## [[1]]
## [1] "A" "T" "T" "T" "C" "C" "A" "C" "T" "A" "A" "C" "C" "A" "C" "T"
##
## [[2]]
## [1] "T" "T" "T" "T" "C" "C" "A" "T" "T" "C" "G" "T" "G" "G" "G"
##
## [[3]]
## [1] "G" "T" "T" "T" "T" "T" "C" "C" "A" "T" "A" "G"
> # 比较
> mychar[1] > mychar[2]
## [1] FALSE
> # 排序
> sort(mychar)
## [1] "ATTTCCACTAACCACT" "GTTTTTCCATAG" "TTTTCCATTCGTGGG"
> # 匹配
> exT <- c("Intron", "Exon", "promoter", "enhancer")
> match("Exon", exT)
## [1] 2
> "promoter" %in% exT
## [1] TRUE
> # 部分匹配
> pmatch("E", exT)
## [1] 2
> pmatch("x", exT)
## [1] NA
> charmatch("E", exT)
## [1] 2
> charmatch("x", exT)
## [1] NA
> pmatch(c("Exo", "enh"), exT)
## [1] 2 4
> # 格式化
> format(1:10)
## [1] " 1" " 2" " 3" " 4" " 5" " 6" " 7" " 8" " 9" "10"
> format(1:10, trim = TRUE)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
> zz <- data.frame(`(row names)` = c("aaaaa", "b"), check.names = FALSE)
> format(zz)
## (row names)
## 1 aaaaa
## 2 b
> format(zz, justify = "left")
## (row names)
## 1 aaaaa
## 2 b
> ## use of nsmall
> format(13.7)
## [1] "13.7"
> format(13.7, nsmall = 3)
## [1] "13.700"
> format(c(6, 13.1), digits = 2)
## [1] " 6" "13"
> format(c(6, 13.1), digits = 2, nsmall = 1)
## [1] " 6.0" "13.1"
> ## use of scientific
> format(2^31 - 1)
## [1] "2.147e+09"
> format(2^31 - 1, scientific = TRUE)
## [1] "2.147e+09"
> ## a list
> z <- list(a = letters[1:3], b = (-pi + (0+0i))^((-2:2)/2), c = c(1, 10, 100,
+ 1000), d = c("a", "longer", "character", "string"))
> format(z, digits = 2)
## a
## "a, b, c"
## b
## "-0.32+0.00i, 0.00-0.56i, 1.00+0.00i, 0.00+1.77i, -3.14+0.00i"
## c
## "1, 10, 100, 1000"
## d
## "a , longer , character, string "
> format(z, digits = 2, justify = "left", trim = FALSE)
## a
## "a, b, c"
## b
## "-0.32+0.00i, 0.00-0.56i, 1.00+0.00i, 0.00+1.77i, -3.14+0.00i"
## c
## " 1, 10, 100, 1000"
## d
## "a , longer , character, string "
> # 还有两种C风格的格式化手段,formatC和sprintf。这里就不列举了。
> # 分行
> x <- paste(readLines(file.path(R.home(), "COPYING")), collapse = "\n")
> strwrap(x, 30, prefix = "BIOCONDUCTOR: ")[1:6]
## [1] "BIOCONDUCTOR: GNU GENERAL" "BIOCONDUCTOR: PUBLIC LICENSE"
## [3] "BIOCONDUCTOR: Version 2, June" "BIOCONDUCTOR: 1991"
## [5] "BIOCONDUCTOR: " "BIOCONDUCTOR: Copyright (C)"
> writeLines(strwrap(x, 30, prefix = "BIOCONDUCTOR: ")[1:6])
## BIOCONDUCTOR: GNU GENERAL
## BIOCONDUCTOR: PUBLIC LICENSE
## BIOCONDUCTOR: Version 2, June
## BIOCONDUCTOR: 1991
## BIOCONDUCTOR:
## BIOCONDUCTOR: Copyright (C)