今天遇到一个小问题，在给定的一些序列中，搜索CRISPR Toolbox中GeCKO library中已有的guideRNA。这正好符合PDict的要求，PDict要求所有的片段长度必须一致，而guide sequence的长度正好都是20。

代码如下：

> library(Biostrings) > seq <- dir("tobeSearched", pattern=".seq$") > seq [1] "1-F_A01_015_2015-03-06.seq" "1-R_C02_012_2015-03-06.seq" "10-F_B02_014_2015-03-06.seq" [4] "10-R_D03_025_2015-03-06.seq" "2-F_B01_013_2015-03-06.seq" "2-R_D02_010_2015-03-06.seq" [7] "3-F_C01_011_2015-03-06.seq" "3-R_E02_008_2015-03-06.seq" "4-F_D01_009_2015-03-06.seq" [10] "4-R_F02_006_2015-03-06.seq" "5-F_E01_007_2015-03-06.seq" "5-R_G02_004_2015-03-06.seq" [13] "6-F_F01_005_2015-03-06.seq" "6-R_H02_002_2015-03-06.seq" "7-F_G01_003_2015-03-06.seq" [16] "7-R_A03_031_2015-03-06.seq" "8-F_H01_001_2015-03-06.seq" "8-R_B03_029_2015-03-06.seq" [19] "9-F_A02_016_2015-03-06.seq" "9-R_C03_027_2015-03-06.seq" > sequences <- sapply(seq, function(.ele){ readLines(file.path("tobeSearched", .ele))}) > sequences[1:2] $`1-F_A01_015_2015-03-06.seq` [1] "NNNNNNNNNNNNNNNNNNNNNNNNTNNNNNNNGNNNNNNNNNNNNNNNNNNNNNNNAATATCCCCCNGACGGNNNGNGNN" [2] "NNNNNNNNNNNNTTCAGGCTGNNNNNNNCACNNNNNNCGAGNCATCANNNNNAGGGNGNGNTTGNNNCTCANNNNNNCAG" [3] "GTNCTGNNNGNNGCGTTNAGTGNCATCNCCGGTNGGNNNNNNNNNGGNNAAACCNCNCGNANNCCCNNCNNGAAGCNNNN" [4] "ACNTTNNTTNNNCCCNANNNGCCCNAAATGTTTNANCAANANNTNNNNNNNNNNNGNNNNCTTNTNNTNNNNNNNNNNNN" [5] "NNNNNNGNNNNNCGGNNGGNACNAANCNNCCNNNNNNNNANCANANGCNGGGGNNAAATTTNNCCCCNNCANNNNNNNNA" [6] "ANGGGNNCCCCNGNAANNTTTTTNNNAANNNCTTTCCCCCCCCNNAGGCCTTTCCCCCNNNAANNCCNCCCCCNNNNNNN" [7] "NNNNNGGNAAAAAAAAAAAAAAANNNNNNNAAAAANNNNGNNNNNNNNNNNAAAAAGNATNNNTNNTGNNNNNNNNNNTG" [8] "NNNNNNNNNANNNNAANNNNNNNNNTTNNGAANNANNGNNNGNNNNNNNNGTNNTANNNNNGNGNNTATTNNGNTTANNG" [9] "GNNNNGNGNNTNNNNNNNNGNGNNNNNNNNCNGTNNNTNNNNNNNTNNNNNNNNNNGNNGNNGTNGNNGTNNNNNNNGNN" [10] "NNGGNNNNNNGNNGGGNGNNNGGNNNNNNNNNGNCNNNTNNNNNNGNGTNGNNGNGNNNGNNNNNTANTNTNTNNTAANA" [11] "ANGNGNNNNGTGNTNTTTNNNNNNNTNNNNNNNNNNNNNNNNNNNNNNGNNNTTNNNNNNNNNNNNNNNNNN" $`1-R_C02_012_2015-03-06.seq` [1] "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCGGCNNAAGNCATTATGCTCGGNNAAAGNACTTCCTGGCGCGTGNTTAC" [2] "CAGGGCGCACCGATTGCCATCACCGTTGAAGGGGCTAACATTCTGACCCGCNGCATGATGNTCTTCGGACAAGGNGCGAT" [3] "TCGTTGCCATCCGTACGTGCTGGANGAGANGGAAGCGGCGAANAANAATGANGTCAACGCGTTCGATAAACTGTTGTTCA" [4] "AACATATCGGTCACGNCGGTAGCAACAAAGNTCGCAGCTTCTGGCTGGNCNNGANGCGCGGNTTAACCAGCAGCACGNCA" [5] "ACCGGCGATGCCACTAAACGCTACTATCAGCACCTGANCNGCNGAGCGCCNNCCNCGCNNTGNTNNTGANGNNTCGNNNN" [6] "NGNNNNGGNGNNNNNNAANNNCNNNNNNNNNNGNNNNNTTNTTTNNNNNNANNCNNCCCNCCNNNNNNNCNGNNNNNNNN" [7] "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGNNNNNNN" > sequences <- lapply(sequences, paste, collapse="") > sequences <- unlist(sequences) > sequences[1] 1-F_A01_015_2015-03-06.seq "NNNNNNNNNNNNNNNNNNNNNNNNTNNNNNNNGNNNNNNNNNNNNNNNNNNNNNNNAATATCCCCCNGACGGNNNGNGNNNNNNNNNNNNNNTTCAGGCTGNNNNNNNCACNNNNNNCGAGNCATCANNNNNAGGGNGNGNTTGNNNCTCANNNNNNCAGGTNCTGNNNGNNGCGTTNAGTGNCATCNCCGGTNGGNNNNNNNNNGGNNAAACCNCNCGNANNCCCNNCNNGAAGCNNNNACNTTNNTTNNNCCCNANNNGCCCNAAATGTTTNANCAANANNTNNNNNNNNNNNGNNNNCTTNTNNTNNNNNNNNNNNNNNNNNNGNNNNNCGGNNGGNACNAANCNNCCNNNNNNNNANCANANGCNGGGGNNAAATTTNNCCCCNNCANNNNNNNNAANGGGNNCCCCNGNAANNTTTTTNNNAANNNCTTTCCCCCCCCNNAGGCCTTTCCCCCNNNAANNCCNCCCCCNNNNNNNNNNNNGGNAAAAAAAAAAAAAAANNNNNNNAAAAANNNNGNNNNNNNNNNNAAAAAGNATNNNTNNTGNNNNNNNNNNTGNNNNNNNNNANNNNAANNNNNNNNNTTNNGAANNANNGNNNGNNNNNNNNGTNNTANNNNNGNGNNTATTNNGNTTANNGGNNNNGNGNNTNNNNNNNNGNGNNNNNNNNCNGTNNNTNNNNNNNTNNNNNNNNNNGNNGNNGTNGNNGTNNNNNNNGNNNNGGNNNNNNGNNGGGNGNNNGGNNNNNNNNNGNCNNNTNNNNNNGNGTNGNNGNGNNNGNNNNNTANTNTNTNNTAANAANGNGNNNNGTGNTNTTTNNNNNNNTNNNNNNNNNNNNNNNNNNNNNNGNNNTTNNNNNNNNNNNNNNNNNN" > sequences <- DNAStringSet(sequences) > head(sequences) A DNAStringSet instance of length 6 width seq names [1] 872 NNNNNNNNNNNNNNNNNNNNNNNNTNNNNNNNG...NNNNNNNNGNNNTTNNNNNNNNNNNNNNNNNN 1-F_A01_015_2015-... [2] 528 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCG...NNNNNNNNNNNNNNNNNNNNNNNNGNNNNNNN 1-R_C02_012_2015-... [3] 543 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNG...NNNNNNNNNNNNTNNNNNNNNNNNNNNNNNNN 10-F_B02_014_2015... [4] 525 GNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNC...NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 10-R_D03_025_2015... [5] 697 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNNGNNNNNNNNNCTTTTTTTTTTNNNNNNAN 2-F_B01_013_2015-... [6] 527 NNNNNNNNNNNNNNNNNNNNNNNNGNNNNNNNC...NNNNNNNNNNNNNNNNNNNNNNNNNNAANNNN 2-R_D02_010_2015-... > library1 <- read.csv("mouse_geckov2_library_b_1.10000000053.csv", stringsAsFactors=F) > library2 <- read.csv("mouse_geckov2_library_a_2.10000000053.csv", stringsAsFactors=F) > head(library1) gene_id UID seq 1 0610007P14Rik MGLibB_00001 TGTCTAAGGTTTCTCAATCC 2 0610007P14Rik MGLibB_00002 CAGTGAATGGCCTCCAAGCC 3 0610007P14Rik MGLibB_00003 GGTGCTTACTTGCTACCATC 4 0610009B22Rik MGLibB_00004 ACCTCGTCGACGAAAACATG 5 0610009B22Rik MGLibB_00005 GTTCATAGCTCATGCTGCTC 6 0610009B22Rik MGLibB_00006 GAGCAGCATGAGCTATGAAC > head(library2) gene_id UID seq 1 0610007P14Rik MGLibA_00001 TCCTGAATGTGTTACGAAGC 2 0610007P14Rik MGLibA_00002 GGTCGGGCTCCGGTACCTAG 3 0610007P14Rik MGLibA_00003 GCCAGCTTCGTAACACATTC 4 0610009B22Rik MGLibA_00004 TCATCATGCTGCATGACGTG 5 0610009B22Rik MGLibA_00005 ATTCAATGAGTGGTTCGTCT 6 0610009B22Rik MGLibA_00006 TCGTCACGGCTGGGCACATG > dim(library1) [1] 62804 3 > dim(library2) [1] 67405 3 > library1.DNA <- library1$seq > library2.DNA <- library2$seq > names(library1.DNA) <- library1$UID > names(library2.DNA) <- library2$UID > library1.DNA <- DNAStringSet(library1.DNA) > library2.DNA <- DNAStringSet(library2.DNA) > head(library1.DNA) A DNAStringSet instance of length 6 width seq names [1] 20 TGTCTAAGGTTTCTCAATCC MGLibB_00001 [2] 20 CAGTGAATGGCCTCCAAGCC MGLibB_00002 [3] 20 GGTGCTTACTTGCTACCATC MGLibB_00003 [4] 20 ACCTCGTCGACGAAAACATG MGLibB_00004 [5] 20 GTTCATAGCTCATGCTGCTC MGLibB_00005 [6] 20 GAGCAGCATGAGCTATGAAC MGLibB_00006 > head(library2.DNA) A DNAStringSet instance of length 6 width seq names [1] 20 TCCTGAATGTGTTACGAAGC MGLibA_00001 [2] 20 GGTCGGGCTCCGGTACCTAG MGLibA_00002 [3] 20 GCCAGCTTCGTAACACATTC MGLibA_00003 [4] 20 TCATCATGCTGCATGACGTG MGLibA_00004 [5] 20 ATTCAATGAGTGGTTCGTCT MGLibA_00005 [6] 20 TCGTCACGGCTGGGCACATG MGLibA_00006 > library1.PDict <- PDict(library1.DNA) > library2.PDict <- PDict(library2.DNA) > mT.1 <- lapply(sequences, function(.ele){ whichPDict(library1.PDict, .ele)}) > mT.2 <- lapply(sequences, function(.ele){ whichPDict(library2.PDict, .ele)}) > sapply(mT.1, length) 1-F_A01_015_2015-03-06.seq 1-R_C02_012_2015-03-06.seq 10-F_B02_014_2015-03-06.seq 0 0 0 10-R_D03_025_2015-03-06.seq 2-F_B01_013_2015-03-06.seq 2-R_D02_010_2015-03-06.seq 0 0 0 3-F_C01_011_2015-03-06.seq 3-R_E02_008_2015-03-06.seq 4-F_D01_009_2015-03-06.seq 0 0 0 4-R_F02_006_2015-03-06.seq 5-F_E01_007_2015-03-06.seq 5-R_G02_004_2015-03-06.seq 0 0 0 6-F_F01_005_2015-03-06.seq 6-R_H02_002_2015-03-06.seq 7-F_G01_003_2015-03-06.seq 0 0 0 7-R_A03_031_2015-03-06.seq 8-F_H01_001_2015-03-06.seq 8-R_B03_029_2015-03-06.seq 1 0 0 9-F_A02_016_2015-03-06.seq 9-R_C03_027_2015-03-06.seq 0 0 > sapply(mT.2, length) 1-F_A01_015_2015-03-06.seq 1-R_C02_012_2015-03-06.seq 10-F_B02_014_2015-03-06.seq 0 0 0 10-R_D03_025_2015-03-06.seq 2-F_B01_013_2015-03-06.seq 2-R_D02_010_2015-03-06.seq 0 0 0 3-F_C01_011_2015-03-06.seq 3-R_E02_008_2015-03-06.seq 4-F_D01_009_2015-03-06.seq 0 0 0 4-R_F02_006_2015-03-06.seq 5-F_E01_007_2015-03-06.seq 5-R_G02_004_2015-03-06.seq 0 0 0 6-F_F01_005_2015-03-06.seq 6-R_H02_002_2015-03-06.seq 7-F_G01_003_2015-03-06.seq 0 1 0 7-R_A03_031_2015-03-06.seq 8-F_H01_001_2015-03-06.seq 8-R_B03_029_2015-03-06.seq 0 0 0 9-F_A02_016_2015-03-06.seq 9-R_C03_027_2015-03-06.seq 0 0 |