> library(TxDb.Mmusculus.UCSC.mm10.knownGene) Loading required package: GenomicFeatures > txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene > library(Gviz) > exons <- GeneRegionTrack(txdb) > exons <- exons@range > library(Mus.musculus) Loading required package: OrganismDbi Loading required package: GO.db Loading required package: org.Mm.eg.db > symbol <- select(Mus.musculus, exons$symbol, "SYMBOL", "TXNAME") > exons$symbol <- symbol[match(exons$symbol, symbol[,1]), 2] > exons <- unique(exons) > length(exons) [1] 316926 > head(exons) GRanges with 6 ranges and 7 metadata columns: seqnames ranges strand | feature id exon transcript gene symbol density <Rle> <IRanges> <Rle> | <character> <character> <integer> <character> <character> <character> <numeric> [1] chr1 [4807893, 4807913] + | utr5 unknown 1 uc007afh.1 18777 Lypla1 1 [2] chr1 [4807893, 4807928] + | utr5 unknown 1 uc007afg.1 18777 Lypla1 1 [3] chr1 [4807914, 4807982] + | CDS unknown 1 uc007afh.1 18777 Lypla1 1 [4] chr1 [4807929, 4807982] + | CDS unknown 2 uc007afg.1 18777 Lypla1 1 [5] chr1 [4808455, 4808486] + | CDS unknown 3 uc007afg.1 18777 Lypla1 1 [6] chr1 [4828584, 4828649] + | CDS unknown 4 uc007afg.1 18777 Lypla1 1 --- seqlengths: chr1 chr1_GL456210_random chr1_GL456211_random chr1_GL456212_random ... chrX chrX_GL456233_random chrY NA NA NA NA ... NA NA NA > introns <- intronsByTranscript(txdb, use.names=TRUE) > len <- sapply(introns, length) > introns <- introns[len>0] > len <- len[len>0] > intronNames <- names(introns) > intronNames <- rep(intronNames, len) > introns <- unlist(introns) > introns$feature <- "intron" > introns$transcript <- intronNames > symbol <- select(Mus.musculus, introns$transcript, "SYMBOL", "TXNAME") > introns$symbol <- symbol[match(introns$transcript, symbol[,1]), 2] > head(introns) GRanges with 6 ranges and 3 metadata columns: seqnames ranges strand | feature transcript symbol <Rle> <IRanges> <Rle> | <character> <character> <character> uc007afg.1 chr1 [4807983, 4808454] + | intron uc007afg.1 Lypla1 uc007afg.1 chr1 [4808487, 4828583] + | intron uc007afg.1 Lypla1 uc007afg.1 chr1 [4828650, 4830267] + | intron uc007afg.1 Lypla1 uc007afg.1 chr1 [4830316, 4832310] + | intron uc007afg.1 Lypla1 uc007afg.1 chr1 [4832382, 4837000] + | intron uc007afg.1 Lypla1 uc007afg.1 chr1 [4837075, 4839386] + | intron uc007afg.1 Lypla1 --- seqlengths: chr1 chr2 chr3 chr4 ... chrUn_GL456394 chrUn_GL456396 chrUn_JH584304 195471971 182113224 160039680 156508116 ... 24323 21240 114452 > library(rtracklayer) > export(exons, "mm10.exons.bed", format="BED") > names(introns) <- paste(introns$transcript, introns$symbol, sep=",") > export(introns, "mm10.introns.bed", format="BED") > ##run bedtools subtract -s -a mm10.introns.bed -b mm10.exons.bed > mm10.introns.substracted.exons.bed > introns.s <- read.delim("mm10.introns.substracted.exons.bed", header=F) > introns.s <- GRanges(introns.s$V1, IRanges(introns.s$V2, introns.s$V3), strand=introns.s$V6, feature=introns.s$V4) > feature <- do.call(rbind, strsplit(as.character(introns.s$feature), ",")) > head(feature) [,1] [,2] [1,] "uc007afg.1" "Lypla1" [2,] "uc007afg.1" "Lypla1" [3,] "uc007afg.1" "Lypla1" [4,] "uc007afg.1" "Lypla1" [5,] "uc007afg.1" "Lypla1" [6,] "uc007afg.1" "Lypla1" > introns.s$transcript <- feature[,1] > introns.s$symbol <- feature[,2] > introns.s$feature <- "intron" > mcols(exons) <- mcols(exons)[,c("feature", "transcript", "symbol")] > annoData <- c(exons, introns.s) > annoData <- annoData[order(as.character(seqnames(annoData)), start(annoData))] > head(annoData) GRanges with 6 ranges and 3 metadata columns: seqnames ranges strand | feature transcript symbol <Rle> <IRanges> <Rle> | <character> <character> <character> [1] chr1 [3205904, 3207317] - | ncRNA uc007aet.1 <NA> [2] chr1 [3207317, 3213438] - | intron uc007aet.1 NA [3] chr1 [3213439, 3215632] - | ncRNA uc007aet.1 <NA> [4] chr1 [3214482, 3216021] - | utr3 uc007aeu.1 Xkr4 [5] chr1 [3216022, 3216968] - | CDS uc007aeu.1 Xkr4 [6] chr1 [3216968, 3421701] - | intron uc007aeu.1 Xkr4 --- seqlengths: chr1 chr1_GL456210_random chr1_GL456211_random chr1_GL456212_random ... chrX chrX_GL456233_random chrY 195471971 169725 241735 153618 ... 171031299 336933 91744698 |
我的理解是这样的,`exons <- GeneRegionTrack(txdb)`和`exons <- exons@range` 这两行用GeneRegionTrack生成的其实并不是exon,它是在Translation水平生成的UTR或是CDS的一段坐标。而exon是在Transcriptional层面的,所以用GenomicFeatures里的exon才是真正生成的exon数据组。这些都在你的示例里体现地很清楚: 比如在uc007afh.1里chr1 [4807893, 4807913]是 utr5 、chr1 [4807914, 4807982]是CDS ,而他俩坐标是紧接着的而都是exon1。
p.s. 我又来读你的科普了~~~
你的理解正确。我的原文应该修正成exon优先区域.