使用TranscriptDb获取所有的exons以及introns

> library(TxDb.Mmusculus.UCSC.mm10.knownGene)
Loading required package: GenomicFeatures
> txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
> library(Gviz)
> exons <- GeneRegionTrack(txdb)
> exons <- exons@range
> library(Mus.musculus)
Loading required package: OrganismDbi
Loading required package: GO.db
Loading required package: org.Mm.eg.db
> symbol <- select(Mus.musculus, exons$symbol, "SYMBOL", "TXNAME")
> exons$symbol <- symbol[match(exons$symbol, symbol[,1]), 2]
> exons <- unique(exons)
> length(exons)
[1] 316926
> head(exons)
GRanges with 6 ranges and 7 metadata columns:
      seqnames             ranges strand |     feature          id      exon  transcript        gene      symbol   density
         <Rle>          <IRanges>  <Rle> | <character> <character> <integer> <character> <character> <character> <numeric>
  [1]     chr1 [4807893, 4807913]      + |        utr5     unknown         1  uc007afh.1       18777      Lypla1         1
  [2]     chr1 [4807893, 4807928]      + |        utr5     unknown         1  uc007afg.1       18777      Lypla1         1
  [3]     chr1 [4807914, 4807982]      + |         CDS     unknown         1  uc007afh.1       18777      Lypla1         1
  [4]     chr1 [4807929, 4807982]      + |         CDS     unknown         2  uc007afg.1       18777      Lypla1         1
  [5]     chr1 [4808455, 4808486]      + |         CDS     unknown         3  uc007afg.1       18777      Lypla1         1
  [6]     chr1 [4828584, 4828649]      + |         CDS     unknown         4  uc007afg.1       18777      Lypla1         1
  ---
  seqlengths:
                   chr1 chr1_GL456210_random chr1_GL456211_random chr1_GL456212_random ...                 chrX chrX_GL456233_random                 chrY
                     NA                   NA                   NA                   NA ...                   NA                   NA                   NA
> introns <- intronsByTranscript(txdb, use.names=TRUE)
> len <- sapply(introns, length)
> introns <- introns[len>0]
> len <- len[len>0]
> intronNames <- names(introns)
> intronNames <- rep(intronNames, len)
> introns <- unlist(introns)
> introns$feature <- "intron"
> introns$transcript <- intronNames
> symbol <- select(Mus.musculus, introns$transcript, "SYMBOL", "TXNAME")
> introns$symbol <- symbol[match(introns$transcript, symbol[,1]), 2]
> head(introns)
GRanges with 6 ranges and 3 metadata columns:
             seqnames             ranges strand |     feature  transcript      symbol
                <Rle>          <IRanges>  <Rle> | <character> <character> <character>
  uc007afg.1     chr1 [4807983, 4808454]      + |      intron  uc007afg.1      Lypla1
  uc007afg.1     chr1 [4808487, 4828583]      + |      intron  uc007afg.1      Lypla1
  uc007afg.1     chr1 [4828650, 4830267]      + |      intron  uc007afg.1      Lypla1
  uc007afg.1     chr1 [4830316, 4832310]      + |      intron  uc007afg.1      Lypla1
  uc007afg.1     chr1 [4832382, 4837000]      + |      intron  uc007afg.1      Lypla1
  uc007afg.1     chr1 [4837075, 4839386]      + |      intron  uc007afg.1      Lypla1
  ---
  seqlengths:
                   chr1                 chr2                 chr3                 chr4 ...       chrUn_GL456394       chrUn_GL456396       chrUn_JH584304
              195471971            182113224            160039680            156508116 ...                24323                21240               114452
> library(rtracklayer)
> export(exons, "mm10.exons.bed", format="BED")
> names(introns) <- paste(introns$transcript, introns$symbol, sep=",")
> export(introns, "mm10.introns.bed", format="BED")
> ##run bedtools subtract -s -a mm10.introns.bed -b mm10.exons.bed > mm10.introns.substracted.exons.bed
> introns.s <- read.delim("mm10.introns.substracted.exons.bed", header=F)
> introns.s <- GRanges(introns.s$V1, IRanges(introns.s$V2, introns.s$V3), strand=introns.s$V6, feature=introns.s$V4)
> feature <- do.call(rbind, strsplit(as.character(introns.s$feature), ","))
> head(feature)
     [,1]         [,2]    
[1,] "uc007afg.1" "Lypla1"
[2,] "uc007afg.1" "Lypla1"
[3,] "uc007afg.1" "Lypla1"
[4,] "uc007afg.1" "Lypla1"
[5,] "uc007afg.1" "Lypla1"
[6,] "uc007afg.1" "Lypla1"
> introns.s$transcript <- feature[,1]
> introns.s$symbol <- feature[,2]
> introns.s$feature <- "intron"
> mcols(exons) <- mcols(exons)[,c("feature", "transcript", "symbol")]
> annoData <- c(exons, introns.s)
> annoData <- annoData[order(as.character(seqnames(annoData)), start(annoData))]
> head(annoData)
GRanges with 6 ranges and 3 metadata columns:
      seqnames             ranges strand |     feature  transcript      symbol
         <Rle>          <IRanges>  <Rle> | <character> <character> <character>
  [1]     chr1 [3205904, 3207317]      - |       ncRNA  uc007aet.1        <NA>
  [2]     chr1 [3207317, 3213438]      - |      intron  uc007aet.1          NA
  [3]     chr1 [3213439, 3215632]      - |       ncRNA  uc007aet.1        <NA>
  [4]     chr1 [3214482, 3216021]      - |        utr3  uc007aeu.1        Xkr4
  [5]     chr1 [3216022, 3216968]      - |         CDS  uc007aeu.1        Xkr4
  [6]     chr1 [3216968, 3421701]      - |      intron  uc007aeu.1        Xkr4
  ---
  seqlengths:
                   chr1 chr1_GL456210_random chr1_GL456211_random chr1_GL456212_random ...                 chrX chrX_GL456233_random                 chrY
              195471971               169725               241735               153618 ...            171031299               336933             91744698

2 thoughts on “使用TranscriptDb获取所有的exons以及introns

  1. 我的理解是这样的,`exons <- GeneRegionTrack(txdb)`和`exons <- exons@range` 这两行用GeneRegionTrack生成的其实并不是exon,它是在Translation水平生成的UTR或是CDS的一段坐标。而exon是在Transcriptional层面的,所以用GenomicFeatures里的exon才是真正生成的exon数据组。这些都在你的示例里体现地很清楚: 比如在uc007afh.1里chr1 [4807893, 4807913]是 utr5 、chr1 [4807914, 4807982]是CDS ,而他俩坐标是紧接着的而都是exon1。
    p.s. 我又来读你的科普了~~~

发表评论

电子邮件地址不会被公开。 必填项已用*标注