如何使用biomaRt做hg19注释

因为现在ensembl已经更新了human的注释库,所以当你直接使用

library("biomaRt")
database <- listDatasets(useMart("ensembl"))
database[database[,1]=='hsapiens_gene_ensembl',]
                 dataset                 description version
32 hsapiens_gene_ensembl Homo sapiens genes (GRCh38)  GRCh38

你会得到的是GRCh38(hg38)的注释库。
那么如何转换至hg19注释库呢?
使用

mart <- useMart("ENSEMBL_MART_ENSEMBL", host="grch37.ensembl.org", path="/biomart/martservice")
database <- listDatasets(mart)
database[database[,1]=='hsapiens_gene_ensembl',]

[……]

Read more

R/Bioconductor开发中import和depend的区别

import:将命名空间载入

depends:将命名空间载入,并且放入搜索目录。

两者在使用上的区别是:import只载入对开发者有用的,而depends需要载入对用户有用的。比如说GenomicRanges,
如果你开发的包使用到了GRanges类,并且它会成为结果返回给用户,那么你就必须使用depends了,如果你只是用了GenomicRanges中的类或者函数,比如说GRang[……]

Read more

如何得到exon numbering

以前,NCBI的genomic browser中是可以得到一个所谓的exon number的。其规则为同一个基因内所有exon按5′->3’排序,用排序序号来做为exon number。对于有重叠的exon使用同一序号,以排序先后为其加上a,b,c…的后缀。
art9-cc-figure-1(fix)
但是这其实是一个很不靠谱的表达方式,因为随着基因的转录本的变化,有可能会新增加exon。所以一些文章中使用exon number来标[……]

Read more

使用R绘制带比例的饼图 Pie Chart with Annotated Percentages

pie1 <- function (x, labels = names(x), edges = 200, radius = 0.8, clockwise = FALSE, 
    init.angle = if (clockwise) 90 else 0, density = NULL, angle = 45, 
    col = NULL, border = NULL, lty = NULL, main = NULL, percentage=T, rawNumber=F, digits=3, cutoff=0.01, 
    legend=F, legendpos="topright", legendcol=2, ...) 
{
    if (!is.numeric(x) || any(is.na(x) | x < 0)) 
        stop("'x' values must be positive.")
    if (is.null(labels)) 
        labels <- as.character(seq_along(x))
    else labels <- as.graphicsAnnot(labels)
    rawX <- x
    x <- c(0, cumsum(x)/sum(x))
    dx <- diff(x)
    nx <- length(dx)
    plot.new()
    pin <- par("pin")
    xlim <- ylim <- c(-1, 1)
    if (pin[1L] > pin[2L]) 
        xlim <- (pin[1L]/pin[2L]) * xlim
    else ylim <- (pin[2L]/pin[1L]) * ylim
    dev.hold()
    on.exit(dev.flush())
    plot.window(xlim, ylim, "", asp = 1)
    if (is.null(col)) 
        col <- if (is.null(density)) 
            c("white", "lightblue", "mistyrose", "lightcyan", 
                "lavender", "cornsilk", "pink")
        else par("fg")
    if (!is.null(col)) 
        col <- rep_len(col, nx)
    if (!is.null(border)) 
        border <- rep_len(border, nx)
    if (!is.null(lty)) 
        lty <- rep_len(lty, nx)
    angle <- rep(angle, nx)
    if (!is.null(density)) 
        density <- rep_len(density, nx)
    twopi <- if (clockwise) 
        -2 * pi
    else 2 * pi
    t2xy <- function(t) {
        t2p <- twopi * t + init.angle * pi/180
        list(x = radius * cos(t2p), y = radius * sin(t2p))
    }
    for (i in 1L:nx) {
        n <- max(2, floor(edges * dx[i]))
        P <- t2xy(seq.int(x[i], x[i + 1], length.out = n))
        polygon(c(P$x, 0), c(P$y, 0), density = density[i], angle = angle[i], 
            border = border[i], col = col[i], lty = lty[i])
        if(!legend){
        	P <- t2xy(mean(x[i + 0:1]))
	        lab <- as.character(labels[i])
	        if (!is.na(lab) && nzchar(lab)) {
	            lines(c(1, 1.05) * P$x, c(1, 1.05) * P$y)
	            text(1.1 * P$x, 1.1 * P$y, labels[i], xpd = TRUE, 
	                adj = ifelse(P$x < 0, 1, 0), ...)
	        }
        }
    }
    if (percentage) {
    	for (i in 1L:nx){
    		if(dx[i]>cutoff){
    			P <- t2xy(mean(x[i + 0:1]))
            	text(.8 * P$x, .8 * P$y, paste(formatC(dx[i]*100, digits=digits), "%", sep=""), xpd = TRUE, 
                	adj = .5, ...)
    		}
        }
    }else{
        if(rawNumber){
		for (i in 1L:nx){
    			if(dx[i]>cutoff){
    				P <- t2xy(mean(x[i + 0:1]))
            		text(.8 * P$x, .8 * P$y, rawX[i], xpd = TRUE, 
                		adj = .5, ...)
    			}
        	}
        }
    }
    if(legend) legend(legendpos, legend=labels, fill=col, border="black", bty="n", ncol = legendcol)
    title(main = main, ...)
    invisible(NULL)
}
 
pie1(1:5)

pie

使用Rcurl提交表单

有时候需要向在线工具批量提交申请。手动操作非常繁琐,使用curl工具提交会非常方便。

Rcurl已经整合好了相关代码,使用简便。下面就给出一个示例:

library(RCurl) #调用curl
curl <- getCurlHandle() #虚拟一个浏览器
curlSetOpt(cookiejar=tempfile(), curl=curl) #生成cookie
getURL("http://some.webpage.com/index.php", curl=curl) #填充cookie
response <- postForm("http://some.webpage.com/form.handle.page.php", style="HTTPPOST", data=data, file=fileUpload(filename="tobeuploadedfile.txt", contentType="text/plain"), curl=curl) #提交表单,表单对应的网址写在第一个参数中,具体帮助请使用?postForm查看。

使用R按照指定宽度计算覆盖率(calculate coverage by bins)

如何使用R来按照bin(比如说每200碱基宽)来计数覆盖度并生成一个bedgraph文件呢?
这个其实就是所谓的resample问题。可以使用?tileGenome来解决。

library(GenomicRanges)
library(BSgenome.Hsapiens.UCSC.hg19) ## human
tiles <- tileGenome(seqinfo(Hsapiens), tilewidth=200, cut.last.tile.in.chrom=TRUE) ##
tail(tiles[seqnames(tiles)=="chr1"]) ##
head(tiles[seqnames(tiles)=="chr2"]) ##
library(Rsamtools)
library(GenomicAlignments)
binnedAverage <- function(bins, numvar, mcolname)
{
    stopifnot(is(bins, "GRanges"))
    stopifnot(is(numvar, "RleList"))
    stopifnot(identical(seqlevels(bins), names(numvar)))
    bins_per_chrom <- split(ranges(bins), seqnames(bins))
    means_list <- lapply(names(numvar),
        function(seqname) {
            views <- Views(numvar[[seqname]],
                           bins_per_chrom[[seqname]])
            viewMeans(views)
        })
    new_mcol <- unsplit(means_list, as.factor(seqnames(bins)))
    mcols(bins)[[mcolname]] <- new_mcol
    bins
}
 
library(rtracklayer)
bams <- dir(pattern="*.bam$")
for(bam in bams){
	dat <- readGAlignmentsFromBam(bam)
	dat <- granges(dat)
	score <- coverage(dat)
	tiles.s <- keepSeqlevels(tiles, names(score))
	bins <- binnedAverage(tiles.s, score, "score")
	export(bins, gsub(".bam", ".bedgraph", bam), "BedGraph")
}

注意到viewMeans是和rowMeans类似的一种函数,相应的,它也有viewSums, viewMins, viewMaxs等。
如[……]

Read more

R的内存管理

当大数据量到来的时候,使用R分析会显得力不从心,最主要的问题在于内存。因为在大多数情况下,R需要将数据全部读入内存,然后再将数据做为一个整体对象来处理。同时,R的一些数据结构本身也需要大量的内存,比如list。一些数据读入R后,会占用几倍甚至几十倍的内存,比如将一个bigwig文件import成GRanges对象。

R的内存管理和java类似的,使用了Garbage collection。但[……]

Read more

使用TranscriptDb获取所有的exons以及introns

> library(TxDb.Mmusculus.UCSC.mm10.knownGene)
Loading required package: GenomicFeatures
> txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
> library(Gviz)
> exons <- GeneRegionTrack(txdb)
> exons <- exons@range
> library(Mus.musculus)
Loading required package: OrganismDbi
Loading required package: GO.db
Loading required package: org.Mm.eg.db
> symbol <- select(Mus.musculus, exons$symbol, "SYMBOL", "TXNAME")
> exons$symbol <- symbol[match(exons$symbol, symbol[,1]), 2]
> exons <- unique(exons)
> length(exons)
[1] 316926
> head(exons)
GRanges with 6 ranges and 7 metadata columns:
      seqnames             ranges strand |     feature          id      exon  transcript        gene      symbol   density
         <Rle>          <IRanges>  <Rle> | <character> <character> <integer> <character> <character> <character> <numeric>
  [1]     chr1 [4807893, 4807913]      + |        utr5     unknown         1  uc007afh.1       18777      Lypla1         1
  [2]     chr1 [4807893, 4807928]      + |        utr5     unknown         1  uc007afg.1       18777      Lypla1         1
  [3]     chr1 [4807914, 4807982]      + |         CDS     unknown         1  uc007afh.1       18777      Lypla1         1
  [4]     chr1 [4807929, 4807982]      + |         CDS     unknown         2  uc007afg.1       18777      Lypla1         1
  [5]     chr1 [4808455, 4808486]      + |         CDS     unknown         3  uc007afg.1       18777      Lypla1         1
  [6]     chr1 [4828584, 4828649]      + |         CDS     unknown         4  uc007afg.1       18777      Lypla1         1
  ---
  seqlengths:
                   chr1 chr1_GL456210_random chr1_GL456211_random chr1_GL456212_random ...                 chrX chrX_GL456233_random                 chrY
                     NA                   NA                   NA                   NA ...                   NA                   NA                   NA
> introns <- intronsByTranscript(txdb, use.names=TRUE)
> len <- sapply(introns, length)
> introns <- introns[len>0]
> len <- len[len>0]
> intronNames <- names(introns)
> intronNames <- rep(intronNames, len)
> introns <- unlist(introns)
> introns$feature <- "intron"
> introns$transcript <- intronNames
> symbol <- select(Mus.musculus, introns$transcript, "SYMBOL", "TXNAME")
> introns$symbol <- symbol[match(introns$transcript, symbol[,1]), 2]
> head(introns)
GRanges with 6 ranges and 3 metadata columns:
             seqnames             ranges strand |     feature  transcript      symbol
                <Rle>          <IRanges>  <Rle> | <character> <character> <character>
  uc007afg.1     chr1 [4807983, 4808454]      + |      intron  uc007afg.1      Lypla1
  uc007afg.1     chr1 [4808487, 4828583]      + |      intron  uc007afg.1      Lypla1
  uc007afg.1     chr1 [4828650, 4830267]      + |      intron  uc007afg.1      Lypla1
  uc007afg.1     chr1 [4830316, 4832310]      + |      intron  uc007afg.1      Lypla1
  uc007afg.1     chr1 [4832382, 4837000]      + |      intron  uc007afg.1      Lypla1
  uc007afg.1     chr1 [4837075, 4839386]      + |      intron  uc007afg.1      Lypla1
  ---
  seqlengths:
                   chr1                 chr2                 chr3                 chr4 ...       chrUn_GL456394       chrUn_GL456396       chrUn_JH584304
              195471971            182113224            160039680            156508116 ...                24323                21240               114452
> library(rtracklayer)
> export(exons, "mm10.exons.bed", format="BED")
> names(introns) <- paste(introns$transcript, introns$symbol, sep=",")
> export(introns, "mm10.introns.bed", format="BED")
> ##run bedtools subtract -s -a mm10.introns.bed -b mm10.exons.bed > mm10.introns.substracted.exons.bed
> introns.s <- read.delim("mm10.introns.substracted.exons.bed", header=F)
> introns.s <- GRanges(introns.s$V1, IRanges(introns.s$V2, introns.s$V3), strand=introns.s$V6, feature=introns.s$V4)
> feature <- do.call(rbind, strsplit(as.character(introns.s$feature), ","))
> head(feature)
     [,1]         [,2]    
[1,] "uc007afg.1" "Lypla1"
[2,] "uc007afg.1" "Lypla1"
[3,] "uc007afg.1" "Lypla1"
[4,] "uc007afg.1" "Lypla1"
[5,] "uc007afg.1" "Lypla1"
[6,] "uc007afg.1" "Lypla1"
> introns.s$transcript <- feature[,1]
> introns.s$symbol <- feature[,2]
> introns.s$feature <- "intron"
> mcols(exons) <- mcols(exons)[,c("feature", "transcript", "symbol")]
> annoData <- c(exons, introns.s)
> annoData <- annoData[order(as.character(seqnames(annoData)), start(annoData))]
> head(annoData)
GRanges with 6 ranges and 3 metadata columns:
      seqnames             ranges strand |     feature  transcript      symbol
         <Rle>          <IRanges>  <Rle> | <character> <character> <character>
  [1]     chr1 [3205904, 3207317]      - |       ncRNA  uc007aet.1        <NA>
  [2]     chr1 [3207317, 3213438]      - |      intron  uc007aet.1          NA
  [3]     chr1 [3213439, 3215632]      - |       ncRNA  uc007aet.1        <NA>
  [4]     chr1 [3214482, 3216021]      - |        utr3  uc007aeu.1        Xkr4
  [5]     chr1 [3216022, 3216968]      - |         CDS  uc007aeu.1        Xkr4
  [6]     chr1 [3216968, 3421701]      - |      intron  uc007aeu.1        Xkr4
  ---
  seqlengths:
                   chr1 chr1_GL456210_random chr1_GL456211_random chr1_GL456212_random ...                 chrX chrX_GL456233_random                 chrY
              195471971               169725               241735               153618 ...            171031299               336933             91744698

R为什么认为它们不相等

在最近的编程过程中遇到一个小问题:

> floor(29/50*50)
[1] 28
> floor(29*50/50)
[1] 29

也许你看到这个的第一反应是,这应该是一个浮点数问题。但是如何解释这个问题呢?也许你并没有一个清晰的答案(我也是)。
于是询问他人。询问之下得到了如下的答案(感谢Steven McKinney):

> options(digits = 22)
> 29/50
[1] 0.579999999999999960032
> 29/50 * 50
[1] 28.99999999999999644729
> 29*50/50
[1] 29.00000000000000000000
> .58 * 50
[1] 28.99999999999999644729
> floor(28.999999)
[1] 28.00000000000000000000
> floor(29.000000)
[1] 29.00000000000000000000

[……]

Read more

Bioconductor并行(parallel)运算

当我们进入大数据时代之后,并行运算就十分重要了。什么是大数据呢?引用一句流行语吧:

Big data is hype. Everybody talks about big data; nobody knows exactly what it is.

对于生物学来说,大数据对应的应该是第二代测序技术的兴起。所以我们谈到并行运算,主要是为了处理测序数据。在R中,有许多并行运算的包,Hi[……]

Read more