使用R按照指定宽度计算覆盖率(calculate coverage by bins)

如何使用R来按照bin(比如说每200碱基宽)来计数覆盖度并生成一个bedgraph文件呢?
这个其实就是所谓的resample问题。可以使用?tileGenome来解决。

library(GenomicRanges)
library(BSgenome.Hsapiens.UCSC.hg19) ## human
tiles <- tileGenome(seqinfo(Hsapiens), tilewidth=200, cut.last.tile.in.chrom=TRUE) ##
tail(tiles[seqnames(tiles)=="chr1"]) ##
head(tiles[seqnames(tiles)=="chr2"]) ##
library(Rsamtools)
library(GenomicAlignments)
binnedAverage <- function(bins, numvar, mcolname)
{
    stopifnot(is(bins, "GRanges"))
    stopifnot(is(numvar, "RleList"))
    stopifnot(identical(seqlevels(bins), names(numvar)))
    bins_per_chrom <- split(ranges(bins), seqnames(bins))
    means_list <- lapply(names(numvar),
        function(seqname) {
            views <- Views(numvar[[seqname]],
                           bins_per_chrom[[seqname]])
            viewMeans(views)
        })
    new_mcol <- unsplit(means_list, as.factor(seqnames(bins)))
    mcols(bins)[[mcolname]] <- new_mcol
    bins
}
 
library(rtracklayer)
bams <- dir(pattern="*.bam$")
for(bam in bams){
	dat <- readGAlignmentsFromBam(bam)
	dat <- granges(dat)
	score <- coverage(dat)
	tiles.s <- keepSeqlevels(tiles, names(score))
	bins <- binnedAverage(tiles.s, score, "score")
	export(bins, gsub(".bam", ".bedgraph", bam), "BedGraph")
}

注意到viewMeans是和rowMeans类似的一种函数,相应的,它也有viewSums, viewMins, viewMaxs等。
如[……]

Read more

每天更新R原码 svn R source

在一个特定的目录下,每天checkout开发版的R,并且完成安装,应该怎么做呢?

cd path/to/put/R
svn checkout http://svn.r-project.org/R/trunk/ R-devel
cd R-devel
./configure --without-recommended-packages
make
make install
svn cleanup .

可以把上面的命令写成一个shell文件,然后通过daemons来自动调用。
比如在mac下,shell文件(autoUpdateRdev.sh)可以写成

#!/bin/bash
cd /qiuworld/R
svn checkout http://svn.r-project.org/R/trunk/ R-devel
cd R-devel
./configure --without-recommended-packages
make
make install
svn cleanup .

[……]

Read more

理解edgeR

from: edgeR: a Bioconductor package for differential expression analysis of digital gene expression data

edgeR假设RNA-seq的reads计数对于每个基因来说是符合负二项式分布(Negative Binomial distribution)的。其公式可以表示为Pr(K[......]

Read more

使用DEXSeq分析NGS数据中的exon表达差异

对于RNA-seq,除了gene水平的差异分析外,还可以进行exon水平的差异分析。这时可以使用Bioconductor的DEXSeq软件包。

使用DEXSeq软件包,其输入为Count table。要生成这个Count table,与DESeq, edgeR类似的,都需要使用到htseq。DEXSeq提供了两个python角本,来调用htseq生成计数文件。

关于HTSeq的安装,请[……]

Read more

GO/KEGG富集分析enrichment analysis

本教程使用GOstats来对给定的基因symbols做富集分析。

> library("org.Hs.eg.db")
> library("GSEABase")
> library("GOstats")
> genes <- c("AREG", "FKBP5", "CXCL13", "KLF9", "ZC3H12A", "P4HA1", "TLE1", "CREB3L2", "TXNIP", "PBX1", "GJA1", "ITGB8", "CCL3", "CCND2", "KCNJ15", "CFLAR", "CXCL10", "CYSLTR1", "IGFBP7", "RHOB", "MAP3K5", "CAV2", "CAPN2", "AKAP13", "RND3", "IL6ST", "RGS1", "IRF4", "G3BP1", "SEL1L", "VEGFA", "SMAD1", "CCND1", "CLEC3B", "NEB", "AMD1", "PDCD4", "SCD", "TM2D3", "BACH2", "LDLR", "BMPR1B", "RFXAP", "ASPH", "PTK2B", "SLC1A5", "ENO2", "TRPM8", "SATB1", "MIER1", "SRSF1", "ATF3", "CCL5", "MCM6", "GCH1", "CAV1", "SLC20A1")
> ##GO BP enrichment analysis
> goAnn <- get("org.Hs.egGO")
> universe <- Lkeys(goAnn)
> head(universe)
[1] "1"         "10"        "100"       "1000"      "10000"     "100008586"
> entrezIDs <- mget(genes, org.Hs.egSYMBOL2EG, ifnotfound=NA)
> head(entrezIDs)
$AREG
[1] "374"
 
$FKBP5
[1] "2289"
 
$CXCL13
[1] "10563"
 
$KLF9
[1] "687"
 
$ZC3H12A
[1] "80149"
 
$P4HA1
[1] "5033"
 
> entrezIDs <- as.character(entrezIDs)
> head(entrezIDs)
[1] "374"   "2289"  "10563" "687"   "80149" "5033" 
> params <- new("GOHyperGParams",
+                  geneIds=entrezIDs,
+                  universeGeneIds=universe,
+                  annotation="org.Hs.eg.db",
+                  ontology="BP",
+                  pvalueCutoff=0.01,
+                  conditional=FALSE,
+                  testDirection="over")
> over <- hyperGTest(params)
> library(Category)
> glist <- geneIdsByCategory(over)
> glist <- sapply(glist, function(.ids) {
+ 	.sym <- mget(.ids, envir=org.Hs.egSYMBOL, ifnotfound=NA)
+ 	.sym[is.na(.sym)] <- .ids[is.na(.sym)]
+ 	paste(.sym, collapse=";")
+ 	})
> head(glist)
                                                                                                                                                                                         GO:0002690 
                                                                                                                                                              "PTK2B;CXCL10;CCL3;CCL5;VEGFA;CXCL13" 
                                                                                                                                                                                         GO:0002688 
                                                                                                                                                              "PTK2B;CXCL10;CCL3;CCL5;VEGFA;CXCL13" 
                                                                                                                                                                                         GO:0048518 
"RHOB;ASPH;ATF3;CCND1;BMPR1B;CAV1;CAV2;CCND2;PTK2B;GJA1;IL6ST;CXCL10;IRF4;LDLR;SMAD1;MAP3K5;PBX1;RFXAP;CCL3;CCL5;SLC20A1;TLE1;CLEC3B;VEGFA;CFLAR;CXCL13;TXNIP;CYSLTR1;AKAP13;MIER1;CREB3L2;ZC3H12A" 
                                                                                                                                                                                         GO:1901623 
                                                                                                                                                                           "PTK2B;CCL3;CCL5;CXCL13" 
                                                                                                                                                                                         GO:0002687 
                                                                                                                                                              "PTK2B;CXCL10;CCL3;CCL5;VEGFA;CXCL13" 
                                                                                                                                                                                         GO:2000403 
                                                                                                                                                                           "PTK2B;CCL3;CCL5;CXCL13" 
> bp <- summary(over)
> bp$Symbols <- glist[as.character(bp$GOBPID)]
> head(bp)
      GOBPID       Pvalue OddsRatio    ExpCount Count Size                                        Term
1 GO:0002690 3.490023e-08  39.24490  0.19267044     6   52 positive regulation of leukocyte chemotaxis
2 GO:0002688 9.278865e-08  32.80297  0.22601725     6   61          regulation of leukocyte chemotaxis
3 GO:0048518 1.170957e-07   4.28014 13.56103476    32 3660   positive regulation of biological process
4 GO:1901623 1.176400e-07 128.80174  0.04816761     4   13         regulation of lymphocyte chemotaxis
5 GO:0002687 1.496978e-07  30.05918  0.24454325     6   66  positive regulation of leukocyte migration
6 GO:2000403 2.969857e-07  96.58170  0.05928321     4   16 positive regulation of lymphocyte migration
                                                                                                                                                                                            Symbols
1                                                                                                                                                               PTK2B;CXCL10;CCL3;CCL5;VEGFA;CXCL13
2                                                                                                                                                               PTK2B;CXCL10;CCL3;CCL5;VEGFA;CXCL13
3 RHOB;ASPH;ATF3;CCND1;BMPR1B;CAV1;CAV2;CCND2;PTK2B;GJA1;IL6ST;CXCL10;IRF4;LDLR;SMAD1;MAP3K5;PBX1;RFXAP;CCL3;CCL5;SLC20A1;TLE1;CLEC3B;VEGFA;CFLAR;CXCL13;TXNIP;CYSLTR1;AKAP13;MIER1;CREB3L2;ZC3H12A
4                                                                                                                                                                            PTK2B;CCL3;CCL5;CXCL13
5                                                                                                                                                               PTK2B;CXCL10;CCL3;CCL5;VEGFA;CXCL13
6                                                                                                                                                                            PTK2B;CCL3;CCL5;CXCL13
> dim(bp)
[1] 525   8
> ##KEGG enrichment analysis
> keggAnn <- get("org.Hs.egPATH")
> universe <- Lkeys(keggAnn)
> params <- new("KEGGHyperGParams", 
+                     geneIds=entrezIDs, 
+                     universeGeneIds=universe, 
+                     annotation="org.Hs.eg.db", 
+                     categoryName="KEGG", 
+                     pvalueCutoff=0.01,
+                     testDirection="over")
> over <- hyperGTest(params)
> kegg <- summary(over)
> library(Category)
> glist <- geneIdsByCategory(over)
> glist <- sapply(glist, function(.ids) {
+ 	.sym <- mget(.ids, envir=org.Hs.egSYMBOL, ifnotfound=NA)
+ 	.sym[is.na(.sym)] <- .ids[is.na(.sym)]
+ 	paste(.sym, collapse=";")
+ 	})
> kegg$Symbols <- glist[as.character(kegg$KEGGID)]
> kegg
  KEGGID       Pvalue OddsRatio ExpCount Count Size                                   Term                                    Symbols
1  04510 9.643801e-05  7.873256 1.124361     7  200                         Focal adhesion    CCND1;CAPN2;CAV1;CAV2;CCND2;ITGB8;VEGFA
2  04060 5.487123e-04  5.821855 1.489779     7  265 Cytokine-cytokine receptor interaction BMPR1B;IL6ST;CXCL10;CCL3;CCL5;VEGFA;CXCL13
3  04062 3.739699e-03  5.486219 1.062521     5  189            Chemokine signaling pathway              PTK2B;CXCL10;CCL3;CCL5;CXCL13
> library("pathview")
> gIds <- mget(genes, org.Hs.egSYMBOL2EG, ifnotfound=NA)
> gEns <- unlist(gIds)
> gene.data <- rep(1, length(gEns))
> names(gene.data) <- gEns
> for(i in 1:3){pv.out <- pathview(gene.data, pathway.id=as.character(kegg$KEGGID)[i], species="hsa", out.suffix="pathview", kegg.native=T)}

写成函数

enrich <- function(entrezIDs, orgDbName="org.Hs.eg.db", pvalueCutoff=.01){
	require(orgDbName, character.only=TRUE)
	require("GSEABase")
	require("GOstats")
	require("Category")
	goAnn <- get(gsub(".db", "GO", orgDbName))
	universe <- Lkeys(goAnn)
	onto <- c("BP", "MF", "CC")
	res <- lapply(onto, function(.onto){
		param <- new('GOHyperGParams',
					 geneIds= entrezIDs,
					 universeGeneIds=universe,
					 annotation=orgDbName,
					 ontology=.onto,
					 pvalueCutoff=pvalueCutoff,
					 conditional=FALSE,
					 testDirection="over")
		over <- hyperGTest(param)
		glist <- geneIdsByCategory(over)
		glist <- sapply(glist, function(.ids) {
			.sym <- mget(.ids, envir=get(gsub(".db", "SYMBOL", orgDbName)), ifnotfound=NA)
			.sym[is.na(.sym)] <- .ids[is.na(.sym)]
			paste(.sym, collapse=";")
		})
		summary <- summary(over)
		if(nrow(summary)>1) summary$Symbols <- glist[as.character(summary[, 1])]
		summary
	})
	names(res) <- onto
	keggAnn <- get(gsub(".db", "PATH", orgDbName))
	universe <- Lkeys(keggAnn)
	param <- new("KEGGHyperGParams",
				 geneIds=entrezIDs,
				 universeGeneIds=universe,
				 annotation=orgDbName,
				 categoryName="KEGG",
				 pvalueCutoff=pvalueCutoff,
				 testDirection="over")
	over <- hyperGTest(param)
	kegg <- summary(over)
	glist <- geneIdsByCategory(over)
	glist <- sapply(glist, function(.ids) {
		.sym <- mget(.ids, envir=get(gsub(".db", "SYMBOL", orgDbName)), ifnotfound=NA)
		.sym[is.na(.sym)] <- .ids[is.na(.sym)]
		paste(.sym, collapse=";")
	})
	kegg$Symbols <- glist[as.character(kegg$KEGGID)]
	res[["kegg"]] <- kegg
	res
}
 
go <- enrich(entrezIDs, "org.Hs.eg.db", .05)

R的内存管理

当大数据量到来的时候,使用R分析会显得力不从心,最主要的问题在于内存。因为在大多数情况下,R需要将数据全部读入内存,然后再将数据做为一个整体对象来处理。同时,R的一些数据结构本身也需要大量的内存,比如list。一些数据读入R后,会占用几倍甚至几十倍的内存,比如将一个bigwig文件import成GRanges对象。

R的内存管理和java类似的,使用了Garbage collection。但[……]

Read more

使用TranscriptDb获取所有的exons以及introns

> library(TxDb.Mmusculus.UCSC.mm10.knownGene)
Loading required package: GenomicFeatures
> txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
> library(Gviz)
> exons <- GeneRegionTrack(txdb)
> exons <- exons@range
> library(Mus.musculus)
Loading required package: OrganismDbi
Loading required package: GO.db
Loading required package: org.Mm.eg.db
> symbol <- select(Mus.musculus, exons$symbol, "SYMBOL", "TXNAME")
> exons$symbol <- symbol[match(exons$symbol, symbol[,1]), 2]
> exons <- unique(exons)
> length(exons)
[1] 316926
> head(exons)
GRanges with 6 ranges and 7 metadata columns:
      seqnames             ranges strand |     feature          id      exon  transcript        gene      symbol   density
         <Rle>          <IRanges>  <Rle> | <character> <character> <integer> <character> <character> <character> <numeric>
  [1]     chr1 [4807893, 4807913]      + |        utr5     unknown         1  uc007afh.1       18777      Lypla1         1
  [2]     chr1 [4807893, 4807928]      + |        utr5     unknown         1  uc007afg.1       18777      Lypla1         1
  [3]     chr1 [4807914, 4807982]      + |         CDS     unknown         1  uc007afh.1       18777      Lypla1         1
  [4]     chr1 [4807929, 4807982]      + |         CDS     unknown         2  uc007afg.1       18777      Lypla1         1
  [5]     chr1 [4808455, 4808486]      + |         CDS     unknown         3  uc007afg.1       18777      Lypla1         1
  [6]     chr1 [4828584, 4828649]      + |         CDS     unknown         4  uc007afg.1       18777      Lypla1         1
  ---
  seqlengths:
                   chr1 chr1_GL456210_random chr1_GL456211_random chr1_GL456212_random ...                 chrX chrX_GL456233_random                 chrY
                     NA                   NA                   NA                   NA ...                   NA                   NA                   NA
> introns <- intronsByTranscript(txdb, use.names=TRUE)
> len <- sapply(introns, length)
> introns <- introns[len>0]
> len <- len[len>0]
> intronNames <- names(introns)
> intronNames <- rep(intronNames, len)
> introns <- unlist(introns)
> introns$feature <- "intron"
> introns$transcript <- intronNames
> symbol <- select(Mus.musculus, introns$transcript, "SYMBOL", "TXNAME")
> introns$symbol <- symbol[match(introns$transcript, symbol[,1]), 2]
> head(introns)
GRanges with 6 ranges and 3 metadata columns:
             seqnames             ranges strand |     feature  transcript      symbol
                <Rle>          <IRanges>  <Rle> | <character> <character> <character>
  uc007afg.1     chr1 [4807983, 4808454]      + |      intron  uc007afg.1      Lypla1
  uc007afg.1     chr1 [4808487, 4828583]      + |      intron  uc007afg.1      Lypla1
  uc007afg.1     chr1 [4828650, 4830267]      + |      intron  uc007afg.1      Lypla1
  uc007afg.1     chr1 [4830316, 4832310]      + |      intron  uc007afg.1      Lypla1
  uc007afg.1     chr1 [4832382, 4837000]      + |      intron  uc007afg.1      Lypla1
  uc007afg.1     chr1 [4837075, 4839386]      + |      intron  uc007afg.1      Lypla1
  ---
  seqlengths:
                   chr1                 chr2                 chr3                 chr4 ...       chrUn_GL456394       chrUn_GL456396       chrUn_JH584304
              195471971            182113224            160039680            156508116 ...                24323                21240               114452
> library(rtracklayer)
> export(exons, "mm10.exons.bed", format="BED")
> names(introns) <- paste(introns$transcript, introns$symbol, sep=",")
> export(introns, "mm10.introns.bed", format="BED")
> ##run bedtools subtract -s -a mm10.introns.bed -b mm10.exons.bed > mm10.introns.substracted.exons.bed
> introns.s <- read.delim("mm10.introns.substracted.exons.bed", header=F)
> introns.s <- GRanges(introns.s$V1, IRanges(introns.s$V2, introns.s$V3), strand=introns.s$V6, feature=introns.s$V4)
> feature <- do.call(rbind, strsplit(as.character(introns.s$feature), ","))
> head(feature)
     [,1]         [,2]    
[1,] "uc007afg.1" "Lypla1"
[2,] "uc007afg.1" "Lypla1"
[3,] "uc007afg.1" "Lypla1"
[4,] "uc007afg.1" "Lypla1"
[5,] "uc007afg.1" "Lypla1"
[6,] "uc007afg.1" "Lypla1"
> introns.s$transcript <- feature[,1]
> introns.s$symbol <- feature[,2]
> introns.s$feature <- "intron"
> mcols(exons) <- mcols(exons)[,c("feature", "transcript", "symbol")]
> annoData <- c(exons, introns.s)
> annoData <- annoData[order(as.character(seqnames(annoData)), start(annoData))]
> head(annoData)
GRanges with 6 ranges and 3 metadata columns:
      seqnames             ranges strand |     feature  transcript      symbol
         <Rle>          <IRanges>  <Rle> | <character> <character> <character>
  [1]     chr1 [3205904, 3207317]      - |       ncRNA  uc007aet.1        <NA>
  [2]     chr1 [3207317, 3213438]      - |      intron  uc007aet.1          NA
  [3]     chr1 [3213439, 3215632]      - |       ncRNA  uc007aet.1        <NA>
  [4]     chr1 [3214482, 3216021]      - |        utr3  uc007aeu.1        Xkr4
  [5]     chr1 [3216022, 3216968]      - |         CDS  uc007aeu.1        Xkr4
  [6]     chr1 [3216968, 3421701]      - |      intron  uc007aeu.1        Xkr4
  ---
  seqlengths:
                   chr1 chr1_GL456210_random chr1_GL456211_random chr1_GL456212_random ...                 chrX chrX_GL456233_random                 chrY
              195471971               169725               241735               153618 ...            171031299               336933             91744698