如何使用R来按照bin(比如说每200碱基宽)来计数覆盖度并生成一个bedgraph文件呢?
这个其实就是所谓的resample问题。可以使用?tileGenome来解决。
library(GenomicRanges) library(BSgenome.Hsapiens.UCSC.hg19) ## human tiles <- tileGenome(seqinfo(Hsapiens), tilewidth=200, cut.last.tile.in.chrom=TRUE) ## tail(tiles[seqnames(tiles)=="chr1"]) ## head(tiles[seqnames(tiles)=="chr2"]) ## library(Rsamtools) library(GenomicAlignments) binnedAverage <- function(bins, numvar, mcolname) { stopifnot(is(bins, "GRanges")) stopifnot(is(numvar, "RleList")) stopifnot(identical(seqlevels(bins), names(numvar))) bins_per_chrom <- split(ranges(bins), seqnames(bins)) means_list <- lapply(names(numvar), function(seqname) { views <- Views(numvar[[seqname]], bins_per_chrom[[seqname]]) viewMeans(views) }) new_mcol <- unsplit(means_list, as.factor(seqnames(bins))) mcols(bins)[[mcolname]] <- new_mcol bins } library(rtracklayer) bams <- dir(pattern="*.bam$") for(bam in bams){ dat <- readGAlignmentsFromBam(bam) dat <- granges(dat) score <- coverage(dat) tiles.s <- keepSeqlevels(tiles, names(score)) bins <- binnedAverage(tiles.s, score, "score") export(bins, gsub(".bam", ".bedgraph", bam), "BedGraph") } |
注意到viewMeans是和rowMeans类似的一种函数,相应的,它也有viewSums, viewMins, viewMaxs等。
如果我们需要的是reads tags计数,而不是coverage,应该怎么办呢?
这个问题的解决思路是
1.将reads区分为两部分,一部分是不会跨窗口的,另外的是会跨窗口的。
2.将会跨窗口的按每个窗口分配一次,并使其的宽度设定为1.
3.将不会跨窗口的reads宽度设置为1.
4.计算coverage。
5.计算binnedSums。
library(GenomicRanges) library(BSgenome.Mmusculus.UCSC.mm10) ## mouse tiles <- tileGenome(seqinfo(Mmusculus), tilewidth=200, cut.last.tile.in.chrom=TRUE) ## edges <- tiles width(edges) <- 1 tail(tiles[seqnames(tiles)=="chr1"]) ## head(tiles[seqnames(tiles)=="chr2"]) ## library(Rsamtools) library(GenomicAlignments) binnedSums <- function(bins, numvar, mcolname) { stopifnot(is(bins, "GRanges")) stopifnot(is(numvar, "RleList")) stopifnot(identical(seqlevels(bins), names(numvar))) bins_per_chrom <- split(ranges(bins), seqnames(bins)) means_list <- lapply(names(numvar), function(seqname) { views <- Views(numvar[[seqname]], bins_per_chrom[[seqname]]) viewSums(views) }) new_mcol <- unsplit(means_list, as.factor(seqnames(bins))) mcols(bins)[[mcolname]] <- new_mcol bins } library(rtracklayer) bams <- dir(pattern="*.bam$") for(bam in bams){ dat <- readGAlignmentsFromBam(bam) dat <- granges(dat) ol <- findOverlaps(dat, edges) dat.ol <- dat[queryHits(ol)] dat.nol <- dat[-queryHits(ol)] dat.ols <- dat.ol ## note the bin size must larger than reads length width(dat.ols) <- 1 dat.ole <- dat.ol start(dat.ole) <- end(dat.ole) width(dat.nol) <- 1 dat <- c(dat.nol, dat.ols, dat.ole) dat <- sort(dat) score <- coverage(dat) tiles.s <- keepSeqlevels(tiles, names(score)) bins <- binnedSums(tiles.s, score, "score") bins <- bins[bins$score>0] export(bins, gsub(".bam", ".bedgraph", bam), "BedGraph") } |