vignettes/TFBS.Rmd
TFBS.Rmd
We skip all computations if pwmMatchesOki.Rds
, because
it is heavy.
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
##
library(TFBSTools)
library(JASPAR2020)
genomes <- OikScrambling:::loadAllGenomes()
## Warning in runHook(".onLoad", env, package.lib, package): input string
## 'Génoscope' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid
## UTF-8
## Warning in runHook(".onLoad", env, package.lib, package): input string
## 'Génoscope' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid
## UTF-8
searchSeq2GB <- function(pwm, genomeseqs, mc.cores=1L) {
siteSet <- TFBSTools::searchSeq(pwm, genomeseqs, mc.cores = mc.cores) |> suppressWarnings()
gr <- as(siteSet, "GRanges")
# Slim the result object
gr$siteSeqs <- NULL
# Most PWMs are palindromic, so strand information has little relevance but
# bloats the object with redundant copies
strand(gr) <- '*'
unique(sort(gr, ignore.strand=TRUE))
}
Select all vertebrate entries; convert to a list of position-weight matrices.
if (! file.exists("pwmMatchesOki.Rds")) {
PFMatrixList <- getMatrixSet(JASPAR2020, list(tax_group="vertebrates"))
PFMatrixList
PWMatrixList <- toPWM(PFMatrixList)
}
Scan the Okinawa genome, save the results and save a reduced version for the Gen
if (! file.exists("pwmMatchesOki.Rds")) {
Oki_DNAStringSet <- lapply(seqnames(genomes$Oki), \(name) genomes$Oki[[name]]) |> as("DNAStringSet")
names(Oki_DNAStringSet) <- seqnames(genomes$Oki)
pwmMatchesOnOki <- lapply(PWMatrixList |> as.list(), searchSeq2GB, Oki_DNAStringSet)
# x_15_98 <- lapply(pwmMatchesOnOki, \(gr) gr[gr$relScore > 0.98 & gr$absScore > 15]) |> as("GRangesList") |> unlist() |> reduce()
# x_10_95 <- lapply(pwmMatchesOnOki, \(gr) gr[gr$relScore > 0.95 & gr$absScore > 10]) |> as("GRangesList") |> unlist() |> reduce()
# x_15_95 <- lapply(pwmMatchesOnOki, \(gr) gr[gr$relScore > 0.95 & gr$absScore > 15]) |> as("GRangesList") |> unlist() |> reduce()
# x_12_90 <- lapply(pwmMatchesOnOki, \(gr) gr[gr$relScore > 0.90 & gr$absScore > 12]) |> as("GRangesList") |> unlist() |> reduce()
# x_15_90 <- lapply(pwmMatchesOnOki, \(gr) gr[gr$relScore > 0.90 & gr$absScore > 15]) |> as("GRangesList") |> unlist() |> reduce()
# Selected this one as a compromise between lower baseline and visible peaks.
pwmMatchesOnOki_12_95 <- lapply(pwmMatchesOnOki, \(gr) gr[gr$relScore > 0.95 & gr$absScore > 12]) |> as("GRangesList") |> unlist() |> reduce()
# x_10_99 <- lapply(pwmMatchesOnOki, \(gr) gr[gr$relScore > 0.99 & gr$absScore > 10]) |> as("GRangesList") |> unlist() |> reduce()
saveRDS(pwmMatchesOnOki, "pwmMatchesOki.Rds")
saveRDS(pwmMatchesOnOki_12_95, "pwmMatchesOki_12_95.Rds")
}
if (! file.exists("pwmMatchesOsa.Rds")) {
Osa_DNAStringSet <- lapply(seqnames(genomes$Osa), \(name) genomes$Osa[[name]]) |> as("DNAStringSet")
names(Osa_DNAStringSet) <- seqnames(genomes$Osa)
pwmMatchesOnOsa <- lapply(PWMatrixList |> as.list(), searchSeq2GB, Osa_DNAStringSet)
# x_15_98 <- lapply(pwmMatchesOnOsa, \(gr) gr[gr$relScore > 0.98 & gr$absScore > 15]) |> as("GRangesList") |> unlist() |> reduce()
# x_10_95 <- lapply(pwmMatchesOnOsa, \(gr) gr[gr$relScore > 0.95 & gr$absScore > 10]) |> as("GRangesList") |> unlist() |> reduce()
# x_15_95 <- lapply(pwmMatchesOnOsa, \(gr) gr[gr$relScore > 0.95 & gr$absScore > 15]) |> as("GRangesList") |> unlist() |> reduce()
# x_12_90 <- lapply(pwmMatchesOnOsa, \(gr) gr[gr$relScore > 0.90 & gr$absScore > 12]) |> as("GRangesList") |> unlist() |> reduce()
# x_15_90 <- lapply(pwmMatchesOnOsa, \(gr) gr[gr$relScore > 0.90 & gr$absScore > 15]) |> as("GRangesList") |> unlist() |> reduce()
# Selected this one as a compromise between lower baseline and visible peaks.
pwmMatchesOnOsa_12_95 <- lapply(pwmMatchesOnOsa, \(gr) gr[gr$relScore > 0.95 & gr$absScore > 12]) |> as("GRangesList") |> unlist() |> reduce()
# x_10_99 <- lapply(pwmMatchesOnOsa, \(gr) gr[gr$relScore > 0.99 & gr$absScore > 10]) |> as("GRangesList") |> unlist() |> reduce()
saveRDS(pwmMatchesOnOsa, "pwmMatchesOsa.Rds")
saveRDS(pwmMatchesOnOsa_12_95, "pwmMatchesOsa_12_95.Rds")
}