I am using the CNEr package to detect the conserved non-coding elements in Oikopleura genomes.
As a data source, I use I use the postmasked alignment files produced
by the pairwise alignment pipeline. As last-split
enforces
a one-to-one relationship this data is more stringent than the “nets”
expected by CNEr. Also, there is no whole-genome duplication in
the species we study. Therefore I do not need to run the realignment
step detailed in CNEr’s vignette.
##
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
##
## Attaching package: 'GenomicBreaks'
## The following object is masked from 'package:CNEr':
##
## swap
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
genomes <- OikScrambling:::loadAllGenomes(compat = FALSE)
## Warning in runHook(".onLoad", env, package.lib, package): input string
## 'Génoscope' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid
## UTF-8
## Warning in runHook(".onLoad", env, package.lib, package): input string
## 'Génoscope' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid
## UTF-8
annots <- OikScrambling:::loadAllAnnotations() |> suppressWarnings()
reps <- OikScrambling:::loadAllRepeats(compat = FALSE)
Load the alignments in Axt format.
axt <- SimpleList()
readAxt_ <- function(file, genomeT, genomeQ=NULL, basename="/bucket/LuscombeU/common/Breakpoints/Alignments/Oidioi_pairwise_v3/") {
if(is.null(genomeQ)) {
axt <- readAxt(paste0(basename, file),
system.file("extdata", package=genomeT, "single_sequences.2bit"))
seqinfo(first(axt)) <- seqinfo(getBSgenome(genomeT))
second(axt) <- forceSeqLengths(second(axt))
} else {
axt <- readAxt(paste0(basename, file),
system.file("extdata", package=genomeT, "single_sequences.2bit"),
system.file("extdata", package=genomeQ, "single_sequences.2bit"))
seqinfo(first(axt)) <- seqinfo(getBSgenome(genomeT))
seqinfo(second(axt)) <- seqinfo(getBSgenome(genomeQ))
}
axt
}
axt$Oki_Osa <- readAxt_("OKI2018_I69_1.0__OSKA2016v1.9.axt.gz", "BSgenome.Oidioi.OIST.OKI2018.I69", "BSgenome.Oidioi.OIST.OSKA2016v1.9")
## The number of axt files 1
## The number of axt alignments is 30852
axt$Oki_Bar <- readAxt_("OKI2018_I69_1.0__Bar2_p4.axt.gz", "BSgenome.Oidioi.OIST.OKI2018.I69", "BSgenome.Oidioi.OIST.Bar2.p4")
## The number of axt files 1
## The number of axt alignments is 30841
axt$Oki_Kum <- readAxt_("OKI2018_I69_1.0__KUM-M3-7f.axt.gz", "BSgenome.Oidioi.OIST.OKI2018.I69", "BSgenome.Oidioi.OIST.KUM.M3.7f")
## The number of axt files 1
## The number of axt alignments is 15086
axt$Oki_Aom <- readAxt_("OKI2018_I69_1.0__AOM-5-5f.axt.gz", "BSgenome.Oidioi.OIST.OKI2018.I69", "BSgenome.Oidioi.OIST.AOM.5.5f")
## The number of axt files 1
## The number of axt alignments is 32398
axt$Oki_Nor <- readAxt_("OKI2018_I69_1.0__OdB3.axt.gz", "BSgenome.Oidioi.OIST.OKI2018.I69", "BSgenome.Oidioi.genoscope.OdB3")
## The number of axt files 1
## The number of axt alignments is 30055
axt$Oki_Olo <- readAxt_("OKI2018_I69_1.0__O_lon.axt.gz", "BSgenome.Oidioi.OIST.OKI2018.I69")
## The number of axt files 1
## The number of axt alignments is 7763
axt$Oki_Ova <- readAxt_("OKI2018_I69_1.0__O_van.axt.gz", "BSgenome.Oidioi.OIST.OKI2018.I69")
## The number of axt files 1
## The number of axt alignments is 17635
axt$Oki_Oal <- readAxt_("OKI2018_I69_1.0__O_alb.axt.gz", "BSgenome.Oidioi.OIST.OKI2018.I69")
## The number of axt files 1
## The number of axt alignments is 15989
axt$Osa_Oki <- readAxt_("OSKA2016v1.9__OKI2018_I69_1.0.axt.gz", "BSgenome.Oidioi.OIST.OSKA2016v1.9", "BSgenome.Oidioi.OIST.OKI2018.I69")
## The number of axt files 1
## The number of axt alignments is 31322
axt$Osa_Bar <- readAxt_("OSKA2016v1.9__Bar2_p4.axt.gz", "BSgenome.Oidioi.OIST.OSKA2016v1.9", "BSgenome.Oidioi.OIST.Bar2.p4")
## The number of axt files 1
## The number of axt alignments is 24430
axt$Osa_Aom <- readAxt_("OSKA2016v1.9__AOM-5-5f.axt.gz", "BSgenome.Oidioi.OIST.OSKA2016v1.9", "BSgenome.Oidioi.OIST.AOM.5.5f")
## The number of axt files 1
## The number of axt alignments is 15261
axt$Osa_Nor <- readAxt_("OSKA2016v1.9__OdB3.axt.gz", "BSgenome.Oidioi.OIST.OSKA2016v1.9", "BSgenome.Oidioi.genoscope.OdB3")
## The number of axt files 1
## The number of axt alignments is 24497
axt$Osa_Kum <- readAxt_("OSKA2016v1.9__KUM-M3-7f.axt.gz", "BSgenome.Oidioi.OIST.OSKA2016v1.9", "BSgenome.Oidioi.OIST.KUM.M3.7f")
## The number of axt files 1
## The number of axt alignments is 31182
axt$Oki_Olo <- readAxt_("OSKA2016v1.9__O_lon.axt.gz", "BSgenome.Oidioi.OIST.OSKA2016v1.9")
## The number of axt files 1
## The number of axt alignments is 8031
axt$Oki_Ova <- readAxt_("OSKA2016v1.9__O_van.axt.gz", "BSgenome.Oidioi.OIST.OSKA2016v1.9")
## The number of axt files 1
## The number of axt alignments is 17606
axt$Oki_Oal <- readAxt_("OSKA2016v1.9__O_alb.axt.gz", "BSgenome.Oidioi.OIST.OSKA2016v1.9")
## The number of axt files 1
## The number of axt alignments is 16125
axt$Bar_Oki <- readAxt_("Bar2_p4__OKI2018_I69_1.0.axt.gz", "BSgenome.Oidioi.OIST.Bar2.p4", "BSgenome.Oidioi.OIST.OKI2018.I69")
## The number of axt files 1
## The number of axt alignments is 32265
axt$Bar_Osa <- readAxt_("Bar2_p4__OSKA2016v1.9.axt.gz", "BSgenome.Oidioi.OIST.Bar2.p4", "BSgenome.Oidioi.OIST.OSKA2016v1.9")
## The number of axt files 1
## The number of axt alignments is 24851
axt$Bar_Nor <- readAxt_("Bar2_p4__OdB3.axt.gz", "BSgenome.Oidioi.OIST.Bar2.p4", "BSgenome.Oidioi.genoscope.OdB3")
## The number of axt files 1
## The number of axt alignments is 15184
axt$Bar_Kum <- readAxt_("Bar2_p4__KUM-M3-7f.axt.gz", "BSgenome.Oidioi.OIST.Bar2.p4", "BSgenome.Oidioi.OIST.KUM.M3.7f")
## The number of axt files 1
## The number of axt alignments is 31557
axt$Bar_Aom <- readAxt_("Bar2_p4__AOM-5-5f.axt.gz", "BSgenome.Oidioi.OIST.Bar2.p4", "BSgenome.Oidioi.OIST.AOM.5.5f")
## The number of axt files 1
## The number of axt alignments is 24717
axt$Bar_Olo <- readAxt_("Bar2_p4__O_lon.axt.gz", "BSgenome.Oidioi.OIST.Bar2.p4")
## The number of axt files 1
## The number of axt alignments is 7818
axt$Bar_Ova <- readAxt_("Bar2_p4__O_van.axt.gz", "BSgenome.Oidioi.OIST.Bar2.p4")
## The number of axt files 1
## The number of axt alignments is 17467
axt$Bar_Oal <- readAxt_("Bar2_p4__O_alb.axt.gz", "BSgenome.Oidioi.OIST.Bar2.p4")
## The number of axt files 1
## The number of axt alignments is 15499
GPairs2GBreaks <- function(GPairs, fix = TRUE) {
if(isTRUE(fix))
GPairs <- fixCoordinates(GPairs)
gb <- GBreaks(target = granges(first(GPairs)), query = granges(second(GPairs)))
strand(gb) <- strand(gb$query)
strand(gb$query) <- "*"
mcols(gb) <- cbind(mcols(gb), mcols(GPairs))
sort(gb, ignore.strand = TRUE)
}
ceScan_to_GB <- function (axt, tFilter = NULL, qFilter = NULL, window, identity) {
axt <- fixCoordinates(axt)
cne <- ceScan(axt, tFilter = tFilter, qFilter = qFilter, window = window, identity = identity)
GPairs2GBreaks(cne[[1]], fix = FALSE)
}
## CNEs are filtered against coding sequences
cne <- SimpleList()
cne$Oki_Osa <- ceScan_to_GB(axt$Oki_Osa, window = 50L, identity = 50L,
tFilter = exons(annots$Oki), qFilter = exons(annots$Osa))
# CE are not filtered against coding sequences
calculateCEs <- function(window, identity)
ce <- lapply(axt, ceScan_to_GB, window = window, identity = identity) |> SimpleList()
ces <- SimpleList()
ces$ce_200_200 <- calculateCEs(200L, 200L)
ces$ce_200_198 <- calculateCEs(200L, 198L)
ces$ce_200_196 <- calculateCEs(200L, 196L)
ces$ce_200_194 <- calculateCEs(200L, 194L)
ces$ce_100_100 <- calculateCEs(100L, 100L)
ces$ce_100__99 <- calculateCEs(100L, 99L)
ces$ce_100__98 <- calculateCEs(100L, 98L)
ces$ce_100__97 <- calculateCEs(100L, 97L)
ces$ce_100__96 <- calculateCEs(100L, 96L)
ces$ce_100__95 <- calculateCEs(100L, 95L)
ces$ce__50__50 <- calculateCEs(50L, 50L)
ces$ce__50__49 <- calculateCEs(50L, 49L)
ces$ce__50__48 <- calculateCEs(50L, 48L)
ces$ce__50__47 <- calculateCEs(50L, 47L)
sapply(ces, sapply, length)
## ce_200_200 ce_200_198 ce_200_196 ce_200_194 ce_100_100 ce_100__99
## Oki_Osa 2 25 62 117 234 647
## Oki_Bar 1 16 51 109 204 597
## Oki_Kum 29885 41845 48171 49057 97305 101081
## Oki_Aom 2 27 64 114 226 639
## Oki_Nor 1 15 51 110 190 558
## Oki_Olo 0 0 0 0 0 0
## Oki_Ova 0 1 1 1 1 1
## Oki_Oal 0 0 0 0 1 1
## Osa_Oki 2 25 62 118 234 646
## Osa_Bar 96 739 2069 4108 3560 9155
## Osa_Aom 29237 45308 46894 44721 101662 102365
## Osa_Nor 99 742 2048 4113 3536 8966
## Osa_Kum 4 25 60 117 228 660
## Bar_Oki 1 16 52 111 203 597
## Bar_Osa 94 737 2065 4098 3560 9148
## Bar_Nor 36071 45980 43697 38403 112235 97265
## Bar_Kum 1 15 49 108 198 601
## Bar_Aom 106 754 2094 4135 3592 9213
## Bar_Olo 0 0 0 0 0 0
## Bar_Ova 0 0 0 0 0 0
## Bar_Oal 0 0 0 0 0 0
## ce_100__98 ce_100__97 ce_100__96 ce_100__95 ce__50__50 ce__50__49
## Oki_Osa 1242 2026 3022 4237 4357 10285
## Oki_Bar 1118 1873 2866 4109 4085 9880
## Oki_Kum 98635 91482 83044 74344 244955 185728
## Oki_Aom 1250 2034 3019 4251 4348 10255
## Oki_Nor 1083 1774 2735 3903 3859 9346
## Oki_Olo 2 2 5 9 5 24
## Oki_Ova 1 1 1 3 2 11
## Oki_Oal 1 2 2 3 7 20
## Osa_Oki 1244 2028 3023 4231 4347 10274
## Osa_Bar 16449 25776 36947 49377 36292 79678
## Osa_Aom 90748 78582 68514 60606 243426 163295
## Osa_Nor 16224 25183 36241 48228 35519 77553
## Osa_Kum 1232 2026 3044 4288 4364 10314
## Bar_Oki 1118 1875 2864 4109 4089 9874
## Bar_Osa 16442 25767 36928 49371 36291 79649
## Bar_Nor 78592 63719 52874 45519 245136 135911
## Bar_Kum 1137 1882 2854 4156 4078 9927
## Bar_Aom 16555 25986 37273 49727 36293 79946
## Bar_Olo 0 1 1 5 3 25
## Bar_Ova 0 0 0 2 2 11
## Bar_Oal 0 1 1 2 8 14
## ce__50__48 ce__50__47
## Oki_Osa 18999 30813
## Oki_Bar 18318 30050
## Oki_Kum 140344 108188
## Oki_Aom 19000 30929
## Oki_Nor 17408 28535
## Oki_Olo 95 235
## Oki_Ova 34 121
## Oki_Oal 55 140
## Osa_Oki 18994 30811
## Osa_Bar 123621 155600
## Osa_Aom 113985 87658
## Osa_Nor 120232 151103
## Osa_Kum 18939 30789
## Bar_Oki 18321 30044
## Bar_Osa 123604 155535
## Bar_Nor 86383 63479
## Bar_Kum 18359 30036
## Bar_Aom 123991 156036
## Bar_Olo 71 188
## Bar_Ova 38 130
## Bar_Oal 46 131
saveRDS(ces, "CEs.Rds")
Export the results for supplemental material.
dir.create("CEs_BED", showWarnings = FALSE)
export_CEs_to_BED <- function(gr, thresholdname, genomepairname) {
x <- ces[[thresholdname]][[genomepairname]]
names(x) <- x$cigar
rtracklayer::export(x, paste0("CEs_BED/", thresholdname, "_", genomepairname, ".bed"))
}
for (thresholdname in names(ces))
for (genomepairname in names(ces[[thresholdname]]))
export_CEs_to_BED(ces, thresholdname, genomepairname)