I am using the CNEr package to detect the conserved non-coding elements in Oikopleura genomes.

As a data source, I use I use the postmasked alignment files produced by the pairwise alignment pipeline. As last-split enforces a one-to-one relationship this data is more stringent than the “nets” expected by CNEr. Also, there is no whole-genome duplication in the species we study. Therefore I do not need to run the realignment step detailed in CNEr’s vignette.

## 
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'GenomicBreaks'
## The following object is masked from 'package:CNEr':
## 
##     swap
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
genomes     <- OikScrambling:::loadAllGenomes(compat = FALSE)
## Warning in runHook(".onLoad", env, package.lib, package): input string
## 'Génoscope' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid
## UTF-8

## Warning in runHook(".onLoad", env, package.lib, package): input string
## 'Génoscope' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid
## UTF-8
annots      <- OikScrambling:::loadAllAnnotations()   |> suppressWarnings()
reps        <- OikScrambling:::loadAllRepeats(compat = FALSE)

Load the alignments in Axt format.

axt <- SimpleList()
readAxt_ <- function(file, genomeT, genomeQ=NULL, basename="/bucket/LuscombeU/common/Breakpoints/Alignments/Oidioi_pairwise_v3/") {
  if(is.null(genomeQ)) {
    axt <- readAxt(paste0(basename, file),
                   system.file("extdata", package=genomeT, "single_sequences.2bit"))
    seqinfo(first(axt)) <- seqinfo(getBSgenome(genomeT))
    second(axt) <- forceSeqLengths(second(axt))
  } else {
    axt <-  readAxt(paste0(basename, file),
                    system.file("extdata", package=genomeT, "single_sequences.2bit"),
                    system.file("extdata", package=genomeQ, "single_sequences.2bit"))
    seqinfo(first(axt))  <- seqinfo(getBSgenome(genomeT))
    seqinfo(second(axt)) <- seqinfo(getBSgenome(genomeQ))
  }
  axt
}

axt$Oki_Osa <- readAxt_("OKI2018_I69_1.0__OSKA2016v1.9.axt.gz", "BSgenome.Oidioi.OIST.OKI2018.I69", "BSgenome.Oidioi.OIST.OSKA2016v1.9")
## The number of axt files 1
## The number of axt alignments is 30852
axt$Oki_Bar <- readAxt_("OKI2018_I69_1.0__Bar2_p4.axt.gz",      "BSgenome.Oidioi.OIST.OKI2018.I69", "BSgenome.Oidioi.OIST.Bar2.p4")
## The number of axt files 1
## The number of axt alignments is 30841
axt$Oki_Kum <- readAxt_("OKI2018_I69_1.0__KUM-M3-7f.axt.gz",    "BSgenome.Oidioi.OIST.OKI2018.I69", "BSgenome.Oidioi.OIST.KUM.M3.7f")
## The number of axt files 1
## The number of axt alignments is 15086
axt$Oki_Aom <- readAxt_("OKI2018_I69_1.0__AOM-5-5f.axt.gz",     "BSgenome.Oidioi.OIST.OKI2018.I69", "BSgenome.Oidioi.OIST.AOM.5.5f")
## The number of axt files 1
## The number of axt alignments is 32398
axt$Oki_Nor <- readAxt_("OKI2018_I69_1.0__OdB3.axt.gz",         "BSgenome.Oidioi.OIST.OKI2018.I69", "BSgenome.Oidioi.genoscope.OdB3")
## The number of axt files 1
## The number of axt alignments is 30055
axt$Oki_Olo <- readAxt_("OKI2018_I69_1.0__O_lon.axt.gz",        "BSgenome.Oidioi.OIST.OKI2018.I69")
## The number of axt files 1
## The number of axt alignments is 7763
axt$Oki_Ova <- readAxt_("OKI2018_I69_1.0__O_van.axt.gz",        "BSgenome.Oidioi.OIST.OKI2018.I69")
## The number of axt files 1
## The number of axt alignments is 17635
axt$Oki_Oal <- readAxt_("OKI2018_I69_1.0__O_alb.axt.gz",        "BSgenome.Oidioi.OIST.OKI2018.I69")
## The number of axt files 1
## The number of axt alignments is 15989
axt$Osa_Oki <- readAxt_("OSKA2016v1.9__OKI2018_I69_1.0.axt.gz", "BSgenome.Oidioi.OIST.OSKA2016v1.9", "BSgenome.Oidioi.OIST.OKI2018.I69")
## The number of axt files 1
## The number of axt alignments is 31322
axt$Osa_Bar <- readAxt_("OSKA2016v1.9__Bar2_p4.axt.gz",         "BSgenome.Oidioi.OIST.OSKA2016v1.9", "BSgenome.Oidioi.OIST.Bar2.p4")
## The number of axt files 1
## The number of axt alignments is 24430
axt$Osa_Aom <- readAxt_("OSKA2016v1.9__AOM-5-5f.axt.gz",        "BSgenome.Oidioi.OIST.OSKA2016v1.9", "BSgenome.Oidioi.OIST.AOM.5.5f")
## The number of axt files 1
## The number of axt alignments is 15261
axt$Osa_Nor <- readAxt_("OSKA2016v1.9__OdB3.axt.gz",         "BSgenome.Oidioi.OIST.OSKA2016v1.9", "BSgenome.Oidioi.genoscope.OdB3")
## The number of axt files 1
## The number of axt alignments is 24497
axt$Osa_Kum <- readAxt_("OSKA2016v1.9__KUM-M3-7f.axt.gz", "BSgenome.Oidioi.OIST.OSKA2016v1.9", "BSgenome.Oidioi.OIST.KUM.M3.7f")
## The number of axt files 1
## The number of axt alignments is 31182
axt$Oki_Olo <- readAxt_("OSKA2016v1.9__O_lon.axt.gz",        "BSgenome.Oidioi.OIST.OSKA2016v1.9")
## The number of axt files 1
## The number of axt alignments is 8031
axt$Oki_Ova <- readAxt_("OSKA2016v1.9__O_van.axt.gz",        "BSgenome.Oidioi.OIST.OSKA2016v1.9")
## The number of axt files 1
## The number of axt alignments is 17606
axt$Oki_Oal <- readAxt_("OSKA2016v1.9__O_alb.axt.gz",        "BSgenome.Oidioi.OIST.OSKA2016v1.9")
## The number of axt files 1
## The number of axt alignments is 16125
axt$Bar_Oki <- readAxt_("Bar2_p4__OKI2018_I69_1.0.axt.gz",      "BSgenome.Oidioi.OIST.Bar2.p4",      "BSgenome.Oidioi.OIST.OKI2018.I69")
## The number of axt files 1
## The number of axt alignments is 32265
axt$Bar_Osa <- readAxt_("Bar2_p4__OSKA2016v1.9.axt.gz",      "BSgenome.Oidioi.OIST.Bar2.p4",      "BSgenome.Oidioi.OIST.OSKA2016v1.9")
## The number of axt files 1
## The number of axt alignments is 24851
axt$Bar_Nor <- readAxt_("Bar2_p4__OdB3.axt.gz",      "BSgenome.Oidioi.OIST.Bar2.p4",      "BSgenome.Oidioi.genoscope.OdB3")
## The number of axt files 1
## The number of axt alignments is 15184
axt$Bar_Kum <- readAxt_("Bar2_p4__KUM-M3-7f.axt.gz", "BSgenome.Oidioi.OIST.Bar2.p4", "BSgenome.Oidioi.OIST.KUM.M3.7f")
## The number of axt files 1
## The number of axt alignments is 31557
axt$Bar_Aom <- readAxt_("Bar2_p4__AOM-5-5f.axt.gz", "BSgenome.Oidioi.OIST.Bar2.p4", "BSgenome.Oidioi.OIST.AOM.5.5f")
## The number of axt files 1
## The number of axt alignments is 24717
axt$Bar_Olo <- readAxt_("Bar2_p4__O_lon.axt.gz",        "BSgenome.Oidioi.OIST.Bar2.p4")
## The number of axt files 1
## The number of axt alignments is 7818
axt$Bar_Ova <- readAxt_("Bar2_p4__O_van.axt.gz",        "BSgenome.Oidioi.OIST.Bar2.p4")
## The number of axt files 1
## The number of axt alignments is 17467
axt$Bar_Oal <- readAxt_("Bar2_p4__O_alb.axt.gz",        "BSgenome.Oidioi.OIST.Bar2.p4")
## The number of axt files 1
## The number of axt alignments is 15499
GPairs2GBreaks <- function(GPairs, fix = TRUE) {
  if(isTRUE(fix))
    GPairs <- fixCoordinates(GPairs)
  gb <- GBreaks(target = granges(first(GPairs)), query = granges(second(GPairs)))
  strand(gb) <- strand(gb$query)
  strand(gb$query) <- "*"
  mcols(gb) <- cbind(mcols(gb), mcols(GPairs))
  sort(gb, ignore.strand = TRUE)
}

ceScan_to_GB <- function (axt, tFilter = NULL, qFilter = NULL, window, identity) {
  axt <- fixCoordinates(axt)
  cne <- ceScan(axt, tFilter = tFilter, qFilter = qFilter, window = window, identity = identity)
  GPairs2GBreaks(cne[[1]], fix = FALSE)
}
## CNEs are filtered against coding sequences
cne <- SimpleList()
cne$Oki_Osa <- ceScan_to_GB(axt$Oki_Osa, window = 50L, identity = 50L,
                            tFilter = exons(annots$Oki), qFilter = exons(annots$Osa))
# CE are not filtered against coding sequences
calculateCEs <- function(window, identity)
  ce <- lapply(axt, ceScan_to_GB, window = window, identity = identity) |> SimpleList()

ces <- SimpleList()
ces$ce_200_200 <- calculateCEs(200L, 200L)
ces$ce_200_198 <- calculateCEs(200L, 198L)
ces$ce_200_196 <- calculateCEs(200L, 196L)
ces$ce_200_194 <- calculateCEs(200L, 194L)
ces$ce_100_100 <- calculateCEs(100L, 100L)
ces$ce_100__99 <- calculateCEs(100L, 99L)
ces$ce_100__98 <- calculateCEs(100L, 98L)
ces$ce_100__97 <- calculateCEs(100L, 97L)
ces$ce_100__96 <- calculateCEs(100L, 96L)
ces$ce_100__95 <- calculateCEs(100L, 95L)
ces$ce__50__50 <- calculateCEs(50L, 50L)
ces$ce__50__49 <- calculateCEs(50L, 49L)
ces$ce__50__48 <- calculateCEs(50L, 48L)
ces$ce__50__47 <- calculateCEs(50L, 47L)
sapply(ces, sapply, length)
##         ce_200_200 ce_200_198 ce_200_196 ce_200_194 ce_100_100 ce_100__99
## Oki_Osa          2         25         62        117        234        647
## Oki_Bar          1         16         51        109        204        597
## Oki_Kum      29885      41845      48171      49057      97305     101081
## Oki_Aom          2         27         64        114        226        639
## Oki_Nor          1         15         51        110        190        558
## Oki_Olo          0          0          0          0          0          0
## Oki_Ova          0          1          1          1          1          1
## Oki_Oal          0          0          0          0          1          1
## Osa_Oki          2         25         62        118        234        646
## Osa_Bar         96        739       2069       4108       3560       9155
## Osa_Aom      29237      45308      46894      44721     101662     102365
## Osa_Nor         99        742       2048       4113       3536       8966
## Osa_Kum          4         25         60        117        228        660
## Bar_Oki          1         16         52        111        203        597
## Bar_Osa         94        737       2065       4098       3560       9148
## Bar_Nor      36071      45980      43697      38403     112235      97265
## Bar_Kum          1         15         49        108        198        601
## Bar_Aom        106        754       2094       4135       3592       9213
## Bar_Olo          0          0          0          0          0          0
## Bar_Ova          0          0          0          0          0          0
## Bar_Oal          0          0          0          0          0          0
##         ce_100__98 ce_100__97 ce_100__96 ce_100__95 ce__50__50 ce__50__49
## Oki_Osa       1242       2026       3022       4237       4357      10285
## Oki_Bar       1118       1873       2866       4109       4085       9880
## Oki_Kum      98635      91482      83044      74344     244955     185728
## Oki_Aom       1250       2034       3019       4251       4348      10255
## Oki_Nor       1083       1774       2735       3903       3859       9346
## Oki_Olo          2          2          5          9          5         24
## Oki_Ova          1          1          1          3          2         11
## Oki_Oal          1          2          2          3          7         20
## Osa_Oki       1244       2028       3023       4231       4347      10274
## Osa_Bar      16449      25776      36947      49377      36292      79678
## Osa_Aom      90748      78582      68514      60606     243426     163295
## Osa_Nor      16224      25183      36241      48228      35519      77553
## Osa_Kum       1232       2026       3044       4288       4364      10314
## Bar_Oki       1118       1875       2864       4109       4089       9874
## Bar_Osa      16442      25767      36928      49371      36291      79649
## Bar_Nor      78592      63719      52874      45519     245136     135911
## Bar_Kum       1137       1882       2854       4156       4078       9927
## Bar_Aom      16555      25986      37273      49727      36293      79946
## Bar_Olo          0          1          1          5          3         25
## Bar_Ova          0          0          0          2          2         11
## Bar_Oal          0          1          1          2          8         14
##         ce__50__48 ce__50__47
## Oki_Osa      18999      30813
## Oki_Bar      18318      30050
## Oki_Kum     140344     108188
## Oki_Aom      19000      30929
## Oki_Nor      17408      28535
## Oki_Olo         95        235
## Oki_Ova         34        121
## Oki_Oal         55        140
## Osa_Oki      18994      30811
## Osa_Bar     123621     155600
## Osa_Aom     113985      87658
## Osa_Nor     120232     151103
## Osa_Kum      18939      30789
## Bar_Oki      18321      30044
## Bar_Osa     123604     155535
## Bar_Nor      86383      63479
## Bar_Kum      18359      30036
## Bar_Aom     123991     156036
## Bar_Olo         71        188
## Bar_Ova         38        130
## Bar_Oal         46        131
saveRDS(ces, "CEs.Rds")

Export the results for supplemental material.

dir.create("CEs_BED", showWarnings = FALSE)
export_CEs_to_BED <- function(gr, thresholdname, genomepairname) {
  x <- ces[[thresholdname]][[genomepairname]]
  names(x) <- x$cigar
  rtracklayer::export(x, paste0("CEs_BED/", thresholdname, "_", genomepairname, ".bed"))
}
for (thresholdname in names(ces))
  for (genomepairname in names(ces[[thresholdname]]))
    export_CEs_to_BED(ces, thresholdname, genomepairname)