Purpose

We want to put numbers on visual impressions such as:

  • There is a lot of scrambling.
  • Aligned regions stay on their chromosome arm.
  • Aligned regions are more “off the diagonal” on short arms than on long arms.

** Figure 1D’s scrambling index table is computed here under the name strand randomisation indexes.**

Load R pacakges and data

See vignette("LoadGenomicBreaks", package = "OikScrambling") for how the different GBreaks objects are prepared.

Declare service functions

pairs2table6x3 <- function (x) {
  x <- as.list(x)
  cbind(
    Oki = c(Oki = NA,        Osa = x$Osa_Oki, Bar = x$Bar_Oki),
    Kum = c(Oki = x$Oki_Kum, Osa = x$Osa_Kum, Bar = x$Bar_Kum),
    Osa = c(Oki = x$Oki_Osa, Osa = NA,        Bar = x$Bar_Osa),
    Aom = c(Oki = x$Oki_Aom, Osa = x$Osa_Aom, Bar = x$Bar_Aom),
    Bar = c(Oki = x$Oki_Bar, Osa = x$Osa_Bar, Bar = NA       ),
    Nor = c(Oki = x$Oki_Nor, Osa = x$Osa_Nor, Bar = x$Bar_Nor)
  )
}

pairs2Kable6x3 <- function (df) df |> pairs2table6x3() |> round(2) |> knitr::kable()

Strand randomisation index

See ?GenomicBreaks::strand_randomisation_index for details.

strand_randomisation_index
## function (gb) 
## {
##     if (length(gb) == 0) 
##         return(numeric(0))
##     if (length(gb) == 1) 
##         return(1)
##     gbl <- split(gb, seqnames(gb), drop = TRUE)
##     idx <- sapply(gbl, function(x) {
##         onPlus <- sum(width(x[strand(x) == "+"]))
##         onMinus <- sum(width(x[strand(x) == "-"]))
##         abs((onPlus - onMinus)/(onPlus + onMinus))
##     })
##     weighted.mean(idx, sum(width(gbl)))
## }
## <bytecode: 0x5631f449d200>
## <environment: namespace:GenomicBreaks>

The computation of strand randomisation indices is not symmetric: the results differ a bit when the roles of target and query genomes are swapped.

SRIs         <- BiocParallel::bplapply(gbs,                 strand_randomisation_index  )
SRIS_swapped <- BiocParallel::bplapply(gbs, \(g) swap(g) |> strand_randomisation_index())

Table 1

We averaged the numbers computed with and without swapping target and query genomes.

as.list(1 / 2 * (unlist(SRIs) + unlist(SRIS_swapped))) |> pairs2Kable6x3()
Oki Kum Osa Aom Bar Nor
Oki NA 0.67 0.04 0.08 0.10 0.18
Osa 0.04 0.06 NA 0.54 0.23 0.28
Bar 0.10 0.11 0.23 0.22 NA 0.50

Unidirectional computation

The raw data is shown here to prove that the differences do not affect general conclusions.

SRIs         |> pairs2Kable6x3()
Oki Kum Osa Aom Bar Nor
Oki NA 0.52 0.04 0.06 0.10 0.05
Osa 0.05 0.06 NA 0.43 0.23 0.10
Bar 0.10 0.09 0.23 0.19 NA 0.18
SRIS_swapped |> pairs2Kable6x3()
Oki Kum Osa Aom Bar Nor
Oki NA 0.83 0.05 0.10 0.10 0.31
Osa 0.04 0.06 NA 0.66 0.23 0.47
Bar 0.10 0.14 0.23 0.25 NA 0.82

Synteny index

Ad-hoc index measuring to what extent a scaffold of the target genome is mostly aligned to a single scaffold in the query genome. See ?GenomicBreaks::synteny_index for details. One limitation to the use of this index is that it requires that at least the query genome is a complete chromosome assembly.

synteny_index
## function (gb) 
## {
##     if (length(gb) == 0) 
##         return(numeric(0))
##     if (length(gb) == 1) 
##         return(1)
##     gbl <- split(gb, seqnames(gb), drop = TRUE)
##     synIdx <- sapply(gbl, function(x) {
##         alnOnMainHit <- tail(sort(sum(width(split(x, seqnames(x$query))))), 
##             1)
##         alnOnMainHit/sum(width(x))
##     })
##     weighted.mean(synIdx, sum(width(gbl)))
## }
## <bytecode: 0x5631ef2a6748>
## <environment: namespace:GenomicBreaks>
BiocParallel::bplapply(gbs,                 synteny_index  ) |> pairs2Kable6x3()
Oki Kum Osa Aom Bar Nor
Oki NA 0.51 0.94 0.65 0.94 0.17
Osa 0.94 0.52 NA 0.61 0.97 0.17
Bar 0.94 0.53 0.97 0.66 NA 0.16
BiocParallel::bplapply(gbs, \(g) swap(g) |> synteny_index()) |> pairs2Kable6x3()
Oki Kum Osa Aom Bar Nor
Oki NA 0.97 0.94 0.94 0.94 0.93
Osa 0.94 0.94 NA 0.97 0.97 0.96
Bar 0.94 0.94 0.97 0.97 NA 0.95

Correlation

Ad-hoc index measuring the correlation of the coordinates of the syntenic alignments in scaffolds of a target genome and their best match in the query genome. See ?GenomicBreaks::correlation_index for details. It is intended to be more robust to the presence of uncollapsed haplotypes in the query genome but probably needs further testing.

correlation_index
## function (gb) 
## {
##     if (length(gb) == 0) 
##         return(numeric(0))
##     if (length(gb) == 1) 
##         return(1)
##     gbl <- split(gb, seqnames(gb), drop = TRUE)
##     corIdx <- sapply(gbl, function(x) {
##         bestQuery <- names(tail(sort(table(seqnames(x$query))), 
##             1))
##         x <- x[seqnames(x$query) == bestQuery]
##         if (length(x) == 1) 
##             return(1)
##         weights <- width(x)
##         x <- resize(x, fix = "center", width = 1)
##         x$query <- resize(x$query, fix = "center", width = 1)
##         weights::wtd.cor(start(x), start(x$query), weights)[1]
##     })
##     weighted.mean(corIdx, sum(width(gbl)))
## }
## <bytecode: 0x5631de11af38>
## <environment: namespace:GenomicBreaks>
BiocParallel::bplapply(gbs,                 correlation_index  ) |> pairs2Kable6x3()
Oki Kum Osa Aom Bar Nor
Oki NA 0.89 0.48 -0.03 0.44 -0.19
Osa 0.48 -0.06 NA -0.43 0.37 0.30
Bar 0.44 -0.01 0.36 0.07 NA 0.36
BiocParallel::bplapply(gbs, \(g) swap(g) |> correlation_index()) |> pairs2Kable6x3()
Oki Kum Osa Aom Bar Nor
Oki NA 0.51 0.48 -0.01 0.44 0.01
Osa 0.48 -0.01 NA -0.22 0.36 0.01
Bar 0.44 0.03 0.36 0.11 NA 0.20

Gene Order Conservation

See ?GenomicBreaks::GOC for details.

GOC
## function (gb, vicinity = 4, debug = FALSE) 
## {
##     if (length(gb) == 0) 
##         return(numeric(0))
##     if (length(gb) == 1) 
##         return(1)
##     gb <- sort(gb, ignore.strand = TRUE)
##     gb <- flagColinearAlignments(gb, details = TRUE)
##     gb$GOCi <- abs(gb$qfoll) <= vicinity
##     if (debug) 
##         return(gb)
##     gb[is.na(gb$tfoll)] <- NULL
##     sum(gb$GOCi %in% TRUE)/length(gb)
## }
## <bytecode: 0x5632001b2478>
## <environment: namespace:GenomicBreaks>
BiocParallel::bplapply(gbs,                 GOC  ) |> pairs2Kable6x3()
Oki Kum Osa Aom Bar Nor
Oki NA 0.76 0.84 0.85 0.85 0.82
Osa 0.84 0.84 NA 0.78 0.89 0.85
Bar 0.85 0.85 0.89 0.89 NA 0.70
BiocParallel::bplapply(gbs, \(g) swap(g) |> GOC()) |> pairs2Kable6x3()
Oki Kum Osa Aom Bar Nor
Oki NA 0.76 0.84 0.84 0.84 0.84
Osa 0.84 0.84 NA 0.77 0.89 0.88
Bar 0.85 0.85 0.89 0.89 NA 0.73

Strand proportion index

Attempts to use pure proportions lead to scores drifting towards 0.5 when comparing fragmented assemblies, as the fragments are not oriented.

strand_proportion_index <- function(gb) {
  gbl <- split(gb, droplevels(seqnames(gb)))
  # Calculate an index for each sequence feature
  idx <- sapply(gbl, \(x) {
    onPlus  <- sum(width(x[strand(x) == '+']))
    onMinus <- sum(width(x[strand(x) == '-']))
    onPlus / (onPlus + onMinus)
  })
  # Average by the sum of all widths
  weighted.mean(idx, sum(width(gbl)))
}
BiocParallel::bplapply(gbs,                 strand_proportion_index  ) |> pairs2Kable6x3()
Oki Kum Osa Aom Bar Nor
Oki NA 0.73 0.49 0.47 0.47 0.49
Osa 0.49 0.50 NA 0.41 0.38 0.46
Bar 0.47 0.46 0.38 0.54 NA 0.59
BiocParallel::bplapply(gbs, \(g) swap(g) |> strand_proportion_index()) |> pairs2Kable6x3()
Oki Kum Osa Aom Bar Nor
Oki NA 0.73 0.49 0.47 0.47 0.49
Osa 0.49 0.50 NA 0.41 0.38 0.46
Bar 0.47 0.46 0.38 0.54 NA 0.59

Strand proportion majority index

One possible solution to the problem above could be to report the highest proportion, so that the value returned is not sensitive to orientation. But it scales the index between 0.5 and 1, which may be counter-intuitive.

strand_proportion_majority_index <- function(gb) {
  gbl <- split(gb, droplevels(seqnames(gb)))
  # Calculate an index for each sequence feature
  idx <- sapply(gbl, \(x) {
    onPlus  <- sum(width(x[strand(x) == '+']))
    onMinus <- sum(width(x[strand(x) == '-']))
    p <- onPlus / (onPlus + onMinus)
    if(p < 0.5) p <- 1 - p
    p
  })
  # Average by the sum of all widths
  weighted.mean(idx, sum(width(gbl)))
}
BiocParallel::bplapply(gbs,                 strand_proportion_majority_index  ) |> pairs2Kable6x3()
Oki Kum Osa Aom Bar Nor
Oki NA 0.76 0.52 0.53 0.55 0.53
Osa 0.52 0.53 NA 0.72 0.62 0.55
Bar 0.55 0.54 0.62 0.59 NA 0.59
BiocParallel::bplapply(gbs, \(g) swap(g) |> strand_proportion_majority_index()) |> pairs2Kable6x3()
Oki Kum Osa Aom Bar Nor
Oki NA 0.91 0.52 0.55 0.55 0.66
Osa 0.52 0.53 NA 0.83 0.62 0.74
Bar 0.55 0.57 0.62 0.63 NA 0.91