Modifying coalescent cutoffs

knitr::opts_chunk$set(cache = TRUE, cache.lazy = FALSE)
knitr::opts_knit$set(verbose = TRUE)

Introduction

In this vignette, we examine the behaviour of the coalescing algorithm under varying thresholds of minimum alignment width. In doing so we attempt to fully describe the contiguiuty of the pairwise alignments.

The brunt of this vignette is the product of Charles’ hard work.

The core functions used here are maintained in our GenomicBreaks R package, which is fully documented at: https://oist.github.io/GenomicBreaks.

Load packages

Load R packages and data

library('OikScrambling') |> suppressPackageStartupMessages()
library('dplyr')

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:Biostrings':
## 
##     collapse, intersect, setdiff, setequal, union

## The following object is masked from 'package:XVector':
## 
##     slice

## The following objects are masked from 'package:GenomicRanges':
## 
##     intersect, setdiff, union

## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect

## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union

## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union

## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library('ggplot2')
library('pheatmap')
library('purrr')

## 
## Attaching package: 'purrr'

## The following object is masked from 'package:XVector':
## 
##     compact

## The following object is masked from 'package:GenomicRanges':
## 
##     reduce

## The following object is masked from 'package:IRanges':
## 
##     reduce

library('viridis') |> suppressPackageStartupMessages()
(genomes <- OikScrambling:::loadAllGenomes())

## Warning in runHook(".onLoad", env, package.lib, package): input string
## 'Génoscope' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid
## UTF-8

## Warning in runHook(".onLoad", env, package.lib, package): input string
## 'Génoscope' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid
## UTF-8

## List of length 6
## names(6): Oki Osa Bar Kum Aom Nor

(load("BreakPoints.Rdata"))

##  [1] "gbs"       "unal"      "coa"       "unmap"     "bri"       "tra"      
##  [7] "tra2"      "coa2"      "unmap2"    "wgo"       "longShort"

Generating coalesced alignments

The function coalesce_contigs() is the primary driver of our synteny analysis. It accepts an argument, minwidth, which controls the minimum width of segments to be considered for synteny analyses.

In the next session, I calculate these alignments at a large number of minimum width cutoffs.

# use coalesce_contigs at different cutoffs
minwidthCutoffs = c(0, 25, 50, 100, 250, 500, 750, 1000, 1250, 1500, 2000, 2500, 5000, 7500, 10000, 12500, 25000, 50000, 100000)
cutoff_coas = lapply(minwidthCutoffs, function(MinWidth) BiocParallel::bplapply(gbs, function(gb) coalesce_contigs(gb=gb, minwidth=MinWidth)) |> SimpleList()) |> SimpleList()
names(cutoff_coas) <- sapply(minwidthCutoffs, function(s) paste('coa_minWidth_', as.character(s), sep=''))
# Append uncoalesced alignment
cutoff_coas$uncoalesced <- gbs
# Reorder to put uncoalesced first
cutoff_coas <- cutoff_coas[c(20,1:19)]

Let’s make some tables to help understand what’s going on, in terms of how much is being removed, how big the syntenic regions are, and so on.

# You can then make a simple table that tabulates the number of segments at a number of cutoffs.
cutoff_coas_tables = SimpleList()
cutoff_coas_tables_map <- function(f)
  purrr::map_dfr(cutoff_coas,
    \(cutoff) purrr::map(cutoff, f)
  )
# Count number of segments. We make 0-count cells to NA to make them more obvious in the heatmap later.
cutoff_coas_tables$NumSegments <- cutoff_coas_tables_map(length)

# Show how many total bases are left after coalescing.
# Divide by 1,000,000 to convert it to Mbp. 0-counts are converted to NA.
cutoff_coas_tables$TotalAligned <- cutoff_coas_tables_map(\(gr) sum(width(gr)/1000000))
  
# Show largest segment. Combine it with the query name to see which chromosome segment is biggest.
cutoff_coas_tables$LargestSegment <- cutoff_coas_tables_map(\(gr) {
    if (length(gr) == 0) return(NA)
    max(width(gr))
  })

# Get name of largest segment, which can change based on the cutoffs.
cutoff_coas_tables$LargestSegmentName <- cutoff_coas_tables_map(\(gb) {
    if (length(gb) == 0) return(NA)
    as.character(seqnames(gb)[which.max(width(gb))])
  })

# Write a little function to relate a pair of species (A_B) to their genome size.
# named list of genome sizes (name=species, content=genome size).
genome_sizes=list(Aom=sum(seqlengths(genomes$Aom)), Bar=sum(seqlengths(genomes$Bar)),
                  Oki=sum(seqlengths(genomes$Oki)), Osa=sum(seqlengths(genomes$Osa)),
                  Kum=sum(seqlengths(genomes$Kum)), Nor=sum(seqlengths(genomes$Nor)),
                  Ply=128639392, Ros=134327879, Sav=177003750, Rob=117529544,
                  Dme=137547960, Dbu=110837441, Dsu=130045071, Dya=136883712,
                  Cin=sum(guessSeqLengths(gbs$Cni_Cin)),
                  Cel=sum(guessSeqLengths(gbs$Cbr_Cel$query)),
                  Cre=sum(guessSeqLengths(gbs$Cni_Cre$query)),
                  Cbr=sum(guessSeqLengths(gbs$Cbr_Cni)),
                  Cni=sum(guessSeqLengths(gbs$Cni_Cbr))
                  )
match_pair_with_sizes = function(pair, genome_sizes) {
  spp = unlist(strsplit(pair, "_"))
  g1 = spp[1]
  g2 = spp[2]
  gs1 = genome_sizes[[g1]]
  gs2 = genome_sizes[[g2]]
  result = list()
  result[[g1]] = gs1
  result[[g2]] = gs2
  return(result)
}
# Use the genome sizes to calculate how much of the genomes are covered.
# This coverage value is just the total alignment length divided by the mean of the two genome sizes.
cutoff_coas_tables$PercentGenomeCovered <- as.data.frame(do.call(rbind, lapply(cutoff_coas, function(cutoff) {
  cut_gb <- sapply(names(cutoff), function(pair) {
    gb <- cutoff[[pair]]
    pair_sizes <- match_pair_with_sizes(pair, genome_sizes=genome_sizes)
    mean_genome_size <- mean(unlist(pair_sizes))
    covered <- (sum(width(gb))/mean_genome_size)*100
    covered
  })
  cut_gb
})))
# Show how many bases of the original alignment are covered.
uncoalescedAliTot <- cutoff_coas_tables$TotalAligned[1,]
cutoff_coas_tables$PercentOriginalAlignment <- do.call(rbind, apply(cutoff_coas_tables$TotalAligned, 1, function(x) x/uncoalescedAliTot)) *100

Note the names of the largest segments.

(cutoff_coas_tables$LargestSegmentName)

## # A tibble: 20 × 33
##    Oki_Osa Oki_Bar Oki_Kum Oki_Aom Oki_Nor Osa_Oki Osa_Bar Osa_Kum Osa_Aom
##    <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>  
##  1 PAR     PAR     XSR     PAR     PAR     PAR     PAR     PAR     XSR    
##  2 PAR     chr2    XSR     XSR     XSR     XSR     Chr2    XSR     XSR    
##  3 PAR     chr2    XSR     XSR     XSR     XSR     Chr2    XSR     XSR    
##  4 PAR     chr2    XSR     XSR     XSR     XSR     Chr2    XSR     XSR    
##  5 PAR     chr2    XSR     XSR     XSR     XSR     Chr2    XSR     XSR    
##  6 PAR     chr2    XSR     XSR     XSR     XSR     XSR     XSR     XSR    
##  7 chr2    chr2    XSR     chr2    XSR     XSR     Chr2    XSR     XSR    
##  8 chr2    chr2    XSR     chr2    XSR     XSR     XSR     XSR     XSR    
##  9 chr2    XSR     XSR     XSR     XSR     XSR     XSR     XSR     XSR    
## 10 chr2    chr2    XSR     XSR     XSR     XSR     XSR     XSR     XSR    
## 11 XSR     chr2    XSR     XSR     XSR     XSR     XSR     XSR     XSR    
## 12 XSR     XSR     XSR     XSR     XSR     XSR     XSR     XSR     XSR    
## 13 XSR     XSR     XSR     YSR     XSR     XSR     XSR     XSR     XSR    
## 14 PAR     chr2    XSR     PAR     XSR     PAR     XSR     PAR     XSR    
## 15 XSR     chr1    XSR     XSR     XSR     XSR     XSR     XSR     XSR    
## 16 XSR     chr1    XSR     XSR     PAR     XSR     XSR     XSR     XSR    
## 17 chr2    chr1    XSR     chr2    chr2    Chr2    XSR     XSR     XSR    
## 18 PAR     PAR     XSR     PAR     PAR     PAR     XSR     PAR     XSR    
## 19 NA      NA      XSR     NA      NA      NA      NA      NA      XSR    
## 20 NA      NA      NA      NA      NA      NA      NA      NA      XSR    
## # ℹ 24 more variables: Osa_Nor <chr>, Bar_Oki <chr>, Bar_Osa <chr>,
## #   Bar_Kum <chr>, Bar_Aom <chr>, Bar_Nor <chr>, Ply_Ros <chr>, Ply_Rob <chr>,
## #   Ply_Sav <chr>, Ply_Oki <chr>, Rob_Ros <chr>, Rob_Ply <chr>, Rob_Sav <chr>,
## #   Rob_Oki <chr>, Dme_Dbu <chr>, Dme_Dsu <chr>, Dme_Dya <chr>, Dme_Dma <chr>,
## #   Cni_Cbr <chr>, Cni_Cre <chr>, Cni_Cin <chr>, Cbr_Cni <chr>, Cbr_Cre <chr>,
## #   Cbr_Cel <chr>

Plotting features at different cutoffs

pheatmap_ <- function(x, main)
  pheatmap(x, col=magma(100), cluster_cols=F, cluster_row=F, gaps_col=c(15,23,27), angle_col = 90, main=main, display_numbers=T, number_color='#d9d9d9', number_format="%.0f", fontsize_number=6)

Number of segments

# Number of segments
pheatmap_(cutoff_coas_tables$NumSegments, main='Number of alignments (uncoalesced) or syntenic regions (coalesced)')

Total aligned

pheatmap_(cutoff_coas_tables$TotalAligned, main='Sum of all alignment widths (Mbp)')

Largest segment

pheatmap_(cutoff_coas_tables$LargestSegment, main='Size of largest segment (Mbp)')

Percent of genome covered by alignments

Note the denominator here is the mean of the query and target genome sizes.

pheatmap_(cutoff_coas_tables$PercentGenomeCovered, main='Percentage of query/target genomes covered (%)')

Percent of total aligned positions represented in alignments

Note the denominator here is the mean of the query and target genome sizes.

pheatmap_(cutoff_coas_tables$PercentOriginalAlignment, main='Percentage of originally-aligned positions covered (%)')

The effect of coalescing cutoffs on synteny

Now let’s make some chromosome plots. Here are the original Okinawa-Osaka alignments, with no coalescing.

plotApairOfChrs(cutoff_coas$uncoalesced$Oki_Osa, 'chr1')

You can see many details here, including the huge number of small alignments. Many of these are removed by coalescing:

plotApairOfChrs(cutoff_coas$coa_minWidth_0$Oki_Osa, 'chr1')

However, many small alignments persist after coalescing. As seen above, applying length filters to remove small alignments can drastically reduce the number of segments - or in other words, greatly increase the contiguity of the coalesced alignments.

Here is the result of removing alignments below 250bp - a fairly conservative threshold:

plotApairOfChrs(cutoff_coas$coa_minWidth_250$Oki_Osa, 'chr1')

And if we go too far, we lose too much information for it to be useful:

plotApairOfChrs(cutoff_coas$coa_minWidth_10000$Oki_Osa, 'chr1')

Using “maximal coverage of query/target genomes” as the selector

For the following plots, I use my own arbitrary choices to select a cutoff for each pair that balances retaining as much information as possible with the largest possible synteny. I select the cutoff that maximizes the total coverage of the pair - the maximum “Sum of all alignment widths” cell from the heatmap above.

Okinawa-Osaka

plotApairOfChrs(cutoff_coas$coa_minWidth_250$Oki_Osa, 'chr1')

plotApairOfChrs(cutoff_coas$coa_minWidth_250$Oki_Osa, 'chr2')

plotApairOfChrs(cutoff_coas$coa_minWidth_250$Oki_Osa, 'PAR')

Okinawa-Barcelona

plotApairOfChrs(cutoff_coas$coa_minWidth_100$Oki_Bar, 'chr1')

plotApairOfChrs(cutoff_coas$coa_minWidth_100$Oki_Bar, 'chr2')

plotApairOfChrs(cutoff_coas$coa_minWidth_100$Oki_Bar, 'PAR')

Osaka-Barcelona

plotApairOfChrs(cutoff_coas$coa_minWidth_250$Osa_Bar, 'Chr1')

plotApairOfChrs(cutoff_coas$coa_minWidth_250$Osa_Bar, 'Chr2')

plotApairOfChrs(cutoff_coas$coa_minWidth_250$Osa_Bar, 'PAR')

Okinawa-Kume

The following is taken from the vignette("ParallelPlots", package = "OikScrambling") vignette. It is invoked here to allow later plots between Okinawa and Kume.

requireNamespace("ade4")
treeLeaf <- function(name, length=NULL) {
  if(!is.null(length)) length <- paste0(':', length)
  paste0(name, length)
}

treeNode <- function(branch1, branch2, length = NULL) {
  if(!is.null(length)) length <- paste0(':', length)
  paste0('(', branch1, ',', branch2, ')', length)
}

addRoot <- function(branch)  paste0(branch, ";")

tree <-
  addRoot(
    treeNode(
      treeNode( length = 2,
        treeLeaf("Okinawa", 1),
        treeLeaf("Kume", 1)
      ),
      treeNode( length = 1,
        treeNode( length = 1,
          treeLeaf("Osaka", 1),
          treeLeaf("Aomori", 1)
        ),
        treeNode( length =1,
          treeLeaf("Norway", 1),
          treeLeaf("Barcelona", 1)
        )
      )
    )
  )

The optimal Okinawa-Kume plot:

# Note that this uses the cutoff coalesced object and not the translocation-removed coa2 object.
x <- cutoff_coas$coa_minWidth_2000$Oki_Kum
x[seqnames(x$query) == "contig_90_1"] <- reverse(query = TRUE, x[seqnames(x$query) == "contig_90_1"])
x[seqnames(x$query) == "contig_88_1"] <- reverse(query = TRUE, x[seqnames(x$query) == "contig_88_1"])
x$query <- mergeSeqLevels(x$query, c("contig_27_1", "contig_90_1", "contig_3_1", "contig_88_1"), "new_scaffold")

plotApairOfChrs(x, 'chr1', dna_seg_scale = TRUE)

Note that when you try to increase this further, you begin losing many regions:

x <- cutoff_coas$coa_minWidth_10000$Oki_Kum
x[seqnames(x$query) == "contig_90_1"] <- reverse(query = TRUE, x[seqnames(x$query) == "contig_90_1"])
x[seqnames(x$query) == "contig_88_1"] <- reverse(query = TRUE, x[seqnames(x$query) == "contig_88_1"])
x$query <- mergeSeqLevels(x$query, c("contig_27_1", "contig_90_1", "contig_3_1", "contig_88_1"), "new_scaffold")

plotApairOfChrs(x, 'chr1', dna_seg_scale = TRUE)

Session information

sessionInfo()

## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux 12 (bookworm)
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.11.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.11.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: Etc/UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] BSgenome.Oidioi.genoscope.OdB3_1.0.0   
##  [2] BSgenome.Oidioi.OIST.AOM.5.5f_1.0.1    
##  [3] BSgenome.Oidioi.OIST.KUM.M3.7f_1.0.1   
##  [4] BSgenome.Oidioi.OIST.Bar2.p4_1.0.1     
##  [5] BSgenome.Oidioi.OIST.OSKA2016v1.9_1.0.0
##  [6] BSgenome.Oidioi.OIST.OKI2018.I69_1.0.1 
##  [7] viridis_0.6.4                          
##  [8] viridisLite_0.4.2                      
##  [9] purrr_1.0.2                            
## [10] pheatmap_1.0.12                        
## [11] dplyr_1.1.3                            
## [12] OikScrambling_5.0.0                    
## [13] ggplot2_3.4.3                          
## [14] GenomicBreaks_0.14.2                   
## [15] BSgenome_1.68.0                        
## [16] rtracklayer_1.60.0                     
## [17] Biostrings_2.68.1                      
## [18] XVector_0.40.0                         
## [19] GenomicRanges_1.52.0                   
## [20] GenomeInfoDb_1.36.1                    
## [21] IRanges_2.34.1                         
## [22] S4Vectors_0.38.1                       
## [23] BiocGenerics_0.46.0                    
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.3.1               BiocIO_1.10.0              
##   [3] bitops_1.0-7                tibble_3.2.1               
##   [5] R.oo_1.25.0                 XML_3.99-0.14              
##   [7] rpart_4.1.19                lifecycle_1.0.3            
##   [9] rprojroot_2.0.3             lattice_0.20-45            
##  [11] MASS_7.3-58.2               backports_1.4.1            
##  [13] magrittr_2.0.3              Hmisc_5.1-0                
##  [15] sass_0.4.7                  rmarkdown_2.23             
##  [17] jquerylib_0.1.4             yaml_2.3.7                 
##  [19] plotrix_3.8-2               DBI_1.1.3                  
##  [21] CNEr_1.36.0                 minqa_1.2.5                
##  [23] RColorBrewer_1.1-3          ade4_1.7-22                
##  [25] abind_1.4-5                 zlibbioc_1.46.0            
##  [27] R.utils_2.12.2              RCurl_1.98-1.12            
##  [29] nnet_7.3-18                 pracma_2.4.2               
##  [31] GenomeInfoDbData_1.2.10     gdata_2.19.0               
##  [33] annotate_1.78.0             pkgdown_2.0.7              
##  [35] codetools_0.2-19            DelayedArray_0.26.7        
##  [37] tidyselect_1.2.0            shape_1.4.6                
##  [39] lme4_1.1-34                 matrixStats_1.0.0          
##  [41] base64enc_0.1-3             GenomicAlignments_1.36.0   
##  [43] jsonlite_1.8.7              mitml_0.4-5                
##  [45] Formula_1.2-5               survival_3.5-3             
##  [47] iterators_1.0.14            systemfonts_1.0.5          
##  [49] foreach_1.5.2               tools_4.3.1                
##  [51] ragg_1.2.5                  Rcpp_1.0.11                
##  [53] glue_1.6.2                  gridExtra_2.3              
##  [55] pan_1.8                     xfun_0.40                  
##  [57] MatrixGenerics_1.12.2       EBImage_4.42.0             
##  [59] withr_2.5.1                 fastmap_1.1.1              
##  [61] boot_1.3-28.1               fansi_1.0.5                
##  [63] digest_0.6.33               R6_2.5.1                   
##  [65] mice_3.16.0                 textshaping_0.3.7          
##  [67] colorspace_2.1-0            GO.db_3.17.0               
##  [69] gtools_3.9.4                poweRlaw_0.70.6            
##  [71] jpeg_0.1-10                 RSQLite_2.3.1              
##  [73] weights_1.0.4               R.methodsS3_1.8.2          
##  [75] utf8_1.2.3                  tidyr_1.3.0                
##  [77] generics_0.1.3              data.table_1.14.8          
##  [79] httr_1.4.7                  htmlwidgets_1.6.2          
##  [81] S4Arrays_1.0.5              pkgconfig_2.0.3            
##  [83] gtable_0.3.4                blob_1.2.4                 
##  [85] htmltools_0.5.6.1           fftwtools_0.9-11           
##  [87] scales_1.2.1                Biobase_2.60.0             
##  [89] png_0.1-8                   knitr_1.44                 
##  [91] heatmaps_1.24.0             rstudioapi_0.15.0          
##  [93] tzdb_0.4.0                  reshape2_1.4.4             
##  [95] rjson_0.2.21                checkmate_2.2.0            
##  [97] nlme_3.1-162                nloptr_2.0.3               
##  [99] cachem_1.0.8                stringr_1.5.0              
## [101] KernSmooth_2.23-20          parallel_4.3.1             
## [103] genoPlotR_0.8.11            foreign_0.8-84             
## [105] AnnotationDbi_1.62.2        restfulr_0.0.15            
## [107] desc_1.4.2                  pillar_1.9.0               
## [109] grid_4.3.1                  vctrs_0.6.3                
## [111] jomo_2.7-6                  xtable_1.8-4               
## [113] cluster_2.1.4               htmlTable_2.4.1            
## [115] evaluate_0.22               readr_2.1.4                
## [117] cli_3.6.1                   locfit_1.5-9.8             
## [119] compiler_4.3.1              Rsamtools_2.16.0           
## [121] rlang_1.1.1                 crayon_1.5.2               
## [123] plyr_1.8.8                  fs_1.6.3                   
## [125] stringi_1.7.12              BiocParallel_1.34.2        
## [127] munsell_0.5.0               tiff_0.1-11                
## [129] glmnet_4.1-7                Matrix_1.5-3               
## [131] hms_1.1.3                   bit64_4.0.5                
## [133] KEGGREST_1.40.0             SummarizedExperiment_1.30.2
## [135] broom_1.0.5                 memoise_2.0.1              
## [137] bslib_0.5.1                 bit_4.0.5

Michael Mansfield

13 October, 2023