knitr::opts_chunk$set(cache = TRUE,
                      dev    = c('png', 'svg'),
                      fig.ext= c('png', 'svg'),
                      fig.width  = 10,
                      fig.height = 10)

Introduction

This vignette produces a scaffolding guide to be used with the KUM-M3 genome for plotting purposes.

Load R pacakges and data

## Warning in runHook(".onLoad", env, package.lib, package): input string
## 'Génoscope' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid
## UTF-8

## Warning in runHook(".onLoad", env, package.lib, package): input string
## 'Génoscope' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid
## UTF-8
load("BreakPoints.Rdata")

Scaffolding

Helper functions

# Helper function to flip many contigs at once.
flipContigsForOxfordPlot <- function(gb, flip_which=NULL) {
  grl <- split(gb, seqnames(gb$query))
  grl <- endoapply(grl, function(s) {
    if( ! identical(unique(as.character(seqnames(s$query))), character(0)) ) {
      if(unique(seqnames(s$query)) %in% flip_which ) {
        s <- GenomicBreaks::reverse(s, query=TRUE)
        s
      }
    }
    s
  })
  ogb <- unlist(grl) |> unname()
  ogb
}

# Let's remove the contigs that have their main match elsewhere.
QTcoverage <- function(gb) {
  stopifnot (length(seqlevels(gb)) == 1) # Not ready for full objects
  stopifnot (!any(is.na(seqlengths(gb$query))))
  grl <- split(gb, seqnames(gb$query))
  lapply(grl, \(gb) sum(width(gb$query))) |> unlist() / seqlengths(gb$query)
}

Scaffolding of Kume chromosomes

The Kume draft assembly contains long contigs that can be scaffolded into chromosomes. Some sequences may be missing between these contigs, but the absence of large gaps in the plots at the junction between contigs suggest that this is not a problem for the use we make of this scaffolding.

Chromosome 1

The best match for chr1 in the Kume assembly is contig_3_1. It maps roughly in the 8,000,000 – 13,000,000 region.

bestMatch <- function(gb)
  tapply(width(gb$query), seqnames(gb$query), sum) |> sort() |> tail(1) |> names()

coa$Oki_Kum |> plyranges::filter(seqnames == "chr1") |> bestMatch()
## [1] "contig_3_1"

The next best match, contig_27_1 is most of chr1’s short arm.

coa$Oki_Kum |> plyranges::filter(seqnames == "chr1", seqnames(query)!="contig_3_1") |> bestMatch()
## [1] "contig_27_1"
coa$Oki_Kum |> plotApairOfChrs(chrQ = "contig_27_1")

Oki_Kum_Chr1 <- gbs$Oki_Kum |> plyranges::filter(seqnames == "chr1")
seqlevels(Oki_Kum_Chr1)       <- seqlevelsInUse(Oki_Kum_Chr1)
seqlevels(Oki_Kum_Chr1$query) <- seqlevelsInUse(Oki_Kum_Chr1$query)
makeOxfordPlots(Oki_Kum_Chr1, col='strand') + ggtitle("Everything from KUM-M3 that matches Oki's Chr1")

scafs$Kum_Oki <- list(Chr1 = data.frame(
  contig = c("contig_27_1", "contig_90_1", "contig_3_1", "contig_88_1", "contig_87_1", "contig_63_1"),
  orientation = c(1, -1, 1, -1, 1, 1)
))

contigsToMerge_start  <- c(1, cumsum(seqlengths(gbs$Oki_Kum$query)[scafs$Kum_Oki$Chr1$contig]))    |> unname() |> head(-1)
contigsToMerge_end    <-     (cumsum(seqlengths(coa$Oki_Kum$query)[scafs$Kum_Oki$Chr1$contig]) -1) |> unname()
contigsToMerge_oldPos <- GRanges("chr1", IRanges(contigsToMerge_start, contigsToMerge_end))
names(contigsToMerge_oldPos)  <- scafs$Kum_Oki$Chr1$contig
contigsToMerge_oldPos
## GRanges object with 6 ranges and 0 metadata columns:
##               seqnames            ranges strand
##                  <Rle>         <IRanges>  <Rle>
##   contig_27_1     chr1         1-4780084      *
##   contig_90_1     chr1   4780085-7534138      *
##    contig_3_1     chr1  7534139-12219121      *
##   contig_88_1     chr1 12219122-13349956      *
##   contig_87_1     chr1 13349957-13492925      *
##   contig_63_1     chr1 13492926-13839560      *
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths
annot <- genoPlotR::annotation(x1=start(contigsToMerge_oldPos), x2=end(contigsToMerge_oldPos), text=names(contigsToMerge_oldPos), rot=30)

gbs$Oki_Kum |> swap() |> scaffoldByFlipAndMerge(scafs$Kum_Oki) |> plotApairOfChrs(dna_seg_scale=TRUE, annotations = annot)

Chromosome 2

Oki_Kum_Chr2 <- gbs$Oki_Kum |> plyranges::filter(seqnames == "chr2")
seqlevels(Oki_Kum_Chr2)       <- seqlevelsInUse(Oki_Kum_Chr2)
seqlevels(Oki_Kum_Chr2$query) <- seqlevelsInUse(Oki_Kum_Chr2$query)

makeOxfordPlots(Oki_Kum_Chr2, col='strand') + ggtitle("Everything from KUM-M3 that matches Oki's Chr2")

Let’s remove the contigs that have their main match elsewhere.

Oki_Kum_Chr2[seqnames(Oki_Kum_Chr2$query) %in% seqlevels(Oki_Kum_Chr2$query)[QTcoverage(Oki_Kum_Chr2) < 0.5]] <- NULL
seqlevels(Oki_Kum_Chr2$query) <- seqlevelsInUse(Oki_Kum_Chr2$query)
makeOxfordPlots(Oki_Kum_Chr2, col='strand') + ggtitle("Main hits from KUM-M3 to OKI's Chr2")

scafs$Kum_Oki[["Chr2"]] <- data.frame(
  contig = c("contig_36_1", "contig_16_1", "contig_24_1", "contig_79_1", "contig_37_1", "contig_81_1"),
  orientation = c(-1, 1, 1, -1, -1, -1)
)

contigsToMerge_start  <- c(1, cumsum(seqlengths(gbs$Oki_Kum$query)[scafs$Kum_Oki$Chr2$contig]))    |> unname() |> head(-1)
contigsToMerge_end    <-     (cumsum(seqlengths(coa$Oki_Kum$query)[scafs$Kum_Oki$Chr2$contig]) -1) |> unname()
contigsToMerge_oldPos <- GRanges("chr2", IRanges(contigsToMerge_start, contigsToMerge_end))
names(contigsToMerge_oldPos)  <- scafs$Kum_Oki$Chr2$contig
contigsToMerge_oldPos
## GRanges object with 6 ranges and 0 metadata columns:
##               seqnames            ranges strand
##                  <Rle>         <IRanges>  <Rle>
##   contig_36_1     chr2          1-319811      *
##   contig_16_1     chr2    319812-5523357      *
##   contig_24_1     chr2   5523358-9335672      *
##   contig_79_1     chr2  9335673-12065686      *
##   contig_37_1     chr2 12065687-12853372      *
##   contig_81_1     chr2 12853373-15214355      *
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths
annot <- genoPlotR::annotation(x1=start(contigsToMerge_oldPos), x2=end(contigsToMerge_oldPos), text=names(contigsToMerge_oldPos), rot=30)

gbs$Oki_Kum |> swap() |> scaffoldByFlipAndMerge(scafs$Kum_Oki) |> plotApairOfChrs(chrT = 'Chr2', dna_seg_scale=TRUE, annotations = annot)

PAR

Oki_Kum_PAR <- gbs$Oki_Kum |> plyranges::filter(seqnames == "PAR")
seqlevels(Oki_Kum_PAR)       <- seqlevelsInUse(Oki_Kum_PAR)
seqlevels(Oki_Kum_PAR$query) <- seqlevelsInUse(Oki_Kum_PAR$query)

makeOxfordPlots(Oki_Kum_PAR, col='strand') + ggtitle("Everything from KUM-M3 that matches OKI's PAR")

Let’s remove the contigs that have their main match elsewhere.

Oki_Kum_PAR[seqnames(Oki_Kum_PAR$query) %in% seqlevels(Oki_Kum_PAR$query)[QTcoverage(Oki_Kum_PAR) < 0.5]] <- NULL
seqlevels(Oki_Kum_PAR$query) <- seqlevelsInUse(Oki_Kum_PAR$query)
makeOxfordPlots(Oki_Kum_PAR, col='strand') + ggtitle("Main hits from KUM-M3 to OKI's PAR")

Let’s flip things around a little to get things as linear as they can be:

makeOxfordPlots( flipContigsForOxfordPlot(Oki_Kum_PAR, flip_which = c('contig_12_1', 'contig_43_1')), col='strand')  + ggtitle("KUM-M3 contigs scaffolded and reoriented to PAR")

scafs$Kum_Oki[["PAR"]] <- data.frame(
  contig =  c('contig_13_1', 'contig_12_1', 'contig_41_1', 'contig_43_1', 'contig_82_1', 'contig_89_1', 'contig_85_1', 'contig_28_1', 'contig_91_1'),
  orientation = c(1, -1, 1, -1, 1, 1, 1, 1, 1)
)

contigsToMerge_start  <- c(1, cumsum(seqlengths(gbs$Oki_Kum$query)[scafs$Kum_Oki$PAR$contig]))    |> unname() |> head(-1)
contigsToMerge_end    <-     (cumsum(seqlengths(coa$Oki_Kum$query)[scafs$Kum_Oki$PAR$contig]) -1) |> unname()
contigsToMerge_oldPos <- GRanges("chr2", IRanges(contigsToMerge_start, contigsToMerge_end))
names(contigsToMerge_oldPos)  <- scafs$Kum_Oki$PAR$contig
contigsToMerge_oldPos
## GRanges object with 9 ranges and 0 metadata columns:
##               seqnames            ranges strand
##                  <Rle>         <IRanges>  <Rle>
##   contig_13_1     chr2          1-793614      *
##   contig_12_1     chr2    793615-3150936      *
##   contig_41_1     chr2   3150937-5762928      *
##   contig_43_1     chr2   5762929-6604164      *
##   contig_82_1     chr2   6604165-8926783      *
##   contig_89_1     chr2   8926784-9168286      *
##   contig_85_1     chr2   9168287-9596337      *
##   contig_28_1     chr2  9596338-16011565      *
##   contig_91_1     chr2 16011566-16138340      *
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths
annot <- genoPlotR::annotation(x1=start(contigsToMerge_oldPos), x2=end(contigsToMerge_oldPos), text=names(contigsToMerge_oldPos), rot=30)

gbs$Oki_Kum |> swap() |> scaffoldByFlipAndMerge(scafs$Kum_Oki) |> plotApairOfChrs(chrT = 'PAR', dna_seg_scale=TRUE, annotations = annot)

XSR

XSR is already complete.

Oki_Kum_XSR <- gbs$Oki_Kum |> plyranges::filter(seqnames == "XSR")
seqlevels(Oki_Kum_XSR)       <- seqlevelsInUse(Oki_Kum_XSR)
seqlevels(Oki_Kum_XSR$query) <- seqlevelsInUse(Oki_Kum_XSR$query)

makeOxfordPlots(Oki_Kum_XSR, col='strand') + ggtitle("Everything from KUM-M3 that matches OKI's XSR")

scafs$Kum_Oki[["XSR"]] <- data.frame(
  contig =  c('contig_42_1'),
  orientation = c(1)
)

YSR

Oki_Kum_YSR <- gbs$Oki_Kum |> plyranges::filter(seqnames == "YSR")
seqlevels(Oki_Kum_YSR)       <- seqlevelsInUse(Oki_Kum_YSR)
seqlevels(Oki_Kum_YSR$query) <- seqlevelsInUse(Oki_Kum_YSR$query)

makeOxfordPlots(Oki_Kum_YSR, col='strand')

scafs$Kum_Oki[["YSR"]] <- data.frame(
  contig =  c('contig_29_1'),
  orientation = c(-1)
)

Bringing it together

# Store the results in the package for re-use in other vignettes
# usethis::use_data(scafs, overwrite = TRUE)
identical(scafs$Kum_Oki, OikScrambling::scafs$Kum_Oki)
## [1] TRUE
scaf_Kum_Oki <- scaffoldByFlipAndMerge(gbs$Oki_Kum |> swap(), scafs$Kum_Oki, drop = TRUE) |> swap()
scaf_Kum_Oki |> makeOxfordPlots()

# Original plot for comparison
gbs$Oki_Kum |> makeOxfordPlots()