1bb425f9fa71e6a0092954afb46e99886e9f3c08 angie Wed Mar 1 16:26:44 2023 -0800 Add a check for a boundary condition that so far has appeared only in zebrafish: an insertion that lands (with HGVS right-shifting) at the boundary between an alignment block and *double-sided* gap, i.e. it's not a clean intron gap but instead has skipped over both genomic and transcript sequence. Due to the double-sided gap, vpTx start txOffset > end txOffset (not ==) so vpTxIsInsertion returns false but it's still an insertion in the genome so there is no vpTx->txRef. This was triggering an errAbort; head that off by returning NULL from vpTxSplitByRegion so we just call it complex_transcript_variant, which is all we can say when the transcript aligner is perhaps trying too hard. refs #29262 diff --git src/hg/lib/variantProjector.c src/hg/lib/variantProjector.c index 4e4ed61..ac227e2 100644 --- src/hg/lib/variantProjector.c +++ src/hg/lib/variantProjector.c @@ -1864,30 +1864,37 @@ /* If vpTx doesn't delete the whole transcript but spans multiple regions *and* is * either a pure deletion or MNV then return a list of vpTx, one per region type, * so we can report functional effects of complex variants in more detail. * Otherwise return NULL to signify that we could not reliably split it up (e.g. when * nonzero but unequal numbers of bases are deleted and inserted across a boundary, * we don't know how to apportion the inserted bases). * Handle single-region vpTxs, insertions and transcript_ablation cases separately -- * those do not need to be split up so don't call this on them. */ { if (vpTx->start.region == vpTx->end.region || vpTxPosIsInsertion(&vpTx->start, &vpTx->end) || (vpTx->start.region == vpUpstream && vpTx->end.region == vpDownstream)) errAbort("vpTxSplitByRegion: don't call this if start and end region (%s, %s) are the same, " "if vpTx is an insertion, or if vpTx deletes the entire transcript", vpTxRegionToString(vpTx->start.region), vpTxRegionToString(vpTx->end.region)); +// Even if vpTxPosIsInsertion() is false because this is not an insertion into the transcript, +// it might still be an insertion into the genome, e.g. danRer11's rs499587024 with NCBI's +// alignment of NM_001002580.1 that places a 12-base "exon" after a double-sided gap "intron" +// that skips 400-odd transcript bases, with rs499587024 at the "exon"/"intron" boundary. +// In this case start > end in tx coords, not equal, so vpTxPosIsInsertion returns false. +if (vpTx->start.gOffset == vpTx->end.gOffset) + return NULL; // If start region and end region are different then at least one of them should be exon or // they should have at least one exon in the middle, so txRef should not be NULL. if (vpTx->txRef == NULL) errAbort("vpTxSplitByRegion: program error: how is txRef NULL if start region and end region " "are different (%s, %s)?", vpTxRegionToString(vpTx->start.region), vpTxRegionToString(vpTx->end.region)); boolean isDeletion = (vpTx->txAlt[0] == '\0' && strlen(vpTx->txRef) > 0); boolean isMnv = (strlen(vpTx->txRef) == strlen(vpTx->txAlt)); if (! (isDeletion || isMnv)) return NULL; // Step through regions and add one vpTx per region type. boolean isRc = pslQStrand(psl) == '-'; struct vpTx *vpTxList = NULL; if (vpTx->start.region == vpUpstream) {