baed67b5aa7f67279b04e4cf2e33cafd2268f10d angie Mon Mar 12 15:46:09 2018 -0700 Adding obscure util pslFixCdsJoinGap, to correct the output of genePredToFakePsl for transcripts with +1 ribosomal frameshift. genePred exons skip over a base to maintain frame; genePredToFakePsl sees a 1-base gap on target, 0-base gap on query. However, the query base is skipped as described in Genbank CDS strings such as "join(168..260,262..741)". Use CDS strings to recognize these cases, and if the PSL has a 1-sided gap on target at the exact position of the skipped base then make the gap 2-sided to avoid making the alignment off-by-1 after that point and becoming a swamp of mismatches. The ideal solution would be to remove the gap and properly interpret join CDS in the rest of our software, but this will at least prevent totally incorrect genePred-derived PSL for transcripts with +1 ribosomal frameshift for now. refs #21048 diff --git src/hg/lib/genePred.c src/hg/lib/genePred.c index c66f33b..07ecbef 100644 --- src/hg/lib/genePred.c +++ src/hg/lib/genePred.c @@ -2457,30 +2457,29 @@ struct psl *psl = pslNew(gp->name, qSize ? qSize : aliSize, 0, aliSize, gp->chrom, chromSize, gp->txStart, gp->txEnd, gp->strand, gp->exonCount, 0); // If qSize is greater than aliSize then we assume the extra bases are at the end of // the transcript (poly-A tail). If the alignment is on the '-' strand, then we need // to offset the reversed qStarts by the number of extra bases. int sizeAdjust = (gp->strand[0] == '-' && qSize > aliSize) ? (qSize - aliSize) : 0; int i = -1; for (e = 0; e < gp->exonCount; ++e) { if (e == 0 || gp->exonStarts[e] != gp->exonEnds[e-1]) { i++; psl->blockSizes[i] = (gp->exonEnds[e] - gp->exonStarts[e]); psl->qStarts[i] = i==0 ? 0 + sizeAdjust : psl->qStarts[i-1] + psl->blockSizes[i-1]; -//#*** TODO: use exonFrame to detect ribosomal frameshift that makes us skip a base on t psl->tStarts[i] = gp->exonStarts[e]; } else { // Merge "exons" that have a 0-length gap between them to avoid pslCheck failure psl->blockSizes[i] += (gp->exonEnds[e] - gp->exonStarts[e]); } } psl->blockCount = i+1; psl->match = aliSize; psl->tNumInsert = psl->blockCount-1; psl->tBaseInsert = (gp->txEnd - gp->txStart) - aliSize; return psl; }