8c802cc665358f0d69c1243f77d7931001b78841 markd Mon Aug 3 12:40:42 2015 -0700 Fixed with psl to genePred frame correction working when there is target deletion. A lot of test cases changed as they were constructed from transMap and so had a lot of indels. (#15803) diff --git src/hg/lib/genePred.c src/hg/lib/genePred.c index 189196b..373fa8a 100644 --- src/hg/lib/genePred.c +++ src/hg/lib/genePred.c @@ -1114,55 +1114,56 @@ if (start < end) { /* compute from end if it is complete in mRNA */ if (cds->endComplete) { int fr = (cds->end-start) % 3; frame = (fr == 2) ? 1 : ((fr == 1) ? 2 : 0); } else frame = (start-cds->start) % 3; } return frame; } -static boolean shouldMergeBlocks(struct genePred *gene, int iExon, - unsigned tStart, unsigned options, +static boolean shouldMergeBlocks(struct genePred *gene, + unsigned tStart, unsigned prevTEnd, + unsigned qStart, unsigned prevQEnd, + unsigned options, int cdsMergeSize, int utrMergeSize) /* determine if a new block starting at tStart whould be merged with - * the preceeding exon, indicated by iExon. + * the preceeding exon. */ { -/* nothing to check if no exons have been added */ -if (iExon >= 0) - { boolean inCds = (gene->cdsStart <= tStart) && (tStart < gene->cdsEnd); - int gapSize = (tStart - gene->exonEnds[iExon]); +int tGapSize = (tStart - prevTEnd); +int qGapSize = (qStart - prevQEnd); if (inCds) { - if ((options & genePredPslCdsMod3) && ((gapSize % 3) != 0)) + if ((options & genePredPslCdsMod3) + && (((tGapSize % 3) != 0) || ((qGapSize % 3) != 0))) return FALSE; /* not a multiple of three */ - if ((cdsMergeSize >= 0) && (gapSize <= cdsMergeSize)) + if ((cdsMergeSize >= 0) && ((tGapSize <= cdsMergeSize) && (qGapSize <= cdsMergeSize))) return TRUE; } else { - if ((utrMergeSize >= 0) && (gapSize <= utrMergeSize)) + // qGapSize only matters for CDS where we are trying to keep frame sane + if ((utrMergeSize >= 0) && (tGapSize <= utrMergeSize)) return TRUE; } - } return FALSE; /* don't merge */ } static void pslToExons(struct psl *psl, struct genePred *gene, struct genbankCds* cds, unsigned options, int cdsMergeSize, int utrMergeSize) /* Convert psl alignment blocks to genePred exons, merging together blocks by * the specified paraemeters. Optionally add frame information. */ { int iBlk, iExon; int startIdx, stopIdx, idxIncr; /* this is bigger than needed if blocks merge */ gene->exonStarts = needMem(psl->blockCount*sizeof(unsigned)); gene->exonEnds = needMem(psl->blockCount*sizeof(unsigned)); @@ -1177,55 +1178,63 @@ /* traverse psl in postive target order */ if (psl->strand[1] == '-') { startIdx = psl->blockCount-1; stopIdx = -1; idxIncr = -1; } else { startIdx = 0; stopIdx = psl->blockCount; idxIncr = 1; } iExon = -1; /* indicate none have been added */ +unsigned prevQEnd = 0; +unsigned prevTEnd = 0; for (iBlk = startIdx; iBlk != stopIdx; iBlk += idxIncr) { + int qStart = psl->qStarts[iBlk]; + int qEnd = qStart + psl->blockSizes[iBlk]; + if (psl->strand[0] == '-') + reverseIntRange(&qStart, &qEnd, psl->qSize); int tStart = psl->tStarts[iBlk]; int tEnd = tStart + psl->blockSizes[iBlk]; if (psl->strand[1] == '-') reverseIntRange(&tStart, &tEnd, psl->tSize); - if (!shouldMergeBlocks(gene, iExon, tStart, options, + if ((iExon == 0) || !shouldMergeBlocks(gene, tStart, prevTEnd, qStart, prevQEnd, options, cdsMergeSize, utrMergeSize)) { /* new exon */ iExon++; gene->exonStarts[iExon] = tStart; } gene->exonEnds[iExon] = tEnd; /* Set the frame. We have to deal with multiple blocks being merged. For * positive strand, the first block frame is used, for negative strand, * the last is used. */ if ((gene->optFields & genePredExonFramesFld) && (cds != NULL) && (((psl->strand[0] == '+') && (gene->exonFrames[iExon] < 0)) || (psl->strand[0] == '-'))) { int fr = getFrame(psl, psl->qStarts[iBlk], psl->qStarts[iBlk]+psl->blockSizes[iBlk], cds); if (fr >= 0) gene->exonFrames[iExon] = fr; } + prevTEnd = tEnd; + prevQEnd = qEnd; } gene->exonCount = iExon+1; assert(gene->exonCount <= psl->blockCount); } struct genePred *genePredFromPsl3(struct psl *psl, struct genbankCds* cds, unsigned optFields, unsigned options, int cdsMergeSize, int utrMergeSize) /* Convert a PSL of an mRNA alignment to a genePred, converting a genbank CDS * specification string to genomic coordinates. Small genomic inserts are * merged based on the mergeSize parameters. Gaps no larger than the * specified merge sizes result in the adjacent blocks being merged into a * single exon. Gaps in CDS use cdsMergeSize, in UTR use utrMergeSize. If * the genePredPslCdsMod3 option is specified, then CDS gaps are only merged * if a multiple of three. A negative merge sizes disables merging of blocks.