e6cd18ccbe6e255de8cb54bc70da0e42fe9a73b5
markd
  Mon Aug 17 19:17:10 2020 -0700
add report of location of frameshifts for use by CAT

diff --git src/hg/utils/transMapPslToGenePred/transMapPslToGenePred.c src/hg/utils/transMapPslToGenePred/transMapPslToGenePred.c
index eb949c5..9773bff 100644
--- src/hg/utils/transMapPslToGenePred/transMapPslToGenePred.c
+++ src/hg/utils/transMapPslToGenePred/transMapPslToGenePred.c
@@ -40,38 +40,41 @@
   "This is an alternative to mrnaToGene which determines CDS and frame from\n"
   "the original annotation, which may have been imported from GFF/GTF.  This\n"
   "was created because the genbankCds structure use by mrnaToGene doesn't\n"
   "handle partial start/stop codon or programmed frame shifts.  This requires\n"
   "handling the list of CDS regions and the /codon_start attribute,  At some\n"
   "point, this program may be extended to do handle genbank alignments correctly.\n"
   "\n"
   "Options:\n"
   "  -nonCodingGapFillMax=0 - fill gaps in non-coding regions up to this many bases\n"
   "   in length.\n"
   "  -codingGapFillMax=0 - fill gaps in coding regions up to this many bases\n"
   "   in length.  Only coding gaps that are a multiple of three will be fill,\n"
   "   with the max rounded down.\n"
   "  -noBlockMerge - don't do any block merging of genePred, even of adjacent blocks.\n"
   "   This is mainly for debugging.\n"
+  "  -frameShifts=tsv - Write TSV with locations of frame-shifting indels.  The coordinates\n"
+  "   give a context for the shift for browsing, not an exact location.\n"
   "\n");
 }
 
 /* command line option specifications */
 static struct optionSpec optionSpecs[] = {
     {"nonCodingGapFillMax", OPTION_INT},
     {"codingGapFillMax", OPTION_INT},
     {"noBlockMerge", OPTION_BOOLEAN},
+    {"frameShifts", OPTION_STRING},
     {NULL, 0}
 };
 static int nonCodingGapFillMax = 0;
 static int codingGapFillMax = 0;
 static boolean noBlockMerge = FALSE;
 
 static void swapBoolean(boolean *a, boolean *b)
 /* swap two booleans */
 {
 boolean hold = *a;
 *a = *b;
 *b = hold;
 }
 
 static int frameIncr(int frame, int amt)
@@ -83,39 +86,30 @@
 else if (amt >= 0)
     return (frame + amt) % 3;
 else
     {
     int amt3 = (-amt) % 3;
     return (frame - (amt - amt3)) % 3;
     }
 }
 
 static int genePredExonSize(struct genePred* gp, int iExon)
 /* calculate size of an exon in a genePred */
 {
 return gp->exonEnds[iExon] - gp->exonStarts[iExon];
 }
 
-static int genePredSize(struct genePred* gp)
-/* calculate size of an all of the exons in a genePred */
-{
-int iExon, size = 0;
-for (iExon = 0; iExon < gp->exonCount; iExon++)
-    size += genePredExonSize(gp, iExon);
-return size;
-}
-
 static int pslBlockQueryToTarget(struct psl *psl, int iBlock, int pos)
 /* project a position from query to target.  Position maybe a start
  * or end */
 {
 assert((pslQStart(psl, iBlock) <= pos) && (pos <= pslQEnd(psl, iBlock)));
 return pslTStart(psl, iBlock) + pos - pslQStart(psl, iBlock);
 }
 
 static boolean srcGenePredCheckSameStruct(struct genePred *gp,
                                           struct genePred *gp0)
 /* check that duplicate genePred frames and exon sizes are the same (PAR
  * case). */
 {
 int iExon;
 if (gp->exonCount != gp0->exonCount)
@@ -150,42 +144,68 @@
 }
 
 static struct hash* srcGenePredLoad(char* srcGenePredFile)
 /* load genepreds into a hash by name */
 {
 struct hash* srcGenePredMap = hashNew(0);
 struct genePred *gp, *gps = genePredReaderLoadFile(srcGenePredFile, NULL);
 while ((gp = slPopHead(&gps)) != NULL)
     srcGenePredAdd(srcGenePredMap, gp);
 return srcGenePredMap;
 }
 
 static struct genePred* srcGenePredFind(struct hash* srcGenePredMap,
                                         char* qName, int qSize)
 {
-// ignore unique suffix everything after the last
-char qNameBase[512];
-safecpy(qNameBase, sizeof(qNameBase), qName);
-char *dash = strrchr(qNameBase, '-');
+// try looking up with or without the unique suffix, which are strings
+// appended like '-1' or '-1.2'
+boolean haveSuffix = strrchr(qName, '-') != NULL;
+
+struct genePred* gp = NULL;
+if (!haveSuffix)
+    {
+    gp = hashFindVal(srcGenePredMap, qName);
+    if (gp == NULL)
+        errAbort("can't find source genePred for %s", qName);
+    }
+else
+    {
+    // try removing single `.N' which would be added during mapping
+    char qNameBase1[512];
+    safecpy(qNameBase1, sizeof(qNameBase1), qName);
+    char *dot = strrchr(qNameBase1, '.');
+    if (dot != NULL)
+        {
+        *dot = '\0';
+        gp = hashFindVal(srcGenePredMap, qNameBase1);
+        }
+    if (gp == NULL)
+        {
+        // try removing full unique extension
+        char qNameBase2[512];
+        safecpy(qNameBase2, sizeof(qNameBase2), qName);
+        char *dash = strrchr(qNameBase2, '-');
         if (dash != NULL)
             *dash = '\0';
-struct genePred* gp = hashFindVal(srcGenePredMap, qNameBase);
+        gp = hashFindVal(srcGenePredMap, qNameBase2);
         if (gp == NULL)
-    errAbort("can't find source genePred for %s with name %s", qName, qNameBase);
-if (genePredSize(gp) != qSize)
+            errAbort("can't find source genePred for %s with name %s or %s", qName, qNameBase1, qNameBase2);
+        }
+    }
+if (genePredBases(gp) != qSize)
     errAbort("size of query computed from genePred %s (%d), doesn't match expected %s (%d)",
-             qNameBase, genePredSize(gp), qName, qSize);
+             gp->name, genePredBases(gp), qName, qSize);
 return gp;
 }
 
 struct range
 /* A range for function returns */
 {
     int start;
     int end;
 };
 
 struct srcQueryExon
 /* source query exon ranges, cds, and frame.  These are all in to positive
  * strand coordinates. */
 {
     int qStart;         // computed query range of exon
@@ -260,31 +280,31 @@
     assert(srcQueryExon->qCdsStart < srcQueryExon->qCdsEnd);
     assert(srcQueryExon->qStart <= srcQueryExon->qCdsStart);
     assert(srcQueryExon->qCdsEnd <= srcQueryExon->qEnd);
     assert(srcQueryExon->frame >= 0);
     }
 return qEnd;
 }
 
 static void srcQueryExonBuild(struct genePred* srcGp,
                               struct srcQueryExon* srcQueryExons)
 /* create a list of query positions to frames.  Array should hold all
  * be the length of the src */
 {
 // building in strand-order generates correct coordinate
 int qStart = 0;
-int qSize = genePredSize(srcGp);
+int qSize = genePredBases(srcGp);
 int iExon;
 if (srcGp->strand[0] == '+')
     {
     for (iExon = 0; iExon < srcGp->exonCount; iExon++)
         qStart = srcQueryExonMake(srcGp, qStart, iExon, qSize, &(srcQueryExons[iExon]));
     }
 else
     {
     for (iExon = srcGp->exonCount-1; iExon >= 0; iExon--)
         qStart = srcQueryExonMake(srcGp, qStart, iExon, qSize, &(srcQueryExons[iExon]));
     }
 }
 
 struct srcQueryExon srcQueryExonReverse(struct srcQueryExon* srcQueryExon)
 /* reverse a srcQueryExon structure */
@@ -477,63 +497,103 @@
                                         genePredAllFlds, currentExonSpace);
 mappedGp->score = srcGp->score;
 mappedGp->name2 = cloneString(srcGp->name2);
 
 int iBlock;
 for (iBlock = 0; iBlock < mappedPsl->blockCount; iBlock++)
     convertPslBlock(mappedPsl, iBlock, srcGp, srcQueryExons, mappedGp, &currentExonSpace,
                     &mappedCdsBounds);
 if (mappedCdsBounds.cdsStart < 0)
     finishWithOutCds(mappedGp);
 else
     finishWithCds(srcGp, mappedGp, &mappedCdsBounds);
 return mappedGp;
 }
 
-static boolean haveAdjacentMergedFrames(struct genePred *mappedGp, int iExon)
-/* Do two block have adjacent frames and will merging them keep frames contiguous?  */
+static int getExonCdsStart(struct genePred *gp, int iExon)
+/* get start of CDS in an exon */
+{
+return max(gp->exonStarts[iExon], gp->cdsStart);
+}
+
+static int getExonCdsEnd(struct genePred *gp, int iExon)
+/* get end of CDS in an exon */
+{
+return min(gp->exonEnds[iExon], gp->cdsEnd);
+}
+
+static int getExonCdsLen(struct genePred *gp, int iExon)
+/* get the length of the the CDS in an exon */
+{
+int start, end;
+genePredCdsExon(gp, iExon, &start, &end);
+return end - start;
+}
+
+
+static int getExonEndFrame(struct genePred *gp, int iExon)
+/* get the end frame of exon (half-open)  */
+{
+if (gp->exonFrames[iExon] == -1)
+    return -1;
+else
+    return frameIncr(gp->exonFrames[iExon], getExonCdsLen(gp, iExon));
+}
+
+static int codonBaseRoundDown(int bases)
+/* round down to an even multiple of three */
+{
+return 3 * (bases / 3);
+}
+
+static int wholeCodonBases(struct genePred *gp, int iExon)
+/* the number of whole codon bases in an exon */
+{
+return codonBaseRoundDown(getExonCdsLen(gp, iExon) - gp->exonFrames[iExon]);
+}
+
+static boolean haveMergableFrames(struct genePred *mappedGp, int iExon)
+/* Do two block have frames that can be merged keeping frames contiguous?  */
 {
 assert(mappedGp->exonFrames[iExon] >= 0);
 assert(mappedGp->exonFrames[iExon+1] >= 0);
 int gapSize = (mappedGp->exonStarts[iExon+1] - mappedGp->exonEnds[iExon]);
 if (mappedGp->strand[0] == '+')
     {
     // how much of CDS is in this exon
-    int cdsOff = max(mappedGp->exonStarts[iExon], mappedGp->cdsStart) - mappedGp->exonStarts[iExon];
-    int cdsIncr = (genePredExonSize(mappedGp, iExon) - cdsOff) + gapSize;
+    int cdsIncr = getExonCdsLen(mappedGp, iExon) + gapSize;
     return (frameIncr(mappedGp->exonFrames[iExon], cdsIncr) == mappedGp->exonFrames[iExon+1]);
     }
 else
     {
     // how much of CDS is in next exon
-    int cdsOff = mappedGp->exonEnds[iExon+1] - min(mappedGp->exonEnds[iExon+1], mappedGp->cdsEnd);
-    int cdsIncr = (genePredExonSize(mappedGp, iExon+1) - cdsOff)  + gapSize;
+    int cdsIncr = getExonCdsLen(mappedGp, iExon+1) + gapSize;
     return (frameIncr(mappedGp->exonFrames[iExon+1], cdsIncr) == mappedGp->exonFrames[iExon]);
     }
 }
 
 static boolean hasAdjacentNonCoding(struct genePred *mappedGp, int iExon)
 /* are one or both adjacent blocks non-coding? */
 {
 return (mappedGp->exonFrames[iExon] == -1) || (mappedGp->exonFrames[iExon+1] == -1);
 }
 
 static boolean canMergeFrames(struct genePred *mappedGp, int iExon)
 /* check if frames can be merged, -1 frames merge with with any adjacent
  * frame */
 {
-return hasAdjacentNonCoding(mappedGp, iExon) || haveAdjacentMergedFrames(mappedGp, iExon);
+return hasAdjacentNonCoding(mappedGp, iExon) || haveMergableFrames(mappedGp, iExon);
 }
 
 static boolean canMergeBlocks(struct genePred *mappedGp, int iExon)
 /* check if blocks are adjacent and can be merged with consistent frame */
 {
 int maxGapFill = hasAdjacentNonCoding(mappedGp, iExon)
     ? nonCodingGapFillMax : codingGapFillMax;
 return ((mappedGp->exonStarts[iExon+1] - mappedGp->exonEnds[iExon]) <= maxGapFill)
         && canMergeFrames(mappedGp, iExon);
 }
 
 static void mergeAdjacentFrames(struct genePred *mappedGp, int iExon)
 /* merge frames for two adjacent blocks that are being merged */
 {
 // update frame, handling case were one block has CDS and the other doesn't
@@ -574,74 +634,202 @@
 }
 
 static void mergeGenePredBlocks(struct genePred *mappedGp)
 /* merge adjacent genePred `exons' as long as they have consistent frame */
 {
 int iExon = 0;
 while (iExon < mappedGp->exonCount-1)
     {
     if (canMergeBlocks(mappedGp, iExon))
         mergeAdjacentBlocks(mappedGp, iExon);
     else
         iExon++;
     }
 }
 
-static void convertPsl(struct hash* srcGenePredMap, struct psl *mappedPsl, FILE* genePredOutFh)
+static void prFrameShift(struct genePred *mappedGp, int chromStart, int chromEnd,
+                         int shift, char *location, FILE *frameShiftsFh)
+/* print a frame description to the TSV */
+{
+fprintf(frameShiftsFh, "%s\t%s\t%d\t%d\t%s\t%d\t%s\n", mappedGp->name, mappedGp->chrom,
+        chromStart, chromEnd, mappedGp->strand, shift, location);
+}
+
+static boolean haveConsistentFrames(struct genePred *mappedGp, int iExon)
+/* is frame between iExon and iExon+1 consistent (no frame shift)? */
+{
+if (mappedGp->strand[0] == '+')
+    return getExonEndFrame(mappedGp, iExon) == mappedGp->exonFrames[iExon+1];
+else
+    return getExonEndFrame(mappedGp, iExon+1) == mappedGp->exonFrames[iExon];
+}
+
+static void checkStartFrameShift(struct genePred *mappedGp, int iExon, FILE* frameShiftsFh)
+/* check if first codon is truncated at the start */
+{
+if (mappedGp->exonFrames[iExon] != 0)
+    {
+    int start, end;
+    if (mappedGp->strand[0] == '+')
+        {
+        start = getExonCdsStart(mappedGp, iExon);
+        end = min(start + (3 - mappedGp->exonFrames[iExon]),
+                  mappedGp->exonEnds[iExon]);
+        }
+    else
+        {
+        end = getExonCdsEnd(mappedGp, iExon);
+        start = max(end - (3 - mappedGp->exonFrames[iExon]),
+                    mappedGp->exonStarts[iExon]);
+        }
+    prFrameShift(mappedGp, start, end, mappedGp->exonFrames[iExon], "first", frameShiftsFh);
+    }
+}
+
+static void checkEndFrameShift(struct genePred *mappedGp, int iExon, FILE* frameShiftsFh)
+/* check if last codon is truncated, causing a frameshift */
+{
+if (getExonEndFrame(mappedGp, iExon) != 0)
+    {
+    int start, end;
+    if (mappedGp->strand[0] == '+')
+        {
+        end = getExonCdsEnd(mappedGp, iExon);
+        start = max(end - (3 - mappedGp->exonFrames[iExon]),
+                    mappedGp->exonStarts[iExon]);
+        }
+    else
+        {
+        start = getExonCdsStart(mappedGp, iExon);
+        end = min(start + (3 - mappedGp->exonFrames[iExon]),
+                  mappedGp->exonEnds[iExon]);
+        }
+    int shift = getExonCdsLen(mappedGp, iExon) - mappedGp->exonFrames[iExon] - wholeCodonBases(mappedGp, iExon);
+    prFrameShift(mappedGp, start, end, shift, "last", frameShiftsFh);
+    }
+}
+
+static void checkInternalFrameShift(struct genePred *mappedGp, int iExon, FILE* frameShiftsFh)
+/* check if there is a frame shift between iExon and iExon+1 */
+{
+if (!haveConsistentFrames(mappedGp, iExon))
+    {
+    // get full codons before and after as bounds
+    int start, end, shift;
+    if (mappedGp->strand[0] == '+')
+        {
+        start = max(getExonCdsStart(mappedGp, iExon) + mappedGp->exonFrames[iExon] + wholeCodonBases(mappedGp, iExon) - 3,
+                    mappedGp->exonStarts[iExon]);
+        end = min(mappedGp->exonStarts[iExon+1] + mappedGp->exonFrames[iExon+1] + 3,
+                  mappedGp->exonEnds[iExon+1]);
+        shift = abs(frameIncr(mappedGp->exonFrames[iExon], getExonCdsLen(mappedGp, iExon)) - mappedGp->exonFrames[iExon+1]);
+        }
+    else
+        {
+        start = max(getExonCdsStart(mappedGp, iExon) + getExonEndFrame(mappedGp, iExon) + wholeCodonBases(mappedGp, iExon) - 3,
+                    mappedGp->exonStarts[iExon]);
+        end = min(mappedGp->exonStarts[iExon+1] + (3 - mappedGp->exonFrames[iExon+1]),
+                  mappedGp->exonEnds[iExon+1]);
+        shift = abs(frameIncr(mappedGp->exonFrames[iExon+1], getExonCdsLen(mappedGp, iExon+1)) - mappedGp->exonFrames[iExon]);
+        }
+    prFrameShift(mappedGp, start, end, shift, "internal", frameShiftsFh);
+    }
+}
+
+static void reportFrameShifts(struct genePred *mappedGp, FILE* frameShiftsFh)
+/* report frame shifts found it a transcript, including truncated start and end */
+{
+assert(mappedGp->cdsStart < mappedGp->cdsEnd);
+
+int iExon = 0;
+
+// find first with frame
+for (; mappedGp->exonFrames[iExon] == -1; iExon++)
+    continue;
+
+// first codon
+if (mappedGp->strand[0] == '+')
+    checkStartFrameShift(mappedGp, iExon, frameShiftsFh);
+else
+    checkEndFrameShift(mappedGp, iExon, frameShiftsFh);
+
+// internal exons
+for (; (iExon < mappedGp->exonCount - 1) && (mappedGp->exonFrames[iExon + 1] > -1); iExon++)
+    checkInternalFrameShift(mappedGp, iExon, frameShiftsFh);
+
+// last codon
+if (mappedGp->strand[0] == '+')
+    checkEndFrameShift(mappedGp, iExon, frameShiftsFh);
+else
+    checkStartFrameShift(mappedGp, iExon, frameShiftsFh);
+}
+
+static void convertPsl(struct hash* srcGenePredMap, struct psl *mappedPsl, FILE* genePredOutFh,
+                       FILE* frameShiftsFh)
 /* convert a single PSL */
 {
 if (pslTStrand(mappedPsl) == '-')
     pslRc(mappedPsl); // target must be `+'
 
 struct genePred* srcGp = srcGenePredFind(srcGenePredMap, mappedPsl->qName, mappedPsl->qSize);
 if (genePredBases(srcGp) != mappedPsl->qSize)
     errAbort("srcGenePred %s exon size %d does not match mappedPsl %s qSize %d",
              srcGp->name, genePredBases(srcGp), mappedPsl->qName, mappedPsl->qSize);
 struct srcQueryExon srcQueryExons[srcGp->exonCount];
 srcQueryExonBuild(srcGp, srcQueryExons);
 
 struct genePred *mappedGp = createGenePred(srcGp, srcQueryExons, mappedPsl);
 if (!noBlockMerge)
     mergeGenePredBlocks(mappedGp);
+if ((frameShiftsFh != NULL) && (mappedGp->cdsStart < mappedGp->cdsEnd))
+    reportFrameShifts(mappedGp, frameShiftsFh);
 if (genePredCheck("mappedGenePred", stderr, -1, mappedGp))
     errAbort("invalid genePred created");
 
 genePredTabOut(mappedGp, genePredOutFh);
 genePredFree(&mappedGp);
 }
 
 static void transMapPslToGenePred(char* srcGenePredFile, char* mappedPslFile,
-                                  char* mappedGenePredFile)
+                                  char* mappedGenePredFile, char* frameShiftsTsv)
 /* convert mapped psl to genePreds */
 {
 struct hash* srcGenePredMap = srcGenePredLoad(srcGenePredFile);
 struct lineFile *pslFh = pslFileOpen(mappedPslFile);
 FILE* genePredOutFh = mustOpen(mappedGenePredFile, "w");
+FILE* frameShiftsFh = NULL;
+if (frameShiftsTsv != NULL)
+    {
+    frameShiftsFh = mustOpen(frameShiftsTsv, "w");
+    fprintf(frameShiftsFh, "name\tchrom\tchromStart\tchromEnd\tstrand\tshift\tlocation\n");
+    }
 struct psl *mappedPsl;
 while ((mappedPsl = pslNext(pslFh)) != NULL)
     {
-    convertPsl(srcGenePredMap, mappedPsl, genePredOutFh);
+    convertPsl(srcGenePredMap, mappedPsl, genePredOutFh, frameShiftsFh);
     pslFree(&mappedPsl);
     }
 
+carefulClose(&frameShiftsFh);
 carefulClose(&genePredOutFh);
 lineFileClose(&pslFh);
 hashFreeWithVals(&srcGenePredMap, genePredFree);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, optionSpecs);
 if (argc != 4)
     usage();
 nonCodingGapFillMax = optionInt("nonCodingGapFillMax", nonCodingGapFillMax);
 codingGapFillMax = optionInt("codingGapFillMax", codingGapFillMax);
 codingGapFillMax = (codingGapFillMax/3)*3; // round
 noBlockMerge = optionExists("noBlockMerge");
 
 char *srcGenePredFile = argv[1];
 char *mappedPslFile = argv[2];
 char *mappedGenePredFile = argv[3];
-transMapPslToGenePred(srcGenePredFile, mappedPslFile, mappedGenePredFile);
+transMapPslToGenePred(srcGenePredFile, mappedPslFile, mappedGenePredFile,
+                      optionVal("frameShifts", NULL));
 return 0;
 }