src/lib/psl.c 1.82

1.82 2010/04/16 17:23:43 markd
have pslCheck catch overall ranges that don't match block ranges
Index: src/lib/psl.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/lib/psl.c,v
retrieving revision 1.81
retrieving revision 1.82
diff -b -B -U 4 -r1.81 -r1.82
--- src/lib/psl.c	31 May 2009 07:28:33 -0000	1.81
+++ src/lib/psl.c	16 Apr 2010 17:23:43 -0000	1.82
@@ -1503,93 +1503,101 @@
         psl->tName, psl->tStart, psl->tEnd,
         psl->strand, pslDesc);
 }
 
-static void chkRanges(char* pslDesc, FILE* out, struct psl* psl,
-                      char* pName, char* pLabel, char pCLabel, char pStrand,
-                      unsigned pSize, unsigned pStart, unsigned pEnd,
-                      unsigned blockCount, unsigned* blockSizes,
-                      unsigned* pBlockStarts, int* errCountPtr)
-/* check the target or query ranges in a PSL, increment errorCnt */
-{
-int errCount = *errCountPtr;
-unsigned iBlk, prevBlkEnd = 0;
 
-if (pStart >= pEnd)
-    {
-    if (errCount == 0)
-        printPslDesc(pslDesc, out, psl);
-    fprintf(out, "\t%s %cStart %u >= %cEnd %u\n",
-            pName, pCLabel, pStart, pCLabel, pEnd);
-    errCount++;
-    }
-if (pEnd > pSize)
-    {
-    if (errCount == 0)
+static void chkError(char* pslDesc, FILE* out, struct psl* psl, int* errCount, char* format, ...)
+/* forward declaration, needed so the gcc printf format attribute can be attached for argument checking */
+#if defined(__GNUC__)
+__attribute__((format(printf, 5, 6)))
+#endif
+;
+
+static void chkError(char* pslDesc, FILE* out, struct psl* psl, int* errCount, char* format, ...)
+/* Report one pslCheck error: print the PSL description if this is the first
+ * error, write the printf-style message to out, then increment *errCount. */
+{
+if (*errCount == 0)
         printPslDesc(pslDesc, out, psl);
-    fprintf(out, "\t%s %cEnd %u >= %cSize %u\n",
-            pName, pCLabel, pEnd, pCLabel, pSize);
-    errCount++;
-    }
-for (iBlk = 0; iBlk < blockCount; iBlk++)
-    {
-    unsigned blkStart = pBlockStarts[iBlk];
-    unsigned blkEnd = blkStart+blockSizes[iBlk];
-    /* translate stand to genomic coords */
-    unsigned gBlkStart = (pStrand == '+') ? blkStart : (pSize - blkEnd);
-    unsigned gBlkEnd = (pStrand == '+') ? blkEnd : (pSize - blkStart);
+va_list args;
+va_start(args, format);
+vfprintf(out, format, args);
+va_end(args);
+(*errCount)++;
+}
 
-    if ((pSize > 0) && (blkEnd > pSize))
-        {
-        if (errCount == 0)
-            printPslDesc(pslDesc, out, psl);
-        fprintf(out, "\t%s %s block %u end %u > %cSize %u\n",
+static void chkBlkRanges(char* pslDesc, FILE* out, struct psl* psl,
+                         char* pName, char* pLabel, char pCLabel, char pStrand,
+                         unsigned pSize, unsigned pStart, unsigned pEnd,
+                         unsigned iBlk, unsigned* blockSizes,
+                         unsigned* pBlockStarts, int blockSizeMult, int* errCount)
+/* check block iBlk of the target or query against pSize, pStart-pEnd and the previous block, reporting via chkError; block sizes are scaled by blockSizeMult */
+{
+unsigned blkStart = pBlockStarts[iBlk];
+unsigned blkEnd = blkStart+(blockSizeMult*blockSizes[iBlk]);
+/* translate strand to genomic coords */
+unsigned gBlkStart = (pStrand == '+') ? blkStart : (pSize - blkEnd);
+unsigned gBlkEnd = (pStrand == '+') ? blkEnd : (pSize - blkStart);
+
+if ((pSize > 0) && (blkEnd > pSize))
+    chkError(pslDesc, out, psl, errCount,
+             "\t%s %s block %u end %u > %cSize %u\n",
                 pName, pLabel, iBlk, blkEnd, pCLabel, pSize);
-        errCount++;
-        }
-    if (gBlkStart < pStart)
-        {
-        if (errCount == 0)
-            printPslDesc(pslDesc, out, psl);
-        fprintf(out, "\t%s %s block %u start %u < %cStart %u\n",
+if (gBlkStart < pStart)
+    chkError(pslDesc, out, psl, errCount,
+             "\t%s %s block %u start %u < %cStart %u\n",
                 pName, pLabel, iBlk, gBlkStart, pCLabel, pStart);
-        errCount++;
-        }
-    if (gBlkStart >= pEnd)
-        {
-        if (errCount == 0)
-            printPslDesc(pslDesc, out, psl);
-        fprintf(out, "\t%s %s block %u start %u >= %cEnd %u\n",
+if (gBlkStart >= pEnd)
+    chkError(pslDesc, out, psl, errCount,
+             "\t%s %s block %u start %u >= %cEnd %u\n",
                 pName, pLabel, iBlk, gBlkStart, pCLabel, pEnd);
-        errCount++;
-        }
-    if (gBlkEnd < pStart)
-        {
-        if (errCount == 0)
-            printPslDesc(pslDesc, out, psl);
-        fprintf(out, "\t%s %s block %u end %u < %cStart %u\n",
+if (gBlkEnd < pStart)
+    chkError(pslDesc, out, psl, errCount,
+             "\t%s %s block %u end %u < %cStart %u\n",
                 pName, pLabel, iBlk, gBlkEnd, pCLabel, pStart);
-        errCount++;
-        }
-    if (gBlkEnd > pEnd)
-        {
-        if (errCount == 0)
-            printPslDesc(pslDesc, out, psl);
-        fprintf(out, "\t%s %s block %u end %u > %cEnd %u\n",
+if (gBlkEnd > pEnd)
+    chkError(pslDesc, out, psl, errCount,
+             "\t%s %s block %u end %u > %cEnd %u\n",
                 pName, pLabel, iBlk, gBlkEnd, pCLabel, pEnd);
-        errCount++;
-        }
-    if ((iBlk > 0) && (blkStart < prevBlkEnd))
+if (iBlk > 0)
         {
-        if (errCount == 0)
-            printPslDesc(pslDesc, out, psl);
-        fprintf(out, "\t%s %s block %u start %u < previous block end %u\n",
+    unsigned prevBlkEnd = pBlockStarts[iBlk-1]+(blockSizeMult*blockSizes[iBlk-1]);
+    if (blkStart < prevBlkEnd)
+        chkError(pslDesc, out, psl, errCount,
+                 "\t%s %s block %u start %u < previous block end %u\n",
                 pName, pLabel, iBlk, blkStart, prevBlkEnd);
-        errCount++;
         }
-    prevBlkEnd = blkEnd;
-    }
-*errCountPtr = errCount;
+}
+
+static void chkRanges(char* pslDesc, FILE* out, struct psl* psl,
+                      char* pName, char* pLabel, char pCLabel, char pStrand,
+                      unsigned pSize, unsigned pStart, unsigned pEnd,
+                      unsigned blockCount, unsigned* blockSizes,
+                      unsigned* pBlockStarts, int blockSizeMult, int* errCount)
+/* check the overall target or query range, then every block (sizes scaled by blockSizeMult), in a PSL; each problem is reported via chkError, which increments *errCount */
+{
+unsigned iBlk;
+if (pStart >= pEnd)
+    chkError(pslDesc, out, psl, errCount,
+             "\t%s %cStart %u >= %cEnd %u\n",
+             pName, pCLabel, pStart, pCLabel, pEnd);
+if (pEnd > pSize)
+    chkError(pslDesc, out, psl, errCount,
+             "\t%s %cEnd %u > %cSize %u\n",
+             pName, pCLabel, pEnd, pCLabel, pSize);
+// check that block start/end matches overall start end; with no blocks the block range is taken as 0-0 instead of indexing blockCount-1
+unsigned pStartStrand = pStart, pEndStrand = pEnd;
+if (pStrand != '+')
+    reverseUnsignedRange(&pStartStrand, &pEndStrand, pSize);
+unsigned firstBlkStart = (blockCount > 0) ? pBlockStarts[0] : 0;
+unsigned lastBlkEnd = (blockCount > 0) ? pBlockStarts[blockCount-1] + (blockSizeMult * blockSizes[blockCount-1]) : 0;
+if ((pStartStrand != firstBlkStart) || (pEndStrand != lastBlkEnd))
+    chkError(pslDesc, out, psl, errCount,
+             "\t%s strand \"%c\" adjusted %cStart-%cEnd range %u-%u != block range %u-%u\n",
+             pName, pStrand, pCLabel, pCLabel, pStartStrand, pEndStrand, firstBlkStart, lastBlkEnd);
+for (iBlk = 0; iBlk < blockCount; iBlk++)
+    chkBlkRanges(pslDesc, out, psl, pName, pLabel, pCLabel, pStrand,
+                 pSize, pStart, pEnd, iBlk, blockSizes, pBlockStarts, blockSizeMult, errCount);
 }
 
 int pslCheck(char *pslDesc, FILE* out, struct psl* psl)
 /* Validate a PSL for consistency.  pslDesc is printed the error messages
@@ -1598,50 +1606,27 @@
 static char* VALID_STRANDS[] = {
     "+", "-", "++", "+-", "-+", "--", NULL
 };
 int i, errCount = 0;
-char strand;
-boolean isProt = FALSE;
+int tBlockSizeMult = pslIsProtein(psl) ? 3 : 1;
 
 /* check strand value */
 for (i = 0; VALID_STRANDS[i] != NULL; i++)
     {
     if (strcmp(psl->strand, VALID_STRANDS[i]) == 0)
         break;
     }
 if (VALID_STRANDS[i] == NULL)
-    {
-    if (errCount == 0)
-        printPslDesc(pslDesc, out, psl);
-    fprintf(out, "\tinvalid PSL strand: \"%s\"\n", psl->strand);
-    errCount++;
-    }
+    chkError(pslDesc, out, psl, &errCount,
+             "\tinvalid PSL strand: \"%s\"\n", psl->strand);
 
 /* check target */
-if (pslIsProtein(psl))
-    {
-    isProt = TRUE;
-    for (i = 0; i < psl->blockCount ; i++)
-	psl->blockSizes[i] *= 3;
-    }
-
-strand = ((psl->strand[1] == '\0') ? '+' : psl->strand[1]);
-chkRanges(pslDesc, out, psl, psl->tName, "target", 't',
-          strand, psl->tSize, psl->tStart, psl->tEnd,
-          psl->blockCount, psl->blockSizes, psl->tStarts,
-          &errCount);
-if (isProt)
-    {
-    for (i = 0; i < psl->blockCount ; i++)
-	psl->blockSizes[i] /= 3;
-    }
+chkRanges(pslDesc, out, psl, psl->tName, "target", 't', pslTStrand(psl), psl->tSize, psl->tStart, psl->tEnd,
+          psl->blockCount, psl->blockSizes, psl->tStarts, tBlockSizeMult, &errCount);
 
 /* check query */
-strand = psl->strand[0];
-chkRanges(pslDesc, out, psl, psl->qName, "query", 'q',
-          strand, psl->qSize, psl->qStart, psl->qEnd,
-          psl->blockCount, psl->blockSizes, psl->qStarts,
-          &errCount);
+chkRanges(pslDesc, out, psl, psl->qName, "query", 'q', pslQStrand(psl), psl->qSize, psl->qStart, psl->qEnd,
+          psl->blockCount, psl->blockSizes, psl->qStarts, 1, &errCount);
 
 return errCount;
 }