f1ed9037745e40b85f0e3908f6e8f1aa497cf753
markd
  Sun Nov 6 16:09:27 2016 -0800
added option to pslCheck to strip qName uniqueness modifiers

diff --git src/hg/pslCheck/pslCheck.c src/hg/pslCheck/pslCheck.c
index 45fb9b1..8174eba 100644
--- src/hg/pslCheck/pslCheck.c
+++ src/hg/pslCheck/pslCheck.c
@@ -9,65 +9,69 @@
 #include "chromInfo.h"
 #include "verbose.h"
 
 
 /* command line options and values */
 static struct optionSpec optionSpecs[] =
 {
     {"db", OPTION_STRING},
     {"prot", OPTION_BOOLEAN},
     {"quiet", OPTION_BOOLEAN},
     {"noCountCheck", OPTION_BOOLEAN},
     {"targetSizes", OPTION_STRING},
     {"querySizes", OPTION_STRING},
     {"pass", OPTION_STRING},
     {"fail", OPTION_STRING},
+    {"ignoreQUniq", OPTION_BOOLEAN},
     {NULL, 0}
 };
 static char *db = NULL;
 static boolean protCheck = FALSE;
 static boolean quiet = FALSE;
 static boolean noCountCheck = FALSE;
 static char *passFile = NULL;
 static char *failFile = NULL;
+static boolean ignoreQUniq = FALSE;
 static struct hash *targetSizes = NULL;
 static struct hash *querySizes = NULL;
 
 /* global count of alignments checked and errors */
 static int chkCount = 0;
 static int failCount = 0;
 static int errCount = 0;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "pslCheck - validate PSL files\n"
   "usage:\n"
   "   pslCheck fileTbl(s)\n"
   "options:\n"
   "   -db=db - get targetSizes from this database, and if file doesn't exist,\n"
   "    look for a table in this database.\n"
   "   -prot - confirm psls are protein psls\n"
   "   -noCountCheck - don't validate that match/mismatch counts are match\n"
   "    the total size of the alignment blocks\n"
   "   -pass=pslFile - write PSLs without errors to this file\n"
   "   -fail=pslFile - write PSLs with errors to this file\n"
   "   -targetSizes=sizesFile - tab file with columns of target and size.\n"
   "    If specified, psl is check to have a valid target and target\n"
   "    coordinates.\n"
   "   -querySizes=sizesFile - file with query sizes.\n"
+  "   -ignoreQUniq - ignore everything after the last `-' in the qName field, that\n"
+  "    is sometimes used to generate a unique identifier\n"
   "   -quiet - no write error message, just filter\n");
 }
 
 static struct hash *loadSizes(char *sizesFile)
 /* load a sizes file */
 {
 struct hash *sizes = hashNew(20);
 struct lineFile *lf = lineFileOpen(sizesFile, TRUE);
 char *cols[2];
 
 while (lineFileNextRowTab(lf, cols, ArraySize(cols)))
     hashAddInt(sizes, cols[0], sqlUnsigned(cols[1]));
 lineFileClose(&lf);
 return sizes;
 }
@@ -85,30 +89,50 @@
     chromInfoFree(&ci);
     }
 sqlFreeResult(&sr);
 return sizes;
 }
 
 static void prPslDesc(struct psl *psl, char *pslDesc,FILE *errFh)
 /* print a description of psl before the first error.  */
 {
 fprintf(errFh, "Error: invalid PSL: %s:%u-%u %s:%u-%u %s %s\n",
         psl->qName, psl->qStart, psl->qEnd,
         psl->tName, psl->tStart, psl->tEnd,
         psl->strand, pslDesc);
 }
 
+static char *getQName(char *qName)
+/* Return qName, or with -ignoreQUniq the part before its last '-' (qName itself if no '-').
+ * WARNING: static return; a truncated name is only valid until the next call. */
+{
+static struct dyString *buf = NULL;  /* persists across calls */
+if (ignoreQUniq)
+    {
+    if (buf == NULL)
+        buf = dyStringNew(2*strlen(qName));  /* allocated once, on first use */
+    dyStringClear(buf);  /* reset the buffer before reuse */
+    char *dash = strrchr(qName, '-');  /* last '-' starts the uniqueness suffix */
+    if (dash == NULL)
+        return qName;  /* nothing to drop: caller's own string is returned */
+    dyStringAppendN(buf, qName, (dash-qName));  /* copy only the chars before the dash */
+    return buf->string;
+    }
+else
+    return qName;
+}
+
 static int checkSize(struct psl *psl, char *pslDesc, char *sizeDesc,
                      int numErrs, struct hash *sizeTbl, char *name, int size,
                      FILE *errFh)
 /* check a size, error count (0 or 1) */
 {
 int expectSz = hashIntValDefault(sizeTbl, name, -1);
 if (expectSz < 0)
     {
     if (numErrs == 0)
         prPslDesc(psl, pslDesc, errFh);
     fprintf(errFh, "\t%s \"%s\" does not exist\n", sizeDesc, name);
     return 1;
     }
 if (size != expectSz)
     {
@@ -154,31 +178,31 @@
 else
     safef(pslDesc, sizeof(pslDesc), "%s", tbl);
 numErrs += pslCheck(pslDesc, errFh, psl);
 if (!noCountCheck)
     numErrs += checkCounts(psl, pslDesc, numErrs, errFh);
 if (protCheck && !pslIsProtein(psl))
     {
     if (numErrs == 0)
         prPslDesc(psl, pslDesc, errFh);
     fprintf(errFh, "\tnot a protein psl\n");
     numErrs++;
     }
 if (targetSizes != NULL)
     numErrs += checkSize(psl, pslDesc, "target", numErrs, targetSizes, psl->tName, psl->tSize, errFh);
 if (querySizes != NULL)
-    numErrs += checkSize(psl, pslDesc, "query", numErrs, querySizes, psl->qName, psl->qSize, errFh);
+    numErrs += checkSize(psl, pslDesc, "query", numErrs, querySizes, getQName(psl->qName), psl->qSize, errFh);
 if ((passFh != NULL) && (numErrs == 0))
     pslTabOut(psl, passFh);
 if ((failFh != NULL) && (numErrs > 0))
     pslTabOut(psl, failFh);
 errCount += numErrs;
 chkCount++;
 if (numErrs > 0)
     failCount++;
 }
 
 static void checkPslFile(char *fileName, FILE *errFh,
                          FILE *passFh, FILE *failFh)
 /* Check one psl file */
 {
 struct lineFile *lf = pslFileOpen(fileName);
@@ -237,30 +261,31 @@
 carefulClose(&failFh);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, optionSpecs);
 if (argc < 2)
     usage();
 db = optionVal("db", NULL);
 protCheck = optionExists("prot");
 noCountCheck = optionExists("noCountCheck");
 quiet = optionExists("quiet");
 passFile = optionVal("pass", NULL);
 failFile = optionVal("fail", NULL);
+ignoreQUniq = optionExists("ignoreQUniq");
 struct sqlConnection *conn = NULL;
 if (db != NULL)
     conn = sqlConnect(db);
 
 if (optionExists("targetSizes"))
     targetSizes = loadSizes(optionVal("targetSizes", NULL));
 else if (db != NULL)
     targetSizes = loadChromInfoSizes(conn);
 if (optionExists("querySizes"))
     querySizes = loadSizes(optionVal("querySizes", NULL));
 checkFilesTbls(conn, argc-1, argv+1);
 sqlDisconnect(&conn);
 verbose(1, "checked: %d failed: %d errors: %d\n", chkCount, failCount, errCount);
 return ((errCount == 0) ? 0 : 1);
 }