9ab015136bbc854a5dc03b71d5d465771539f6c3
angie
  Thu Sep 7 11:31:20 2017 -0700
pslRecalcMatch: new option -ignoreQMissing for when we have fasta for only a subset of query sequences.

diff --git src/hg/pslRecalcMatch/pslRecalcMatch.c src/hg/pslRecalcMatch/pslRecalcMatch.c
index 9def72c..631e33a 100644
--- src/hg/pslRecalcMatch/pslRecalcMatch.c
+++ src/hg/pslRecalcMatch/pslRecalcMatch.c
@@ -23,38 +23,41 @@
 /* Explain usage and exit. */
 {
 errAbort(
   "pslRecalcMatch - Recalculate match,mismatch,repMatch columns in psl file.\n"
   "This can be useful if the psl went through pslMap, or if you've added \n"
   "lower-case repeat masking after the fact\n"
   "usage:\n"
   "   pslRecalcMatch in.psl targetSeq querySeq out.psl\n"
   "where targetSeq is either a nib directory or a two bit file\n"
   "and querySeq is a fasta file, nib file, two bit file, or list\n"
   "of such files.  The psl's should be simple non-translated ones.\n"
   "This will work faster if the in.psl is sorted on target.\n"
   "options:\n"
   "   -ignoreQUniq - ignore everything after the last `-' in the qName field, that\n"
   "    is sometimes used to generate a unique identifier\n"
+  "   -ignoreQMissing - pass through the record if querySeq doesn't include qName\n"
   );
 }
 
 static struct optionSpec options[] = {
    {"ignoreQUniq", OPTION_BOOLEAN},
+   {"ignoreQMissing", OPTION_BOOLEAN},
    {NULL, 0},
 };
 static boolean ignoreQUniq = FALSE;
+static boolean ignoreQMissing = FALSE;
 
 
 static char *getQName(char *qName)
 /* get query name, optionally dropping trailing unique identifier.
  * WARNING: static return */
 {
 static struct dyString *buf = NULL;
 if (ignoreQUniq)
     {
     if (buf == NULL)
         buf = dyStringNew(2*strlen(qName));
     dyStringClear(buf);
     char *dash = strrchr(qName, '-');
     if (dash == NULL)
         return qName;
@@ -118,34 +121,39 @@
  * This can be useful if the psl went through pslMap, or if you've added 
  * lower-case repeat masking after the fact. */
 {
 struct nibTwoCache *tCache = nibTwoCacheNew(targetName);
 struct dnaSeq *qSeqList = dnaLoadAll(queryName);
 struct hash *qHash = dnaSeqHash(qSeqList);
 struct psl *psl;
 struct lineFile *lf = pslFileOpen(inName);
 FILE *f = mustOpen(outName, "w");
 
 while ((psl = pslNext(lf)) != NULL)
     {
     int tSize;
     struct dnaSeq *tSeqPart = nibTwoCacheSeqPart(tCache,
     	psl->tName, psl->tStart, psl->tEnd - psl->tStart, &tSize);
-    struct dnaSeq *qSeq = hashMustFindVal(qHash, getQName(psl->qName));
+    char *qName = getQName(psl->qName);
+    struct dnaSeq *qSeq = hashFindVal(qHash, qName);
+    if (!ignoreQMissing && qSeq == NULL)
+        errAbort("Can't find sequence for qName '%s'", qName);
+    else if (qSeq)
         recalcMatches(psl, tSeqPart, psl->tStart, qSeq);
     pslTabOut(psl, f);
     dnaSeqFree(&tSeqPart);
     }
 carefulClose(&f);
 lineFileClose(&lf);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 5)
     usage();
 ignoreQUniq = optionExists("ignoreQUniq");
+ignoreQMissing = optionExists("ignoreQMissing");
 pslRecalcMatch(argv[1], argv[2], argv[3], argv[4]);
 return 0;
 }