e9e2915af4fc81db56ce4b94d2ceccd7f10cfe9b
markd
  Mon Oct 4 09:21:24 2010 -0700
added handling of misidentified psiblast files
diff --git src/hg/blastToPsl/blastXmlToPsl.c src/hg/blastToPsl/blastXmlToPsl.c
index c560443..bb71be2 100644
--- src/hg/blastToPsl/blastXmlToPsl.c
+++ src/hg/blastToPsl/blastXmlToPsl.c
@@ -28,9 +28,11 @@
   "   to nucleic coordinates\n"
   "  -qName=src - define element used to obtain the qName.  The following\n"
   "   values are support:\n"
-  "     o Iteration_query-ID - use contents of the <Iteration_query-ID> element.\n"
-  "     o Iteration_query-def0 - use the first white-space separated word of the\n"
-  "       <Iteration_query-def> element.\n"
+  "     o query-ID - use contents of the <Iteration_query-ID> element if it\n"
+  "       exists, otherwise use <BlastOutput_query-ID>\n"
+  "     o query-def0 - use the first white-space separated word of the\n"
+  "       <Iteration_query-def> element if it exists, otherwise the first word\n"
+  "       of <BlastOutput_query-def>.\n"
   "   Default is query-def0.\n"
   "  -tName=src - define element used to obtain the tName.  The following\n"
   "   values are support:\n"
@@ -39,6 +41,8 @@
   "       <Hit_def> element.\n"
   "     o Hit_accession - contents of the <Hit_accession> element.\n"
   "   Default is Hit-def0.\n"
+  "  -forcePsiBlast - treat as output of PSI-BLAST. blast-2.2.16 and maybe\n"
+  "   others identify psiblast as blastp.\n"
   "\n"
   "Output only results of last round from PSI BLAST\n");
 }
@@ -50,12 +54,13 @@
     {"convertToNucCoords", OPTION_BOOLEAN},
     {"qName", OPTION_STRING},
     {"tName", OPTION_STRING},
+    {"forcePsiBlast", OPTION_BOOLEAN},
     {NULL, 0},
 };
 
 enum qNameSrc {
-    qNameSrcIterationQueryId,
-    qNameSrcIterationQueryDef0
+    qNameSrcQueryId,
+    qNameSrcQueryDef0
 };
 
 enum tNameSrc {
@@ -69,7 +74,8 @@
 static boolean pslxFmt = FALSE; /* output in pslx format */
 static int errCount = 0; /* count of  PSLs failing checks */
 static boolean convertToNucCoords = FALSE; /* adjust query coordinates */
-static enum qNameSrc qNameSrc = qNameSrcIterationQueryDef0;   /* source of qName */
+static boolean forcePsiBlast = FALSE; /* assume PSI-BLAST output  */
+static enum qNameSrc qNameSrc = qNameSrcQueryDef0;   /* source of qName */
 static enum tNameSrc tNameSrc = tNameSrcHitDef0;   /* source of tName */
 
 struct coords
@@ -101,6 +107,8 @@
 /* determine blast algorithm and other flags */
 {
 unsigned algo = pslBuildGetBlastAlgo(outputRec->ncbiBlastBlastOutputProgram->text);
+if (forcePsiBlast)
+    algo = psiblast;
 if (convertToNucCoords && (algo != tblastn))
     errAbort("-convertToNucCoords only support for TBLASTN");
 return algo | (convertToNucCoords ? cnvNucCoords : 0) | (pslxFmt ? bldPslx : 0);
@@ -113,10 +121,12 @@
 pslCheck("blastXmlToPsl", stderr, psl);
 }
 
-static void outputScore(struct psl *psl, struct ncbiBlastIteration *iterRec, struct ncbiBlastHit *hitRec, struct ncbiBlastHsp *hspRec, FILE* scoreFh)
+static void outputScore(struct psl *psl, struct ncbiBlastBlastOutput *outputRec, struct ncbiBlastIteration *iterRec, struct ncbiBlastHit *hitRec, struct ncbiBlastHsp *hspRec, FILE* scoreFh)
 /* output score record */
 {
-pslBuildScoresWriteWithDefs(scoreFh, psl, hspRec->ncbiBlastHspBitScore->text, hspRec->ncbiBlastHspEvalue->text, iterRec->ncbiBlastIterationQueryDef->text, hitRec->ncbiBlastHitDef->text);
+pslBuildScoresWriteWithDefs(scoreFh, psl, hspRec->ncbiBlastHspBitScore->text, hspRec->ncbiBlastHspEvalue->text, 
+                            (iterRec->ncbiBlastIterationQueryDef != NULL) ? iterRec->ncbiBlastIterationQueryDef->text : outputRec->ncbiBlastBlastOutputQueryDef->text,
+                            hitRec->ncbiBlastHitDef->text);
 }
 
 static void appendFirstWord(struct dyString *buf, char *str)
@@ -128,7 +138,7 @@
 dyStringAppendN(buf, str, (end - str));
 }
 
-static char *getQName(struct ncbiBlastIteration *iterRec)
+static char *getQName(struct ncbiBlastBlastOutput *outputRec, struct ncbiBlastIteration *iterRec)
 /* obtain the qName give the requested source */
 {
 static struct dyString *buf = NULL;
@@ -137,11 +147,15 @@
 dyStringClear(buf);
 switch (qNameSrc)
     {
-    case qNameSrcIterationQueryId:
-        dyStringAppend(buf, iterRec->ncbiBlastIterationQueryID->text);
+    case qNameSrcQueryId:
+        dyStringAppend(buf, (iterRec->ncbiBlastIterationQueryID != NULL)
+                       ? iterRec->ncbiBlastIterationQueryID->text
+                       : outputRec->ncbiBlastBlastOutputQueryID->text);
         break;
-    case qNameSrcIterationQueryDef0:
-        appendFirstWord(buf, iterRec->ncbiBlastIterationQueryDef->text);
+    case qNameSrcQueryDef0:
+        appendFirstWord(buf, (iterRec->ncbiBlastIterationQueryDef != NULL)
+                        ? iterRec->ncbiBlastIterationQueryDef->text
+                        : outputRec->ncbiBlastBlastOutputQueryDef->text);
         break;        
     }
 return buf->string;
@@ -169,27 +183,30 @@
 return buf->string;
 }
 
-static void processHspRec(struct ncbiBlastIteration *iterRec, struct ncbiBlastHit *hitRec,
+static void processHspRec(struct ncbiBlastBlastOutput *outputRec, struct ncbiBlastIteration *iterRec, struct ncbiBlastHit *hitRec,
                           struct ncbiBlastHsp *hspRec, unsigned flags, FILE *pslFh, FILE *scoreFh)
 /* process one HSP record, converting to a PSL */
 {
-struct coords qUcsc = blastToUcsc(hspRec->ncbiBlastHspQueryFrom->text, hspRec->ncbiBlastHspQueryTo->text, iterRec->ncbiBlastIterationQueryLen->text,
+int queryLen = (iterRec->ncbiBlastIterationQueryLen != NULL) 
+    ? iterRec->ncbiBlastIterationQueryLen->text
+    : outputRec->ncbiBlastBlastOutputQueryLen->text;
+struct coords qUcsc = blastToUcsc(hspRec->ncbiBlastHspQueryFrom->text, hspRec->ncbiBlastHspQueryTo->text, queryLen,
                                   ((hspRec->ncbiBlastHspQueryFrame == NULL) ? 0 : hspRec->ncbiBlastHspQueryFrame->text));
 struct coords tUcsc = blastToUcsc(hspRec->ncbiBlastHspHitFrom->text, hspRec->ncbiBlastHspHitTo->text, hitRec->ncbiBlastHitLen->text,
                                   ((hspRec->ncbiBlastHspHitFrame == NULL) ? 0 : hspRec->ncbiBlastHspHitFrame->text));
-struct psl *psl = pslBuildFromHsp(getQName(iterRec), qUcsc.size, qUcsc.start, qUcsc.end, qUcsc.strand, hspRec->ncbiBlastHspQseq->text,
+struct psl *psl = pslBuildFromHsp(getQName(outputRec, iterRec), qUcsc.size, qUcsc.start, qUcsc.end, qUcsc.strand, hspRec->ncbiBlastHspQseq->text,
                                   getTName(hitRec),  tUcsc.size, tUcsc.start, tUcsc.end, tUcsc.strand, hspRec->ncbiBlastHspHseq->text,
                                   flags);
 if  ((psl->blockCount > 0) && ((hspRec->ncbiBlastHspEvalue->text <= eVal) || (eVal == -1)))
     {
     outputPsl(psl, pslFh);
     if (scoreFh != NULL)
-        outputScore(psl, iterRec, hitRec, hspRec, scoreFh);
+        outputScore(psl, outputRec, iterRec, hitRec, hspRec, scoreFh);
     }
 pslFree(&psl);
 }
 
-static void processIterRec(struct ncbiBlastIteration *iterRec, unsigned flags, FILE *pslFh, FILE *scoreFh)
+static void processIterRec(struct ncbiBlastBlastOutput *outputRec, struct ncbiBlastIteration *iterRec, unsigned flags, FILE *pslFh, FILE *scoreFh)
 /* process one iteration record, converting all HSPs to PSLs */
 {
 struct ncbiBlastIterationHits *hitsRec;
@@ -204,7 +221,7 @@
             struct ncbiBlastHsp *hspRec;
             for (hspRec = hspsRec->ncbiBlastHsp; hspRec != NULL; hspRec = hspRec->next)
                 {
-                processHspRec(iterRec, hitRec, hspRec, flags, pslFh, scoreFh);
+                processHspRec(outputRec, iterRec, hitRec, hspRec, flags, pslFh, scoreFh);
                 }
             }
         }
@@ -219,7 +236,7 @@
     {
     struct ncbiBlastIteration *iterRec;
     for (iterRec = itersRec->ncbiBlastIteration; iterRec != NULL; iterRec = iterRec->next)
-        processIterRec(iterRec, flags, pslFh, scoreFh);
+        processIterRec(outputRec, iterRec, flags, pslFh, scoreFh);
     }
 }
 
@@ -229,7 +246,8 @@
 struct ncbiBlastIteration *nextRec;
 for (nextRec = iterRec->next; nextRec != NULL ; iterRec = nextRec, nextRec = nextRec->next)
     {
-    if (!sameString(nextRec->ncbiBlastIterationQueryDef->text, iterRec->ncbiBlastIterationQueryDef->text))
+    if ((nextRec->ncbiBlastIterationQueryDef != NULL)
+        && !sameString(nextRec->ncbiBlastIterationQueryDef->text, iterRec->ncbiBlastIterationQueryDef->text))
         break;
     }
 return iterRec;
@@ -242,7 +260,7 @@
 for (itersRec = outputRec->ncbiBlastBlastOutputIterations; itersRec != NULL; itersRec = itersRec->next)
     {
     struct ncbiBlastIteration *iterRec = findLastIterForQuery(itersRec->ncbiBlastIteration);
-    processIterRec(iterRec, flags, pslFh, scoreFh);
+    processIterRec(outputRec, iterRec, flags, pslFh, scoreFh);
     }
 }
 
@@ -284,14 +302,15 @@
 eVal = optionDouble("eVal", eVal);
 pslxFmt = optionExists("pslx");
 convertToNucCoords = optionExists("convertToNucCoords");
+forcePsiBlast = optionExists("forcePsiBlast");
 
-char *qNameSrcStr = optionVal("qName", "Iteration_query-def0");
-if (sameString(qNameSrcStr, "Iteration_query-ID"))
-    qNameSrc = qNameSrcIterationQueryId;
-else if (sameString(qNameSrcStr, "Iteration_query-def0"))
-    qNameSrc = qNameSrcIterationQueryDef0;
+char *qNameSrcStr = optionVal("qName", "query-def0");
+if (sameString(qNameSrcStr, "query-ID"))
+    qNameSrc = qNameSrcQueryId;
+else if (sameString(qNameSrcStr, "query-def0"))
+    qNameSrc = qNameSrcQueryDef0;
 else
-    errAbort("invalid value for -qName, expect on of: \"Iteration_query-ID\", or \"Iteration_query-def0\", got \"%s\"", qNameSrcStr);
+    errAbort("invalid value for -qName, expect one of: \"query-ID\", or \"query-def0\", got \"%s\"", qNameSrcStr);
 
 char *tNameSrcStr = optionVal("tName", "Hit_def0");
 if (sameString(tNameSrcStr, "Hit_id"))