2f48d68cce72bd8ddfb21a11da72d244b7c8c479 markd Sun Oct 3 08:31:41 2010 -0700 added options to specify how to obtained qName and tName. Output query and target defs in score file diff --git src/hg/blastToPsl/blastXmlToPsl.c src/hg/blastToPsl/blastXmlToPsl.c index 26d06ff..c560443 100644 --- src/hg/blastToPsl/blastXmlToPsl.c +++ src/hg/blastToPsl/blastXmlToPsl.c @@ -18,7 +18,7 @@ "\n" "options:\n" " -scores=file - Write score information to this file. Format is:\n" - " strands qName qStart qEnd tName tStart tEnd bitscore eVal\n" + " strands qName qStart qEnd tName tStart tEnd bitscore eVal qDef tDef\n" " -verbose=n - n >= 3 prints each line of file after parsing.\n" " n >= 4 dumps the result of each query\n" " -eVal=n n is e-value threshold to filter results. Format can be either\n" @@ -26,6 +26,19 @@ " -pslx - create PSLX output (includes sequences for blocks)\n" " -convertToNucCoords - convert protein to nucleic alignments to nucleic\n" " to nucleic coordinates\n" + " -qName=src - define element used to obtain the qName. The following\n" + " values are support:\n" + " o Iteration_query-ID - use contents of the element.\n" + " o Iteration_query-def0 - use the first white-space separated word of the\n" + " element.\n" + " Default is query-def0.\n" + " -tName=src - define element used to obtain the tName. The following\n" + " values are support:\n" + " o Hit_id - use contents of the element.\n" + " o Hit_def0 - use the first white-space separated word of the\n" + " element.\n" + " o Hit_accession - contents of the element.\n" + " Default is Hit-def0.\n" "\n" "Output only results of last round from PSI BLAST\n"); } @@ -35,13 +48,29 @@ {"eVal", OPTION_DOUBLE}, {"pslx", OPTION_BOOLEAN}, {"convertToNucCoords", OPTION_BOOLEAN}, + {"qName", OPTION_STRING}, + {"tName", OPTION_STRING}, {NULL, 0}, }; +enum qNameSrc { + qNameSrcIterationQueryId, + qNameSrcIterationQueryDef0 +}; + +enum tNameSrc { + tNameSrcHitId, + tNameSrcHitDef0, + tNameSrcHitAccession +}; + + static double eVal = -1; /* default Expect value signifying no filtering */ static boolean pslxFmt = FALSE; /* output in pslx format */ static int errCount = 0; /* count of PSLs failing checks */ -static boolean convertToNucCoords = 1; /* adjust query coordinates */ +static boolean convertToNucCoords = FALSE; /* adjust query coordinates */ +static enum qNameSrc qNameSrc = qNameSrcIterationQueryDef0; /* source of qName */ +static enum tNameSrc tNameSrc = tNameSrcHitDef0; /* source of tName */ struct coords /* structure to return converted coordinates */ @@ -77,15 +106,69 @@ return algo | (convertToNucCoords ? cnvNucCoords : 0) | (pslxFmt ? bldPslx : 0); } -static void outputPsl(struct psl *psl, struct ncbiBlastHsp *hspRec, FILE* pslFh, FILE* scoreFh) -/* output a psl and optional score */ +static void outputPsl(struct psl *psl, FILE* pslFh) +/* output a psl */ { pslTabOut(psl, pslFh); -if (scoreFh != NULL) - pslBuildWriteScores(scoreFh, psl, hspRec->ncbiBlastHspBitScore->text, hspRec->ncbiBlastHspEvalue->text); pslCheck("blastXmlToPsl", stderr, psl); } +static void outputScore(struct psl *psl, struct ncbiBlastIteration *iterRec, struct ncbiBlastHit *hitRec, struct ncbiBlastHsp *hspRec, FILE* scoreFh) +/* output score record */ +{ +pslBuildScoresWriteWithDefs(scoreFh, psl, hspRec->ncbiBlastHspBitScore->text, hspRec->ncbiBlastHspEvalue->text, iterRec->ncbiBlastIterationQueryDef->text, hitRec->ncbiBlastHitDef->text); +} + +static void appendFirstWord(struct dyString *buf, char *str) +/* append the first white-spaced word from str */ +{ +char *end = skipToSpaces(str); +if (end == NULL) + end = str + strlen(str); +dyStringAppendN(buf, str, (end - str)); +} + +static char *getQName(struct ncbiBlastIteration *iterRec) +/* obtain the qName give the requested source */ +{ +static struct dyString *buf = NULL; +if (buf == NULL) + buf = dyStringNew(32); +dyStringClear(buf); +switch (qNameSrc) + { + case qNameSrcIterationQueryId: + dyStringAppend(buf, iterRec->ncbiBlastIterationQueryID->text); + break; + case qNameSrcIterationQueryDef0: + appendFirstWord(buf, iterRec->ncbiBlastIterationQueryDef->text); + break; + } +return buf->string; +} + +static char *getTName(struct ncbiBlastHit *hitRec) +/* obtain the tName give the requested source */ +{ +static struct dyString *buf = NULL; +if (buf == NULL) + buf = dyStringNew(32); +dyStringClear(buf); +switch (tNameSrc) + { + case tNameSrcHitId: + dyStringAppend(buf, hitRec->ncbiBlastHitId->text); + break; + case tNameSrcHitDef0: + appendFirstWord(buf, hitRec->ncbiBlastHitDef->text); + break; + case tNameSrcHitAccession: + dyStringAppend(buf, hitRec->ncbiBlastHitAccession->text); + break; + } +return buf->string; +} + static void processHspRec(struct ncbiBlastIteration *iterRec, struct ncbiBlastHit *hitRec, struct ncbiBlastHsp *hspRec, unsigned flags, FILE *pslFh, FILE *scoreFh) /* process one HSP record, converting to a PSL */ @@ -94,15 +177,15 @@ ((hspRec->ncbiBlastHspQueryFrame == NULL) ? 0 : hspRec->ncbiBlastHspQueryFrame->text)); struct coords tUcsc = blastToUcsc(hspRec->ncbiBlastHspHitFrom->text, hspRec->ncbiBlastHspHitTo->text, hitRec->ncbiBlastHitLen->text, ((hspRec->ncbiBlastHspHitFrame == NULL) ? 0 : hspRec->ncbiBlastHspHitFrame->text)); -char *qId = firstWordInLine(iterRec->ncbiBlastIterationQueryDef->text); -char *tId = (flags & psiblast) - ? firstWordInLine(hitRec->ncbiBlastHitId->text) - : firstWordInLine(hitRec->ncbiBlastHitDef->text); -struct psl *psl = pslBuildFromHsp(qId, qUcsc.size, qUcsc.start, qUcsc.end, qUcsc.strand, hspRec->ncbiBlastHspQseq->text, - tId, tUcsc.size, tUcsc.start, tUcsc.end, tUcsc.strand, hspRec->ncbiBlastHspHseq->text, +struct psl *psl = pslBuildFromHsp(getQName(iterRec), qUcsc.size, qUcsc.start, qUcsc.end, qUcsc.strand, hspRec->ncbiBlastHspQseq->text, + getTName(hitRec), tUcsc.size, tUcsc.start, tUcsc.end, tUcsc.strand, hspRec->ncbiBlastHspHseq->text, flags); if ((psl->blockCount > 0) && ((hspRec->ncbiBlastHspEvalue->text <= eVal) || (eVal == -1))) - outputPsl(psl, hspRec, pslFh, scoreFh); + { + outputPsl(psl, pslFh); + if (scoreFh != NULL) + outputScore(psl, iterRec, hitRec, hspRec, scoreFh); + } pslFree(&psl); } @@ -158,14 +241,10 @@ struct ncbiBlastBlastOutputIterations *itersRec; for (itersRec = outputRec->ncbiBlastBlastOutputIterations; itersRec != NULL; itersRec = itersRec->next) { - struct ncbiBlastIteration *iterRec; - for (iterRec = itersRec->ncbiBlastIteration; iterRec != NULL; iterRec = iterRec->next) - { - iterRec = findLastIterForQuery(iterRec); + struct ncbiBlastIteration *iterRec = findLastIterForQuery(itersRec->ncbiBlastIteration); processIterRec(iterRec, flags, pslFh, scoreFh); } } -} static void blastXmlToPsl(char *blastXmlFile, char *pslFile, char *scoreFile) /* blastXmlToPsl - convert blast XML output to PSLs. */ @@ -175,10 +254,7 @@ FILE *pslFh = mustOpen(pslFile, "w"); FILE *scoreFh = NULL; if (scoreFile != NULL) - { - scoreFh = mustOpen(scoreFile, "w"); - fputs(pslBuildScoreHdr, scoreFh); - } + scoreFh = pslBuildScoresOpen(scoreFile, TRUE); if (xap->topObject == NULL) errAbort("empty BLAST XML file: %s", blastXmlFile); @@ -208,6 +284,25 @@ eVal = optionDouble("eVal", eVal); pslxFmt = optionExists("pslx"); convertToNucCoords = optionExists("convertToNucCoords"); + +char *qNameSrcStr = optionVal("qName", "Iteration_query-def0"); +if (sameString(qNameSrcStr, "Iteration_query-ID")) + qNameSrc = qNameSrcIterationQueryId; +else if (sameString(qNameSrcStr, "Iteration_query-def0")) + qNameSrc = qNameSrcIterationQueryDef0; +else + errAbort("invalid value for -qName, expect on of: \"Iteration_query-ID\", or \"Iteration_query-def0\", got \"%s\"", qNameSrcStr); + +char *tNameSrcStr = optionVal("tName", "Hit_def0"); +if (sameString(tNameSrcStr, "Hit_id")) + tNameSrc = tNameSrcHitId; +else if (sameString(tNameSrcStr, "Hit_def0")) + tNameSrc = tNameSrcHitDef0; +else if (sameString(tNameSrcStr, "Hit_accession")) + tNameSrc = tNameSrcHitAccession; +else + errAbort("invalid value for -tName, expect on of: \"Hit_id\", \"Hit_def0\", or \"Hit_accession\", got \"%s\"", tNameSrcStr); + blastXmlToPsl(argv[1], argv[2], optionVal("scores", NULL)); if (errCount > 0) errAbort("%d invalid PSLs created", errCount);