386915fb7bdbae22f8ed152174d8b8635524c1b2 galt Sun Oct 28 00:51:33 2018 -0700 code review fixes to hgBlat Search ALL. refs #22251 diff --git src/hg/hgBlat/hgBlat.c src/hg/hgBlat/hgBlat.c index 2e374c6..3bbafdd 100644 --- src/hg/hgBlat/hgBlat.c +++ src/hg/hgBlat/hgBlat.c @@ -20,30 +20,31 @@ #include "cart.h" #include "dbDb.h" #include "blatServers.h" #include "web.h" #include "hash.h" #include "botDelay.h" #include "trashDir.h" #include "trackHub.h" #include "hgConfig.h" #include "errCatch.h" #include "portable.h" #include "portable.h" #include "dystring.h" #include "chromInfo.h" #include "net.h" +#include "fuzzyFind.h" struct cart *cart; /* The user's ui state. */ struct hash *oldVars = NULL; boolean orgChange = FALSE; boolean dbChange = FALSE; boolean allGenomes = FALSE; struct gfResult /* Detailed gfServer results, this is a span of several nearby tiles, minimum 2 for dna. */ { struct gfResult *next; /* have to multiply translated coordinates by 3 */ int qStart; /* Query Start Coordinate */ @@ -83,74 +84,74 @@ int maxGeneTStart; /* Target Start Coordinate for gene with max hits */ int maxGeneTEnd; /* Target End Coordinate for gene with max hits*/ int maxGeneExons; /* Number of Exons in gene with max hits */ char maxGeneStrand[3]; /* + or - or ++ +- -+ -- Strand for gene with max hits */ char maxGeneTStrand;/* + or - TStrand for gene with max hits */ boolean done; /* Did the job get to finish */ boolean error; /* Some error happened */ char *networkErrMsg; /* Network layer error message */ struct dyString *dbg; /* Output debugging info */ struct gfResult *gfList; /* List of gfResult records */ boolean hide; /* To not show both strands, suppress the weaker-scoring one */ }; boolean debuggingGfResults = FALSE; //TRUE; -int slGfResultsCmp(const void *va, const void *vb) +int gfResultsCmp(const void *va, const void *vb) /* Compare two gfResults. */ { const struct gfResult *a = *((struct gfResult **)va); const struct gfResult *b = *((struct gfResult **)vb); int result = a->tStrand - b->tStrand; if (result == 0) { result = strcmp(a->chrom, b->chrom); if (result == 0) { return (a->tStart - b->tStart); } else return result; } else return result; } -int slRcPairsCmp(const void *va, const void *vb) +int rcPairsCmp(const void *va, const void *vb) /* Recombine Reverse-complimented Pairs (to hide the weaker one). */ { const struct genomeHits *a = *((struct genomeHits **)va); const struct genomeHits *b = *((struct genomeHits **)vb); // int result = strcmp(a->faName, b->faName); order by faName alphabetical int result = a->seqNumber - b->seqNumber; if (result == 0) { result = strcmp(a->db, b->db); if (result == 0) { return (a->queryRC - b->queryRC); } else return result; } else return result; } -int slHitsCmp(const void *va, const void *vb) -/* Compare two slHits. */ +int genomeHitsCmp(const void *va, const void *vb) +/* Compare two genomeHits. */ { const struct genomeHits *a = *((struct genomeHits **)va); const struct genomeHits *b = *((struct genomeHits **)vb); // int result = strcmp(a->faName, b->faName); order by faName alphabetical int result = a->seqNumber - b->seqNumber; if (result == 0) { if (a->error && b->error) return 0; else if (b->error && !a->error) return -1; else if (!b->error && a->error) return 1; else { @@ -223,39 +224,41 @@ } } static int remoteParallelLoadWait(int maxTimeInSeconds) /* Wait, checking to see if finished (completed or errAborted). * If timed-out or never-ran, record error status. * Return error count. */ { int maxTimeInMilliseconds = 1000 * maxTimeInSeconds; struct genomeHits *pfd; int errCount = 0; int waitTime = 0; while(1) { - sleep1000(50); // milliseconds + sleep1000(50); // milliseconds, give the threads some time to work waitTime += 50; + // check if they are done boolean done = TRUE; pthread_mutex_lock( &pfdMutex ); - if (pfdList || pfdRunning) + if (pfdList || pfdRunning) // lists not empty, stuff to do still done = FALSE; pthread_mutex_unlock( &pfdMutex ); if (done) break; + // check if max allowed time has been exceeded if (waitTime >= maxTimeInMilliseconds) break; } pthread_mutex_lock( &pfdMutex ); pfdNeverStarted = pfdList; pfdList = NULL; // stop the workers from starting any more waiting track loads for (pfd = pfdNeverStarted; pfd; pfd = pfd->next) { // query was never even started char temp[1024]; safef(temp, sizeof temp, "Ran out of time (%d milliseconds) unable to process %s %s", maxTimeInMilliseconds, pfd->genome, pfd->db); pfd->networkErrMsg = cloneString(temp); pfd->error = TRUE; ++errCount; } @@ -578,107 +581,84 @@ printf("<INPUT TYPE=\"hidden\" name=\"isProt\" value=\"on\" />\n"); printf("<TABLE><TR><TD>Custom track name: "); cgiMakeTextVar( "trackName", trackName, 30); printf("</TD></TR>"); printf("<TR><TD> Custom track description: "); cgiMakeTextVar( "trackDescription", trackDescription,50); printf("</TD></TR>"); printf("<TR><TD><INPUT TYPE=SUBMIT NAME=Submit VALUE=\"Build a custom track with these results\"></TD></TR>\n"); printf("</TABLE></FORM></DIV>"); } printf("<DIV STYLE=\"display:block;\"><PRE>"); - // find maximum query name size for padding calculations - int maxQChromNameSize = 0; - for (psl = pslList; psl != NULL; psl = psl->next) - { - int l = strlen(psl->qName); - maxQChromNameSize = max(maxQChromNameSize,l); - } - maxQChromNameSize = max(maxQChromNameSize,5); - + // find maximum query name size for padding calculations and // find maximum target chrom name size for padding calculations + int maxQChromNameSize = 0; int maxTChromNameSize = 0; for (psl = pslList; psl != NULL; psl = psl->next) { - int l = strlen(psl->tName); - maxTChromNameSize = max(maxTChromNameSize,l); + int qLen = strlen(psl->qName); + maxQChromNameSize = max(maxQChromNameSize,qLen); + int tLen = strlen(psl->tName); + maxTChromNameSize = max(maxTChromNameSize,tLen); } + maxQChromNameSize = max(maxQChromNameSize,5); maxTChromNameSize = max(maxTChromNameSize,5); - // header padding - char tempQ[1024]; - char tempT[1024]; - - printf(" ACTIONS QUERY "); - tempQ[0] = 0; - safecatRepeatChar(tempQ, sizeof tempQ, ' ', (maxQChromNameSize - 5)); - printf("%s", tempQ); + + spaceOut(stdout, maxQChromNameSize - 5); printf("SCORE START END QSIZE IDENTITY CHROM "); - tempT[0] = 0; - safecatRepeatChar(tempT, sizeof tempT, ' ', (maxTChromNameSize - 5)); - printf("%s", tempT); + spaceOut(stdout, maxTChromNameSize - 5); printf(" STRAND START END SPAN\n"); printf("---------------------------------------------------------------------------------------------"); - - tempQ[0] = 0; - safecatRepeatChar(tempQ, sizeof tempQ, '-', (maxQChromNameSize - 5)); - printf("%s", tempQ); - - tempT[0] = 0; - safecatRepeatChar(tempT, sizeof tempT, '-', (maxTChromNameSize - 5)); - printf("%s", tempT); + repeatCharOut(stdout, '-', maxQChromNameSize - 5); + repeatCharOut(stdout, '-', maxTChromNameSize - 5); printf("\n"); for (psl = pslList; psl != NULL; psl = psl->next) { if (customText) printf("<A HREF=\"%s?position=%s:%d-%d&db=%s&hgt.customText=%s&%s%s\">", browserUrl, psl->tName, psl->tStart + 1, psl->tEnd, database, customText, uiState, unhideTrack); else printf("<A HREF=\"%s?position=%s:%d-%d&db=%s&ss=%s+%s&%s%s\">", browserUrl, psl->tName, psl->tStart + 1, psl->tEnd, database, pslName, faName, uiState, unhideTrack); printf("browser</A> "); printf("<A HREF=\"%s?o=%d&g=htcUserAli&i=%s+%s+%s&c=%s&l=%d&r=%d&db=%s&%s\">", hgcUrl, psl->tStart, pslName, cgiEncode(faName), psl->qName, psl->tName, psl->tStart, psl->tEnd, database, uiState); printf("details</A> "); - tempQ[0] = 0; - safecpy(tempQ, sizeof tempQ, psl->qName); - int qPadding = maxQChromNameSize - strlen(psl->qName); - safecatRepeatChar(tempQ, sizeof tempQ, ' ', qPadding); - - tempT[0] = 0; - safecpy(tempT, sizeof tempT, psl->tName); - int tPadding = maxTChromNameSize - strlen(psl->tName); - safecatRepeatChar(tempT, sizeof tempT, ' ', tPadding); - - printf("%s %5d %5d %5d %5d %5.1f%% %s %-2s %9d %9d %6d\n", - tempQ, pslScore(psl), psl->qStart+1, psl->qEnd, psl->qSize, - 100.0 - pslCalcMilliBad(psl, TRUE) * 0.1, - tempT, psl->strand, psl->tStart+1, psl->tEnd, + printf("%s",psl->qName); + spaceOut(stdout, maxQChromNameSize - strlen(psl->qName)); + printf(" %5d %5d %5d %5d %5.1f%% ", + pslScore(psl), psl->qStart+1, psl->qEnd, psl->qSize, + 100.0 - pslCalcMilliBad(psl, TRUE) * 0.1); + printf("%s",psl->tName); + spaceOut(stdout, maxTChromNameSize - strlen(psl->tName)); + printf(" %-2s %9d %9d %6d\n", + psl->strand, psl->tStart+1, psl->tEnd, psl->tEnd - psl->tStart); } printf("</PRE>\n"); puts("<P style=\"text-align:right\"><SMALL><A HREF=\"../FAQ/FAQblat.html#blat1b\">Missing a match?</A></SMALL></P>\n"); puts("</DIV>\n"); } pslFreeList(&pslList); } void trimUniq(bioSeq *seqList) /* Check that all seq's in list have a unique name. Try and * abbreviate longer sequence names. */ { struct hash *hash = newHash(0); @@ -902,32 +882,31 @@ int bestHits = 0; int bestTStart = 0; int bestTEnd = 0; int bestExons = 0; char bestTStrand = ' '; char bestQStrand = ' '; char *thisChrom = NULL; int thisHits = 0; int thisTStart = 0; int thisTEnd = 0; int thisExons = 0; char thisTStrand = ' '; char thisQStrand = ' '; -int geneGap = 1000000; // about a million bases is a cutoff for genes. - // TODO should this really be 750K? +int geneGap = ffIntronMaxDefault; // about a million bases is a cutoff for genes. int qSlop = 5; // forgive 5 bases, about dna stepSize = 5. if (gH->complex) qSlop = 4; // reduce for translated with tileSize/stepSize = 4. struct gfResult *gfR = NULL, *gfLast = NULL; for(gfR=gH->gfList; gfR; gfR=gfR->next) { // filter on given queryFrame if (gfR->qFrame != queryFrame) continue; if (gfLast && (gfR->tStrand == gfLast->tStrand) && sameString(gfR->chrom,gfLast->chrom) && (gfR->qStart >= (gfLast->qEnd - qSlop)) && (gfR->tStart >= gfLast->tEnd) && ((gfR->tStart - gfLast->tEnd) < geneGap)) @@ -1038,162 +1017,148 @@ { dyStringPrintf(gH->dbg,"%d matches<br>\n", matchCount); break; } else if (startsWith("Error:", buf)) { errAbort("%s", buf); break; } else { dyStringPrintf(gH->dbg,"%s<br>\n", buf); // chop the line into words int numHits = 0; char *line = buf; - char *word=NULL; - int i=0; struct gfResult *gfR = NULL; AllocVar(gfR); gfR->qStrand = (gH->queryRC ? '-' : '+'); - while ((word = nextWord(&line)) != NULL) + + char *word[10]; + int fields = chopLine(line,word); + int expectedFields = 6; + if (gH->complex) { - if (i == 0) - { // e.g. 3139 - gfR->qStart = sqlUnsigned(word); - } - if (i == 1) - { // e.g. 3220 - gfR->qEnd = sqlUnsigned(word); + if (gH->isProt) + expectedFields = 8; + else + expectedFields = 9; } - if (i == 2) - { // e.g. hg38.2bit:chr1 - char *colon = strchr(word, ':'); + if (fields != expectedFields) + errAbort("expected %d fields, got %d fields", expectedFields, fields); + + gfR->qStart = sqlUnsigned(word[0]); // e.g. 3139 + gfR->qEnd = sqlUnsigned(word[1]); // e.g. 3220 + + // e.g. hg38.2bit:chr1 + char *colon = strchr(word[2], ':'); if (colon) { gfR->chrom = cloneString(colon+1); } else { // e.g. chr10.nib - char *dot = strchr(word, '.'); + char *dot = strchr(word[2], '.'); if (dot) { *dot = 0; - gfR->chrom = cloneString(word); + gfR->chrom = cloneString(word[2]); } else errAbort("Expecting colon or period in the 3rd field of gfServer response"); } - } - if (i == 3) - { // e.g. 173515 - gfR->tStart = sqlUnsigned(word); - } - if (i == 4) - { // e.g. 173586 - gfR->tEnd = sqlUnsigned(word); - } - if (i == 5) - { // e.g. 14 - numHits = sqlUnsigned(word); + + gfR->tStart = sqlUnsigned(word[3]); // e.g. 173515 + gfR->tEnd = sqlUnsigned(word[4]); // e.g. 173586 + + numHits = sqlUnsigned(word[5]); // e.g. 14 // Frustrated with weird little results with vastly exaggerated hit-score, // I have flattened that out so it maxes out at #tiles that could fit // It seems to still work just fine. I am going to leave it for now. // One minor note, by suppressing extra scores for short exons, // it will not prefer alignments with the same number of hits, but more exons. if (!gH->complex) // dna xType { // maximum tiles that could fit in qSpan int limit = ((gfR->qEnd - gfR->qStart) - 6)/5; ++limit; // for a little extra. if (numHits > limit) numHits = limit; } gfR->numHits = numHits; - } + if (gH->complex) { - if (i == 6) - { // e.g. + or - - gfR->tStrand = word[0]; - } - if (i == 7) - { // e.g. 0,1,2 - gfR->tFrame = sqlUnsigned(word); - } + gfR->tStrand = word[6][0]; // e.g. + or - + gfR->tFrame = sqlUnsigned(word[7]); // e.g. 0,1,2 if (!gH->isProt) { - if (i == 8) - { // e.g. 0,1,2 - gfR->qFrame = sqlUnsigned(word); - } + gfR->qFrame = sqlUnsigned(word[8]); // e.g. 0,1,2 } } else { gfR->tStrand = '+'; // dna search only on + target strand } - ++i; - } if (gH->complex) { char *s = netGetLongString(gH->sd); if (s == NULL) break; - dyStringPrintf(gH->dbg,"%s<br>\n", s); //dumps out qstart1 start1 qstart2 tstart2 ... + dyStringPrintf(gH->dbg,"%s<br>\n", s); //dumps out qstart1 tstart1 qstart2 tstart2 ... freeMem(s); } slAddHead(&gH->gfList, gfR); } ++matchCount; } slReverse(&gH->gfList); unTranslateCoordinates(gH); // convert back to untranslated coordinates -slSort(&gH->gfList, slGfResultsCmp); // sort by tStrand, chrom, tStart +slSort(&gH->gfList, gfResultsCmp); // sort by tStrand, chrom, tStart struct qFrameResults /* Information about hits on a genome assembly */ { int maxGeneHits; /* Highest gene hit-count */ char *maxGeneChrom; /* Target Chrom for gene with max gene hits */ int maxGeneTStart; /* Target Start Coordinate for gene with max hits */ int maxGeneTEnd; /* Target End Coordinate for gene with max hits*/ int maxGeneExons; /* Number of Exons in gene with max hits */ char maxGeneTStrand;/* + or - TStrand for gene with max hits */ }; int qFrame = 0; int qFrameStart = 0; int qFrameEnd = 0; if (gH->complex && !gH->isProt) // rnax, dnax { qFrameEnd = 2; } struct qFrameResults r[3]; for(qFrame = qFrameStart; qFrame <= qFrameEnd; ++qFrame) { - findBestGene(gH, qFrame); // find best gene-line thing with most hits + findBestGene(gH, qFrame); // find best gene-like thing with most hits r[qFrame].maxGeneHits = gH->maxGeneHits; r[qFrame].maxGeneChrom = cloneString(gH->maxGeneChrom); r[qFrame].maxGeneTStart = gH->maxGeneTStart; r[qFrame].maxGeneTEnd = gH->maxGeneTEnd; r[qFrame].maxGeneExons = gH->maxGeneExons; r[qFrame].maxGeneTStrand = gH->maxGeneTStrand; } // combine the results from all 3 qFrames if (gH->complex && !gH->isProt) // rnax, dnax { int biggestHit = -1; @@ -1954,39 +1919,39 @@ } if (ptMax > 0) { /* wait for remote parallel load to finish */ remoteParallelLoadWait(atoi(cfgOptionDefault("parallelFetch.timeout", "90"))); // wait up to default 90 seconds. } // Should continue with pfdDone since threads could still be running that might access pdfList ? // Hide weaker of RC'd query pairs, if not debugging. // Combine pairs with a query and its RC. if (!(debuggingGfResults) && (sameString(pfdDone->xType,"dna") || sameString(pfdDone->xType,"dnax"))) { - slSort(&pfdDone, slRcPairsCmp); + slSort(&pfdDone, rcPairsCmp); hideWeakerOfQueryRcPairs(pfdDone); } // requires db for chromSize, do database after multi-threading done. changeMaxGenePositionToPositiveStrandCoords(pfdDone); // sort by maximum hits - slSort(&pfdDone, slHitsCmp); + slSort(&pfdDone, genomeHitsCmp); // Print instructions printf("Click the link in the Assembly column to see the full BLAT output for that Genome.<br><br>\n"); // Print report // TODO move to final report at the end of ALL Assemblies int lastSeqNumber = -1; int idCount = 0; char id[256]; struct genomeHits *gH = NULL; for (gH = pfdDone; gH; gH = gH->next) { if (lastSeqNumber != gH->seqNumber) { if (lastSeqNumber != -1) // end previous table {