386915fb7bdbae22f8ed152174d8b8635524c1b2 galt Sun Oct 28 00:51:33 2018 -0700 code review fixes to hgBlat Search ALL. refs #22251 diff --git src/hg/hgBlat/hgBlat.c src/hg/hgBlat/hgBlat.c index 2e374c6..3bbafdd 100644 --- src/hg/hgBlat/hgBlat.c +++ src/hg/hgBlat/hgBlat.c @@ -20,30 +20,31 @@ #include "cart.h" #include "dbDb.h" #include "blatServers.h" #include "web.h" #include "hash.h" #include "botDelay.h" #include "trashDir.h" #include "trackHub.h" #include "hgConfig.h" #include "errCatch.h" #include "portable.h" #include "portable.h" #include "dystring.h" #include "chromInfo.h" #include "net.h" +#include "fuzzyFind.h" struct cart *cart; /* The user's ui state. */ struct hash *oldVars = NULL; boolean orgChange = FALSE; boolean dbChange = FALSE; boolean allGenomes = FALSE; struct gfResult /* Detailed gfServer results, this is a span of several nearby tiles, minimum 2 for dna. */ { struct gfResult *next; /* have to multiply translated coordinates by 3 */ int qStart; /* Query Start Coordinate */ @@ -83,74 +84,74 @@ int maxGeneTStart; /* Target Start Coordinate for gene with max hits */ int maxGeneTEnd; /* Target End Coordinate for gene with max hits*/ int maxGeneExons; /* Number of Exons in gene with max hits */ char maxGeneStrand[3]; /* + or - or ++ +- -+ -- Strand for gene with max hits */ char maxGeneTStrand;/* + or - TStrand for gene with max hits */ boolean done; /* Did the job get to finish */ boolean error; /* Some error happened */ char *networkErrMsg; /* Network layer error message */ struct dyString *dbg; /* Output debugging info */ struct gfResult *gfList; /* List of gfResult records */ boolean hide; /* To not show both strands, suppress the weaker-scoring one */ }; boolean debuggingGfResults = FALSE; //TRUE; -int slGfResultsCmp(const void *va, const void *vb) +int gfResultsCmp(const void *va, const void *vb) /* Compare two gfResults. */ { const struct gfResult *a = *((struct gfResult **)va); const struct gfResult *b = *((struct gfResult **)vb); int result = a->tStrand - b->tStrand; if (result == 0) { result = strcmp(a->chrom, b->chrom); if (result == 0) { return (a->tStart - b->tStart); } else return result; } else return result; } -int slRcPairsCmp(const void *va, const void *vb) +int rcPairsCmp(const void *va, const void *vb) /* Recombine Reverse-complimented Pairs (to hide the weaker one). */ { const struct genomeHits *a = *((struct genomeHits **)va); const struct genomeHits *b = *((struct genomeHits **)vb); // int result = strcmp(a->faName, b->faName); order by faName alphabetical int result = a->seqNumber - b->seqNumber; if (result == 0) { result = strcmp(a->db, b->db); if (result == 0) { return (a->queryRC - b->queryRC); } else return result; } else return result; } -int slHitsCmp(const void *va, const void *vb) -/* Compare two slHits. */ +int genomeHitsCmp(const void *va, const void *vb) +/* Compare two genomeHits. */ { const struct genomeHits *a = *((struct genomeHits **)va); const struct genomeHits *b = *((struct genomeHits **)vb); // int result = strcmp(a->faName, b->faName); order by faName alphabetical int result = a->seqNumber - b->seqNumber; if (result == 0) { if (a->error && b->error) return 0; else if (b->error && !a->error) return -1; else if (!b->error && a->error) return 1; else { @@ -223,39 +224,41 @@ } } static int remoteParallelLoadWait(int maxTimeInSeconds) /* Wait, checking to see if finished (completed or errAborted). * If timed-out or never-ran, record error status. * Return error count. */ { int maxTimeInMilliseconds = 1000 * maxTimeInSeconds; struct genomeHits *pfd; int errCount = 0; int waitTime = 0; while(1) { - sleep1000(50); // milliseconds + sleep1000(50); // milliseconds, give the threads some time to work waitTime += 50; + // check if they are done boolean done = TRUE; pthread_mutex_lock( &pfdMutex ); - if (pfdList || pfdRunning) + if (pfdList || pfdRunning) // lists not empty, stuff to do still done = FALSE; pthread_mutex_unlock( &pfdMutex ); if (done) break; + // check if max allowed time has been exceeded if (waitTime >= maxTimeInMilliseconds) break; } pthread_mutex_lock( &pfdMutex ); pfdNeverStarted = pfdList; pfdList = NULL; // stop the workers from starting any more waiting track loads for (pfd = pfdNeverStarted; pfd; pfd = pfd->next) { // query was never even started char temp[1024]; safef(temp, sizeof temp, "Ran out of time (%d milliseconds) unable to process %s %s", maxTimeInMilliseconds, pfd->genome, pfd->db); pfd->networkErrMsg = cloneString(temp); pfd->error = TRUE; ++errCount; } @@ -578,107 +581,84 @@ printf("\n"); printf(""); printf(""); printf("\n"); printf("
Custom track name: "); cgiMakeTextVar( "trackName", trackName, 30); printf("
Custom track description: "); cgiMakeTextVar( "trackDescription", trackDescription,50); printf("
"); } printf("
");
 
-    // find maximum query name size for padding calculations
-    int maxQChromNameSize = 0;
-    for (psl = pslList; psl != NULL; psl = psl->next)
-	{
-	int l = strlen(psl->qName);
-	maxQChromNameSize = max(maxQChromNameSize,l);
-	}
-    maxQChromNameSize = max(maxQChromNameSize,5);
-
+    // find maximum query name size for padding calculations and
     // find maximum target chrom name size for padding calculations
+    int maxQChromNameSize = 0;
     int maxTChromNameSize = 0;
     for (psl = pslList; psl != NULL; psl = psl->next)
 	{
-	int l = strlen(psl->tName);
-	maxTChromNameSize = max(maxTChromNameSize,l);
+	int qLen = strlen(psl->qName);
+	maxQChromNameSize = max(maxQChromNameSize,qLen);
+	int tLen = strlen(psl->tName);
+	maxTChromNameSize = max(maxTChromNameSize,tLen);
 	}
+    maxQChromNameSize = max(maxQChromNameSize,5);
     maxTChromNameSize = max(maxTChromNameSize,5);
 
-    // header padding
-    char tempQ[1024];
-    char tempT[1024];
-
-
     printf("   ACTIONS      QUERY ");
-    tempQ[0] = 0;
-    safecatRepeatChar(tempQ, sizeof tempQ, ' ', (maxQChromNameSize - 5));
-    printf("%s", tempQ);
+    
+    spaceOut(stdout, maxQChromNameSize - 5);
 
     printf("SCORE START   END QSIZE IDENTITY  CHROM ");
-    tempT[0] = 0;
-    safecatRepeatChar(tempT, sizeof tempT, ' ', (maxTChromNameSize - 5));
-    printf("%s", tempT);
+    spaceOut(stdout, maxTChromNameSize - 5);
 
     printf(" STRAND  START       END   SPAN\n");
 
     printf("---------------------------------------------------------------------------------------------");
-
-    tempQ[0] = 0;
-    safecatRepeatChar(tempQ, sizeof tempQ, '-', (maxQChromNameSize - 5));
-    printf("%s", tempQ);
-
-    tempT[0] = 0;
-    safecatRepeatChar(tempT, sizeof tempT, '-', (maxTChromNameSize - 5));
-    printf("%s", tempT);
+    repeatCharOut(stdout, '-', maxQChromNameSize - 5);
+    repeatCharOut(stdout, '-', maxTChromNameSize - 5);
 
     printf("\n");
 
     for (psl = pslList; psl != NULL; psl = psl->next)
 	{
         if (customText)
             printf("",
                 browserUrl, psl->tName, psl->tStart + 1, psl->tEnd, database, 
                 customText, uiState, unhideTrack);
         else
             printf("",
                 browserUrl, psl->tName, psl->tStart + 1, psl->tEnd, database, 
                 pslName, faName, uiState, unhideTrack);
 	printf("browser ");
 	printf("", 
 	    hgcUrl, psl->tStart, pslName, cgiEncode(faName), psl->qName,  psl->tName,
 	    psl->tStart, psl->tEnd, database, uiState);
 	printf("details ");
 
-	tempQ[0] = 0;
-	safecpy(tempQ, sizeof tempQ, psl->qName);
-	int qPadding = maxQChromNameSize - strlen(psl->qName);
-	safecatRepeatChar(tempQ, sizeof tempQ, ' ', qPadding);
-
-	tempT[0] = 0;
-	safecpy(tempT, sizeof tempT, psl->tName);
-	int tPadding = maxTChromNameSize - strlen(psl->tName);
-	safecatRepeatChar(tempT, sizeof tempT, ' ', tPadding);
-
-	printf("%s %5d %5d %5d %5d   %5.1f%%  %s  %-2s  %9d %9d %6d\n",
-	    tempQ, pslScore(psl), psl->qStart+1, psl->qEnd, psl->qSize,
-	    100.0 - pslCalcMilliBad(psl, TRUE) * 0.1,
-	    tempT, psl->strand, psl->tStart+1, psl->tEnd,
+	printf("%s",psl->qName);
+	spaceOut(stdout, maxQChromNameSize - strlen(psl->qName));
+	printf(" %5d %5d %5d %5d   %5.1f%%  ",
+	    pslScore(psl), psl->qStart+1, psl->qEnd, psl->qSize,
+	    100.0 - pslCalcMilliBad(psl, TRUE) * 0.1);
+	printf("%s",psl->tName);
+	spaceOut(stdout, maxTChromNameSize - strlen(psl->tName));
+	printf("  %-2s  %9d %9d %6d\n",
+	    psl->strand, psl->tStart+1, psl->tEnd,
 	    psl->tEnd - psl->tStart);
 	}
     printf("
\n"); puts("

Missing a match?

\n"); puts("
\n"); } pslFreeList(&pslList); } void trimUniq(bioSeq *seqList) /* Check that all seq's in list have a unique name. Try and * abbreviate longer sequence names. */ { struct hash *hash = newHash(0); @@ -902,32 +882,31 @@ int bestHits = 0; int bestTStart = 0; int bestTEnd = 0; int bestExons = 0; char bestTStrand = ' '; char bestQStrand = ' '; char *thisChrom = NULL; int thisHits = 0; int thisTStart = 0; int thisTEnd = 0; int thisExons = 0; char thisTStrand = ' '; char thisQStrand = ' '; -int geneGap = 1000000; // about a million bases is a cutoff for genes. - // TODO should this really be 750K? +int geneGap = ffIntronMaxDefault; // about a million bases is a cutoff for genes. int qSlop = 5; // forgive 5 bases, about dna stepSize = 5. if (gH->complex) qSlop = 4; // reduce for translated with tileSize/stepSize = 4. struct gfResult *gfR = NULL, *gfLast = NULL; for(gfR=gH->gfList; gfR; gfR=gfR->next) { // filter on given queryFrame if (gfR->qFrame != queryFrame) continue; if (gfLast && (gfR->tStrand == gfLast->tStrand) && sameString(gfR->chrom,gfLast->chrom) && (gfR->qStart >= (gfLast->qEnd - qSlop)) && (gfR->tStart >= gfLast->tEnd) && ((gfR->tStart - gfLast->tEnd) < geneGap)) @@ -1038,162 +1017,148 @@ { dyStringPrintf(gH->dbg,"%d matches
\n", matchCount); break; } else if (startsWith("Error:", buf)) { errAbort("%s", buf); break; } else { dyStringPrintf(gH->dbg,"%s
\n", buf); // chop the line into words int numHits = 0; char *line = buf; - char *word=NULL; - int i=0; struct gfResult *gfR = NULL; AllocVar(gfR); gfR->qStrand = (gH->queryRC ? '-' : '+'); - while ((word = nextWord(&line)) != NULL) + + char *word[10]; + int fields = chopLine(line,word); + int expectedFields = 6; + if (gH->complex) { - if (i == 0) - { // e.g. 3139 - gfR->qStart = sqlUnsigned(word); - } - if (i == 1) - { // e.g. 3220 - gfR->qEnd = sqlUnsigned(word); + if (gH->isProt) + expectedFields = 8; + else + expectedFields = 9; } - if (i == 2) - { // e.g. hg38.2bit:chr1 - char *colon = strchr(word, ':'); + if (fields != expectedFields) + errAbort("expected %d fields, got %d fields", expectedFields, fields); + + gfR->qStart = sqlUnsigned(word[0]); // e.g. 3139 + gfR->qEnd = sqlUnsigned(word[1]); // e.g. 3220 + + // e.g. hg38.2bit:chr1 + char *colon = strchr(word[2], ':'); if (colon) { gfR->chrom = cloneString(colon+1); } else { // e.g. chr10.nib - char *dot = strchr(word, '.'); + char *dot = strchr(word[2], '.'); if (dot) { *dot = 0; - gfR->chrom = cloneString(word); + gfR->chrom = cloneString(word[2]); } else errAbort("Expecting colon or period in the 3rd field of gfServer response"); } - } - if (i == 3) - { // e.g. 173515 - gfR->tStart = sqlUnsigned(word); - } - if (i == 4) - { // e.g. 173586 - gfR->tEnd = sqlUnsigned(word); - } - if (i == 5) - { // e.g. 14 - numHits = sqlUnsigned(word); + + gfR->tStart = sqlUnsigned(word[3]); // e.g. 173515 + gfR->tEnd = sqlUnsigned(word[4]); // e.g. 173586 + + numHits = sqlUnsigned(word[5]); // e.g. 14 // Frustrated with weird little results with vastly exaggerated hit-score, // I have flattened that out so it maxes out at #tiles that could fit // It seems to still work just fine. I am going to leave it for now. // One minor note, by suppressing extra scores for short exons, // it will not prefer alignments with the same number of hits, but more exons. if (!gH->complex) // dna xType { // maximum tiles that could fit in qSpan int limit = ((gfR->qEnd - gfR->qStart) - 6)/5; ++limit; // for a little extra. if (numHits > limit) numHits = limit; } gfR->numHits = numHits; - } + if (gH->complex) { - if (i == 6) - { // e.g. + or - - gfR->tStrand = word[0]; - } - if (i == 7) - { // e.g. 0,1,2 - gfR->tFrame = sqlUnsigned(word); - } + gfR->tStrand = word[6][0]; // e.g. + or - + gfR->tFrame = sqlUnsigned(word[7]); // e.g. 0,1,2 if (!gH->isProt) { - if (i == 8) - { // e.g. 0,1,2 - gfR->qFrame = sqlUnsigned(word); - } + gfR->qFrame = sqlUnsigned(word[8]); // e.g. 0,1,2 } } else { gfR->tStrand = '+'; // dna search only on + target strand } - ++i; - } if (gH->complex) { char *s = netGetLongString(gH->sd); if (s == NULL) break; - dyStringPrintf(gH->dbg,"%s
\n", s); //dumps out qstart1 start1 qstart2 tstart2 ... + dyStringPrintf(gH->dbg,"%s
\n", s); //dumps out qstart1 tstart1 qstart2 tstart2 ... freeMem(s); } slAddHead(&gH->gfList, gfR); } ++matchCount; } slReverse(&gH->gfList); unTranslateCoordinates(gH); // convert back to untranslated coordinates -slSort(&gH->gfList, slGfResultsCmp); // sort by tStrand, chrom, tStart +slSort(&gH->gfList, gfResultsCmp); // sort by tStrand, chrom, tStart struct qFrameResults /* Information about hits on a genome assembly */ { int maxGeneHits; /* Highest gene hit-count */ char *maxGeneChrom; /* Target Chrom for gene with max gene hits */ int maxGeneTStart; /* Target Start Coordinate for gene with max hits */ int maxGeneTEnd; /* Target End Coordinate for gene with max hits*/ int maxGeneExons; /* Number of Exons in gene with max hits */ char maxGeneTStrand;/* + or - TStrand for gene with max hits */ }; int qFrame = 0; int qFrameStart = 0; int qFrameEnd = 0; if (gH->complex && !gH->isProt) // rnax, dnax { qFrameEnd = 2; } struct qFrameResults r[3]; for(qFrame = qFrameStart; qFrame <= qFrameEnd; ++qFrame) { - findBestGene(gH, qFrame); // find best gene-line thing with most hits + findBestGene(gH, qFrame); // find best gene-like thing with most hits r[qFrame].maxGeneHits = gH->maxGeneHits; r[qFrame].maxGeneChrom = cloneString(gH->maxGeneChrom); r[qFrame].maxGeneTStart = gH->maxGeneTStart; r[qFrame].maxGeneTEnd = gH->maxGeneTEnd; r[qFrame].maxGeneExons = gH->maxGeneExons; r[qFrame].maxGeneTStrand = gH->maxGeneTStrand; } // combine the results from all 3 qFrames if (gH->complex && !gH->isProt) // rnax, dnax { int biggestHit = -1; @@ -1954,39 +1919,39 @@ } if (ptMax > 0) { /* wait for remote parallel load to finish */ remoteParallelLoadWait(atoi(cfgOptionDefault("parallelFetch.timeout", "90"))); // wait up to default 90 seconds. } // Should continue with pfdDone since threads could still be running that might access pdfList ? // Hide weaker of RC'd query pairs, if not debugging. // Combine pairs with a query and its RC. if (!(debuggingGfResults) && (sameString(pfdDone->xType,"dna") || sameString(pfdDone->xType,"dnax"))) { - slSort(&pfdDone, slRcPairsCmp); + slSort(&pfdDone, rcPairsCmp); hideWeakerOfQueryRcPairs(pfdDone); } // requires db for chromSize, do database after multi-threading done. changeMaxGenePositionToPositiveStrandCoords(pfdDone); // sort by maximum hits - slSort(&pfdDone, slHitsCmp); + slSort(&pfdDone, genomeHitsCmp); // Print instructions printf("Click the link in the Assembly column to see the full BLAT output for that Genome.

\n"); // Print report // TODO move to final report at the end of ALL Assemblies int lastSeqNumber = -1; int idCount = 0; char id[256]; struct genomeHits *gH = NULL; for (gH = pfdDone; gH; gH = gH->next) { if (lastSeqNumber != gH->seqNumber) { if (lastSeqNumber != -1) // end previous table {