386915fb7bdbae22f8ed152174d8b8635524c1b2
galt
  Sun Oct 28 00:51:33 2018 -0700
code review fixes to hgBlat Search ALL. refs #22251

diff --git src/hg/hgBlat/hgBlat.c src/hg/hgBlat/hgBlat.c
index 2e374c6..3bbafdd 100644
--- src/hg/hgBlat/hgBlat.c
+++ src/hg/hgBlat/hgBlat.c
@@ -20,30 +20,31 @@
 #include "cart.h"
 #include "dbDb.h"
 #include "blatServers.h"
 #include "web.h"
 #include "hash.h"
 #include "botDelay.h"
 #include "trashDir.h"
 #include "trackHub.h"
 #include "hgConfig.h"
 #include "errCatch.h"
 #include "portable.h"
 #include "portable.h"
 #include "dystring.h"
 #include "chromInfo.h"
 #include "net.h"
+#include "fuzzyFind.h"
 
 
 struct cart *cart;	/* The user's ui state. */
 struct hash *oldVars = NULL;
 boolean orgChange = FALSE;
 boolean dbChange = FALSE;
 boolean allGenomes = FALSE;
 
 
 struct gfResult
 /* Detailed gfServer results, this is a span of several nearby tiles, minimum 2 for dna. */
     {
     struct gfResult *next;
     /* have to multiply translated coordinates by 3 */
     int qStart;    /* Query Start Coordinate */  
@@ -83,74 +84,74 @@
     int maxGeneTStart;  /* Target Start Coordinate for gene with max hits */  
     int maxGeneTEnd;    /* Target End Coordinate for gene with max hits*/
     int maxGeneExons;   /* Number of Exons in gene with max hits */
     char maxGeneStrand[3]; /* + or - or ++ +- -+ -- Strand for gene with max hits */ 
     char maxGeneTStrand;/* + or - TStrand for gene with max hits */ 
     boolean done;       /* Did the job get to finish */
     boolean error;      /* Some error happened */
     char *networkErrMsg;      /* Network layer error message */
     struct dyString *dbg;     /* Output debugging info */
     struct gfResult *gfList;  /* List of gfResult records */
     boolean hide;      /* To not show both strands, suppress the weaker-scoring one */
     };
 
 boolean debuggingGfResults = FALSE; //TRUE;
 
-int slGfResultsCmp(const void *va, const void *vb)
+int gfResultsCmp(const void *va, const void *vb)
 /* Compare two gfResults. */
 {
 const struct gfResult *a = *((struct gfResult **)va);
 const struct gfResult *b = *((struct gfResult **)vb);
 int result = a->tStrand - b->tStrand;
 if (result == 0)
     {
     result = strcmp(a->chrom, b->chrom);
     if (result == 0)
 	{
 	return (a->tStart - b->tStart);
 	}
     else
 	return result;
     }
 else
     return result;
 }
 
-int slRcPairsCmp(const void *va, const void *vb)
+int rcPairsCmp(const void *va, const void *vb)
 /* Recombine Reverse-complimented Pairs (to hide the weaker one). */
 {
 const struct genomeHits *a = *((struct genomeHits **)va);
 const struct genomeHits *b = *((struct genomeHits **)vb);
 // int result = strcmp(a->faName, b->faName); order by faName alphabetical
 int result = a->seqNumber - b->seqNumber;
 if (result == 0)
     {
     result = strcmp(a->db, b->db);
     if (result == 0)
 	{
 	return (a->queryRC - b->queryRC); 
 	}
     else
 	return result;
     }
 else
     return result;
 }
 
 
-int slHitsCmp(const void *va, const void *vb)
-/* Compare two slHits. */
+int genomeHitsCmp(const void *va, const void *vb)
+/* Compare two genomeHits. */
 {
 const struct genomeHits *a = *((struct genomeHits **)va);
 const struct genomeHits *b = *((struct genomeHits **)vb);
 // int result = strcmp(a->faName, b->faName); order by faName alphabetical
 int result = a->seqNumber - b->seqNumber;
 if (result == 0)
     {
     if (a->error && b->error)
 	return 0;
     else if (b->error && !a->error)
 	return -1;
     else if (!b->error && a->error)
 	return 1;
     else
 	{
@@ -223,39 +224,41 @@
 
     }
 }
 
 static int remoteParallelLoadWait(int maxTimeInSeconds)
 /* Wait, checking to see if finished (completed or errAborted).
  * If timed-out or never-ran, record error status.
  * Return error count. */
 {
 int maxTimeInMilliseconds = 1000 * maxTimeInSeconds;
 struct genomeHits *pfd;
 int errCount = 0;
 int waitTime = 0;
 while(1)
     {
-    sleep1000(50); // milliseconds
+    sleep1000(50); // milliseconds, give the threads some time to work
     waitTime += 50;
+    // check if they are done
     boolean done = TRUE;
     pthread_mutex_lock( &pfdMutex );
-    if (pfdList || pfdRunning)
+    if (pfdList || pfdRunning)  // lists not empty, stuff to do still
 	done = FALSE;
     pthread_mutex_unlock( &pfdMutex );
     if (done)
         break;
+    // check if max allowed time has been exceeded
     if (waitTime >= maxTimeInMilliseconds)
         break;
     }
 pthread_mutex_lock( &pfdMutex );
 pfdNeverStarted = pfdList;
 pfdList = NULL;  // stop the workers from starting any more waiting track loads
 for (pfd = pfdNeverStarted; pfd; pfd = pfd->next)
     {
     // query was never even started
     char temp[1024];
     safef(temp, sizeof temp, "Ran out of time (%d milliseconds) unable to process %s %s", maxTimeInMilliseconds, pfd->genome, pfd->db);
     pfd->networkErrMsg = cloneString(temp);
     pfd->error = TRUE;
     ++errCount;
     }
@@ -578,107 +581,84 @@
             printf("<INPUT TYPE=\"hidden\" name=\"isProt\" value=\"on\" />\n");
 
         printf("<TABLE><TR><TD>Custom track name: ");
         cgiMakeTextVar( "trackName", trackName, 30);
         printf("</TD></TR>");
 
         printf("<TR><TD> Custom track description: ");
         cgiMakeTextVar( "trackDescription", trackDescription,50);
         printf("</TD></TR>");
         printf("<TR><TD><INPUT TYPE=SUBMIT NAME=Submit VALUE=\"Build a custom track with these results\"></TD></TR>\n");
         printf("</TABLE></FORM></DIV>");
         }
 
     printf("<DIV STYLE=\"display:block;\"><PRE>");
 
-    // find maximum query name size for padding calculations
-    int maxQChromNameSize = 0;
-    for (psl = pslList; psl != NULL; psl = psl->next)
-	{
-	int l = strlen(psl->qName);
-	maxQChromNameSize = max(maxQChromNameSize,l);
-	}
-    maxQChromNameSize = max(maxQChromNameSize,5);
-
+    // find maximum query name size for padding calculations and
     // find maximum target chrom name size for padding calculations
+    int maxQChromNameSize = 0;
     int maxTChromNameSize = 0;
     for (psl = pslList; psl != NULL; psl = psl->next)
 	{
-	int l = strlen(psl->tName);
-	maxTChromNameSize = max(maxTChromNameSize,l);
+	int qLen = strlen(psl->qName);
+	maxQChromNameSize = max(maxQChromNameSize,qLen);
+	int tLen = strlen(psl->tName);
+	maxTChromNameSize = max(maxTChromNameSize,tLen);
 	}
+    maxQChromNameSize = max(maxQChromNameSize,5);
     maxTChromNameSize = max(maxTChromNameSize,5);
 
-    // header padding
-    char tempQ[1024];
-    char tempT[1024];
-
-
     printf("   ACTIONS      QUERY ");
-    tempQ[0] = 0;
-    safecatRepeatChar(tempQ, sizeof tempQ, ' ', (maxQChromNameSize - 5));
-    printf("%s", tempQ);
+    
+    spaceOut(stdout, maxQChromNameSize - 5);
 
     printf("SCORE START   END QSIZE IDENTITY  CHROM ");
-    tempT[0] = 0;
-    safecatRepeatChar(tempT, sizeof tempT, ' ', (maxTChromNameSize - 5));
-    printf("%s", tempT);
+    spaceOut(stdout, maxTChromNameSize - 5);
 
     printf(" STRAND  START       END   SPAN\n");
 
     printf("---------------------------------------------------------------------------------------------");
-
-    tempQ[0] = 0;
-    safecatRepeatChar(tempQ, sizeof tempQ, '-', (maxQChromNameSize - 5));
-    printf("%s", tempQ);
-
-    tempT[0] = 0;
-    safecatRepeatChar(tempT, sizeof tempT, '-', (maxTChromNameSize - 5));
-    printf("%s", tempT);
+    repeatCharOut(stdout, '-', maxQChromNameSize - 5);
+    repeatCharOut(stdout, '-', maxTChromNameSize - 5);
 
     printf("\n");
 
     for (psl = pslList; psl != NULL; psl = psl->next)
 	{
         if (customText)
             printf("<A HREF=\"%s?position=%s:%d-%d&db=%s&hgt.customText=%s&%s%s\">",
                 browserUrl, psl->tName, psl->tStart + 1, psl->tEnd, database, 
                 customText, uiState, unhideTrack);
         else
             printf("<A HREF=\"%s?position=%s:%d-%d&db=%s&ss=%s+%s&%s%s\">",
                 browserUrl, psl->tName, psl->tStart + 1, psl->tEnd, database, 
                 pslName, faName, uiState, unhideTrack);
 	printf("browser</A> ");
 	printf("<A HREF=\"%s?o=%d&g=htcUserAli&i=%s+%s+%s&c=%s&l=%d&r=%d&db=%s&%s\">", 
 	    hgcUrl, psl->tStart, pslName, cgiEncode(faName), psl->qName,  psl->tName,
 	    psl->tStart, psl->tEnd, database, uiState);
 	printf("details</A> ");
 
-	tempQ[0] = 0;
-	safecpy(tempQ, sizeof tempQ, psl->qName);
-	int qPadding = maxQChromNameSize - strlen(psl->qName);
-	safecatRepeatChar(tempQ, sizeof tempQ, ' ', qPadding);
-
-	tempT[0] = 0;
-	safecpy(tempT, sizeof tempT, psl->tName);
-	int tPadding = maxTChromNameSize - strlen(psl->tName);
-	safecatRepeatChar(tempT, sizeof tempT, ' ', tPadding);
-
-	printf("%s %5d %5d %5d %5d   %5.1f%%  %s  %-2s  %9d %9d %6d\n",
-	    tempQ, pslScore(psl), psl->qStart+1, psl->qEnd, psl->qSize,
-	    100.0 - pslCalcMilliBad(psl, TRUE) * 0.1,
-	    tempT, psl->strand, psl->tStart+1, psl->tEnd,
+	printf("%s",psl->qName);
+	spaceOut(stdout, maxQChromNameSize - strlen(psl->qName));
+	printf(" %5d %5d %5d %5d   %5.1f%%  ",
+	    pslScore(psl), psl->qStart+1, psl->qEnd, psl->qSize,
+	    100.0 - pslCalcMilliBad(psl, TRUE) * 0.1);
+	printf("%s",psl->tName);
+	spaceOut(stdout, maxTChromNameSize - strlen(psl->tName));
+	printf("  %-2s  %9d %9d %6d\n",
+	    psl->strand, psl->tStart+1, psl->tEnd,
 	    psl->tEnd - psl->tStart);
 	}
     printf("</PRE>\n");
     puts("<P style=\"text-align:right\"><SMALL><A HREF=\"../FAQ/FAQblat.html#blat1b\">Missing a match?</A></SMALL></P>\n");
     puts("</DIV>\n");
     }
 pslFreeList(&pslList);
 
 }
 
 void trimUniq(bioSeq *seqList)
 /* Check that all seq's in list have a unique name.  Try and
  * abbreviate longer sequence names. */
 {
 struct hash *hash = newHash(0);
@@ -902,32 +882,31 @@
 int bestHits   = 0;
 int bestTStart = 0;
 int bestTEnd   = 0;
 int bestExons  = 0;
 char bestTStrand = ' ';
 char bestQStrand = ' ';
 
 char *thisChrom = NULL;
 int thisHits   = 0;
 int thisTStart = 0;
 int thisTEnd   = 0;
 int thisExons  = 0;
 char thisTStrand = ' ';
 char thisQStrand = ' ';
 
-int geneGap = 1000000;  // about a million bases is a cutoff for genes.
-    // TODO should this really be 750K?
+int geneGap = ffIntronMaxDefault;  // about a million bases is a cutoff for genes.
 
 int qSlop = 5; // forgive 5 bases, about dna stepSize = 5.
 if (gH->complex)
     qSlop = 4; // reduce for translated with tileSize/stepSize = 4.
 
 struct gfResult *gfR = NULL, *gfLast = NULL;
 for(gfR=gH->gfList; gfR; gfR=gfR->next)
     {
     // filter on given queryFrame
     if (gfR->qFrame != queryFrame)
 	continue;
 
     if (gfLast && (gfR->tStrand == gfLast->tStrand) && sameString(gfR->chrom,gfLast->chrom) && 
 
 	(gfR->qStart >= (gfLast->qEnd - qSlop)) && (gfR->tStart >= gfLast->tEnd) && ((gfR->tStart - gfLast->tEnd) < geneGap))
@@ -1038,162 +1017,148 @@
         {
         dyStringPrintf(gH->dbg,"%d matches<br>\n", matchCount);
         break;
         }
     else if (startsWith("Error:", buf))
        {
        errAbort("%s", buf);
        break;
        }
     else
         {
         dyStringPrintf(gH->dbg,"%s<br>\n", buf);
 	// chop the line into words
 	int numHits = 0;
 	char *line = buf;
-	char *word=NULL;
-	int i=0;
 	struct gfResult *gfR = NULL;
 	AllocVar(gfR);
 	gfR->qStrand = (gH->queryRC ? '-' : '+');
-	while ((word = nextWord(&line)) != NULL)
+
+	char *word[10];
+	int fields = chopLine(line,word);
+	int expectedFields = 6;
+	if (gH->complex)
 	    {
-	    if (i == 0)
-		{  // e.g. 3139
-		gfR->qStart = sqlUnsigned(word);
-		}
-	    if (i == 1)
-		{  // e.g. 3220
-		gfR->qEnd = sqlUnsigned(word);
+	    if (gH->isProt)
+		expectedFields = 8;
+	    else
+		expectedFields = 9;
 	    }
-	    if (i == 2)
-		{  // e.g. hg38.2bit:chr1
-		char *colon = strchr(word, ':');
+	if (fields != expectedFields)
+	    errAbort("expected %d fields, got %d fields", expectedFields, fields);
+
+	gfR->qStart = sqlUnsigned(word[0]); // e.g. 3139
+	gfR->qEnd   = sqlUnsigned(word[1]); // e.g. 3220
+
+	// e.g. hg38.2bit:chr1
+	char *colon = strchr(word[2], ':');
 	if (colon)
 	    {
 	    gfR->chrom = cloneString(colon+1);
 	    }
 	else
 	    { // e.g. chr10.nib
-		    char *dot = strchr(word, '.');
+	    char *dot = strchr(word[2], '.');
 	    if (dot)
 		{
 		*dot = 0;
-			gfR->chrom = cloneString(word);
+		gfR->chrom = cloneString(word[2]);
 		}
 	    else
 		errAbort("Expecting colon or period in the 3rd field of gfServer response");
 	    }
-		}
-	    if (i == 3)
-		{  // e.g. 173515
-		gfR->tStart = sqlUnsigned(word);
-		}
-	    if (i == 4)
-		{  // e.g. 173586
-		gfR->tEnd = sqlUnsigned(word);
-		}
-	    if (i == 5)
-		{  // e.g. 14
-		numHits = sqlUnsigned(word);
+
+	gfR->tStart = sqlUnsigned(word[3]); // e.g. 173515
+	gfR->tEnd   = sqlUnsigned(word[4]); // e.g. 173586
+
+	numHits     = sqlUnsigned(word[5]); // e.g. 14
 
 	// Frustrated with weird little results with vastly exaggerated hit-score,
 	// I have flattened that out so it maxes out at #tiles that could fit 
 	// It seems to still work just fine. I am going to leave it for now.
 	// One minor note, by suppressing extra scores for short exons,
 	// it will not prefer alignments with the same number of hits, but more exons.
 	if (!gH->complex) // dna xType
 	    {
 	    // maximum tiles that could fit in qSpan
 	    int limit = ((gfR->qEnd - gfR->qStart) - 6)/5; 
 	    ++limit;  // for a little extra.
 	    if (numHits > limit)
 		numHits = limit;
 	    }
 
 	gfR->numHits = numHits;
-		}
+
 	if (gH->complex)
 	    {
-		if (i == 6)
-		    {  // e.g. + or -
-		    gfR->tStrand = word[0];
-		    }
-		if (i == 7)
-		    {  // e.g. 0,1,2
-		    gfR->tFrame = sqlUnsigned(word);
-		    }
+	    gfR->tStrand = word[6][0];          // e.g. + or -
+	    gfR->tFrame = sqlUnsigned(word[7]); // e.g. 0,1,2
 	    if (!gH->isProt)
 		{
-		    if (i == 8)
-			{  // e.g. 0,1,2
-			gfR->qFrame = sqlUnsigned(word);
-			}
+		gfR->qFrame = sqlUnsigned(word[8]); // e.g. 0,1,2
 		}
 	    }
 	else
 	    {
 	    gfR->tStrand = '+';  // dna search only on + target strand
 	    }
-	    ++i;
-	    }
 	
         if (gH->complex)
             {
             char *s = netGetLongString(gH->sd);
             if (s == NULL)
                 break;
-            dyStringPrintf(gH->dbg,"%s<br>\n", s); //dumps out qstart1 start1 qstart2 tstart2 ...  
+            dyStringPrintf(gH->dbg,"%s<br>\n", s); //dumps out qstart1 tstart1 qstart2 tstart2 ...  
             freeMem(s);
             }
 
 	slAddHead(&gH->gfList, gfR);
         }
     ++matchCount;
     }
 slReverse(&gH->gfList);
 
 unTranslateCoordinates(gH);  // convert back to untranslated coordinates
-slSort(&gH->gfList, slGfResultsCmp);  // sort by tStrand, chrom, tStart
+slSort(&gH->gfList, gfResultsCmp);  // sort by tStrand, chrom, tStart
 
 
 
 struct qFrameResults
 /* Information about hits on a genome assembly */
     {
     int maxGeneHits;    /* Highest gene hit-count */
     char *maxGeneChrom; /* Target Chrom for gene with max gene hits */
     int maxGeneTStart;  /* Target Start Coordinate for gene with max hits */  
     int maxGeneTEnd;    /* Target End Coordinate for gene with max hits*/
     int maxGeneExons;   /* Number of Exons in gene with max hits */
     char maxGeneTStrand;/* + or - TStrand for gene with max hits */ 
     };
 
 int qFrame = 0;
 int qFrameStart = 0;
 int qFrameEnd = 0;
 
 if (gH->complex && !gH->isProt)  // rnax, dnax
     {
     qFrameEnd = 2;
     }
 
 struct qFrameResults r[3]; 
 
 for(qFrame = qFrameStart; qFrame <= qFrameEnd; ++qFrame)
     {	    
-    findBestGene(gH, qFrame);  // find best gene-line thing with most hits
+    findBestGene(gH, qFrame);  // find best gene-like thing with most hits
     
     r[qFrame].maxGeneHits = gH->maxGeneHits;
     r[qFrame].maxGeneChrom = cloneString(gH->maxGeneChrom);
     r[qFrame].maxGeneTStart = gH->maxGeneTStart;
     r[qFrame].maxGeneTEnd = gH->maxGeneTEnd;
     r[qFrame].maxGeneExons = gH->maxGeneExons;
     r[qFrame].maxGeneTStrand = gH->maxGeneTStrand;
 
     }
 
 // combine the results from all 3 qFrames
 
 if (gH->complex && !gH->isProt)  // rnax, dnax
     {
     int biggestHit = -1;
@@ -1954,39 +1919,39 @@
 	    }
 
 	if (ptMax > 0)
 	    {
 	    /* wait for remote parallel load to finish */
 	    remoteParallelLoadWait(atoi(cfgOptionDefault("parallelFetch.timeout", "90")));  // wait up to default 90 seconds.
 	    }
 
 	// Should continue with pfdDone since threads could still be running that might access pdfList ?
 
 	// Hide weaker of RC'd query pairs, if not debugging.
 	// Combine pairs with a query and its RC.
 	if (!(debuggingGfResults) &&
 	   (sameString(pfdDone->xType,"dna") || sameString(pfdDone->xType,"dnax")))
 	    {
-	    slSort(&pfdDone, slRcPairsCmp);  
+	    slSort(&pfdDone, rcPairsCmp);  
 	    hideWeakerOfQueryRcPairs(pfdDone);
 	    }
 
 	// requires db for chromSize, do database after multi-threading done.
 	changeMaxGenePositionToPositiveStrandCoords(pfdDone);
 
         // sort by maximum hits
-	slSort(&pfdDone, slHitsCmp);
+	slSort(&pfdDone, genomeHitsCmp);
 
 	// Print instructions
 	printf("Click the link in the Assembly column to see the full BLAT output for that Genome.<br><br>\n");
 
 	// Print report  // TODO move to final report at the end of ALL Assemblies
 	int lastSeqNumber = -1;
 	int idCount = 0;
 	char id[256];
 	struct genomeHits *gH = NULL;
 	for (gH = pfdDone; gH; gH = gH->next)
 	    {
 	    if (lastSeqNumber != gH->seqNumber)
 		{
 		if (lastSeqNumber != -1) // end previous table
 		    {