c4f23aaa36169fc4ee7aa14c7fb0eaf69bfb7a02
galt
  Thu Oct 18 18:13:40 2018 -0700
Adding support to hgBlat for searching attached hubs' gfServers

diff --git src/hg/hgBlat/hgBlat.c src/hg/hgBlat/hgBlat.c
index 55b58fe..fbb419e 100644
--- src/hg/hgBlat/hgBlat.c
+++ src/hg/hgBlat/hgBlat.c
@@ -18,31 +18,31 @@
 #include "hdb.h"
 #include "hui.h"
 #include "cart.h"
 #include "dbDb.h"
 #include "blatServers.h"
 #include "web.h"
 #include "hash.h"
 #include "botDelay.h"
 #include "trashDir.h"
 #include "trackHub.h"
 #include "hgConfig.h"
 #include "errCatch.h"
 #include "portable.h"
 #include "portable.h"
 #include "dystring.h"
-
+#include "chromInfo.h"
 #include "net.h"
 
 
 struct cart *cart;	/* The user's ui state. */
 struct hash *oldVars = NULL;
 boolean orgChange = FALSE;
 boolean dbChange = FALSE;
 boolean allGenomes = FALSE;
 
 
 struct gfResult
 /* Detailed gfServer results, this is a span of several nearby tiles, minimum 2 for dna. */
     {
     struct gfResult *next;
     /* have to multiply translated coordinates by 3 */
@@ -241,40 +241,40 @@
     pthread_mutex_lock( &pfdMutex );
     if (pfdList || pfdRunning)
 	done = FALSE;
     pthread_mutex_unlock( &pfdMutex );
     if (done)
         break;
     if (waitTime >= maxTimeInMilliseconds)
         break;
     }
 pthread_mutex_lock( &pfdMutex );
 pfdNeverStarted = pfdList;
 pfdList = NULL;  // stop the workers from starting any more waiting track loads
 for (pfd = pfdNeverStarted; pfd; pfd = pfd->next)
     {
     // query was never even started
-    char temp[256];
+    char temp[1024];
     safef(temp, sizeof temp, "Ran out of time (%d milliseconds) unable to process %s %s", maxTimeInMilliseconds, pfd->genome, pfd->db);
     pfd->networkErrMsg = cloneString(temp);
     pfd->error = TRUE;
     ++errCount;
     }
 for (pfd = pfdRunning; pfd; pfd = pfd->next)
     {
     // unfinished query
-    char temp[256];
+    char temp[1024];
     safef(temp, sizeof temp, "Timeout %d milliseconds exceeded processing %s %s", maxTimeInMilliseconds, pfd->genome, pfd->db);
     pfd->networkErrMsg = cloneString(temp);
     pfd->error = TRUE;
     ++errCount;
     }
 for (pfd = pfdDone; pfd; pfd = pfd->next)
     {
     // some done queries may have errors
     if (pfd->error)
         ++errCount;
     }
 pthread_mutex_unlock( &pfdMutex );
 return errCount;
 }
 
@@ -578,79 +578,107 @@
             printf("<INPUT TYPE=\"hidden\" name=\"isProt\" value=\"on\" />\n");
 
         printf("<TABLE><TR><TD>Custom track name: ");
         cgiMakeTextVar( "trackName", trackName, 30);
         printf("</TD></TR>");
 
         printf("<TR><TD> Custom track description: ");
         cgiMakeTextVar( "trackDescription", trackDescription,50);
         printf("</TD></TR>");
         printf("<TR><TD><INPUT TYPE=SUBMIT NAME=Submit VALUE=\"Build a custom track with these results\"></TD></TR>\n");
         printf("</TABLE></FORM></DIV>");
         }
 
     printf("<DIV STYLE=\"display:block;\"><PRE>");
 
+    // find maximum query name size for padding calculations
+    int maxQChromNameSize = 0;
+    for (psl = pslList; psl != NULL; psl = psl->next)
+	{
+	int l = strlen(psl->qName);
+	maxQChromNameSize = max(maxQChromNameSize,l);
+	}
+    maxQChromNameSize = max(maxQChromNameSize,5);
+
     // find maximum target chrom name size for padding calculations
-    int maxChromNameSize = 0;
+    int maxTChromNameSize = 0;
     for (psl = pslList; psl != NULL; psl = psl->next)
 	{
 	int l = strlen(psl->tName);
-	maxChromNameSize = max(maxChromNameSize,l);
+	maxTChromNameSize = max(maxTChromNameSize,l);
 	}
-    maxChromNameSize = max(maxChromNameSize,5);
+    maxTChromNameSize = max(maxTChromNameSize,5);
 
     // header padding
-    char temp[256];
+    char tempQ[1024];
+    char tempT[1024];
+
+
+    printf("   ACTIONS      QUERY ");
+    tempQ[0] = 0;
+    safecatRepeatChar(tempQ, sizeof tempQ, ' ', (maxQChromNameSize - 5));
+    printf("%s", tempQ);
 
-    temp[0] = 0;
-    safecatRepeatChar(temp, sizeof temp, ' ', (maxChromNameSize - 5));
+    printf("SCORE START   END QSIZE IDENTITY  CHROM ");
+    tempT[0] = 0;
+    safecatRepeatChar(tempT, sizeof tempT, ' ', (maxTChromNameSize - 5));
+    printf("%s", tempT);
 
-    printf("   ACTIONS      QUERY          SCORE START   END QSIZE IDENTITY  CHROM ");
-    printf("%s", temp);
     printf(" STRAND  START       END   SPAN\n");
 
-    printf("------------------------------------------------------------------------------------------------------");
-    temp[0] = 0;
-    safecatRepeatChar(temp, sizeof temp, '-', (maxChromNameSize - 5));
-    printf("%s\n", temp);
+    printf("---------------------------------------------------------------------------------------------");
+
+    tempQ[0] = 0;
+    safecatRepeatChar(tempQ, sizeof tempQ, '-', (maxQChromNameSize - 5));
+    printf("%s", tempQ);
+
+    tempT[0] = 0;
+    safecatRepeatChar(tempT, sizeof tempT, '-', (maxTChromNameSize - 5));
+    printf("%s", tempT);
+
+    printf("\n");
 
     for (psl = pslList; psl != NULL; psl = psl->next)
 	{
         if (customText)
             printf("<A HREF=\"%s?position=%s:%d-%d&db=%s&hgt.customText=%s&%s%s\">",
                 browserUrl, psl->tName, psl->tStart + 1, psl->tEnd, database, 
                 customText, uiState, unhideTrack);
         else
             printf("<A HREF=\"%s?position=%s:%d-%d&db=%s&ss=%s+%s&%s%s\">",
                 browserUrl, psl->tName, psl->tStart + 1, psl->tEnd, database, 
                 pslName, faName, uiState, unhideTrack);
 	printf("browser</A> ");
 	printf("<A HREF=\"%s?o=%d&g=htcUserAli&i=%s+%s+%s&c=%s&l=%d&r=%d&db=%s&%s\">", 
 	    hgcUrl, psl->tStart, pslName, cgiEncode(faName), psl->qName,  psl->tName,
 	    psl->tStart, psl->tEnd, database, uiState);
 	printf("details</A> ");
 
-	temp[0] = 0;
-	safecpy(temp, sizeof temp, psl->tName);
-	int padding = maxChromNameSize - strlen(psl->tName);
-	safecatRepeatChar(temp, sizeof temp, ' ', padding);
+	tempQ[0] = 0;
+	safecpy(tempQ, sizeof tempQ, psl->qName);
+	int qPadding = maxQChromNameSize - strlen(psl->qName);
+	safecatRepeatChar(tempQ, sizeof tempQ, ' ', qPadding);
+
+	tempT[0] = 0;
+	safecpy(tempT, sizeof tempT, psl->tName);
+	int tPadding = maxTChromNameSize - strlen(psl->tName);
+	safecatRepeatChar(tempT, sizeof tempT, ' ', tPadding);
 
 	printf("%-14s %5d %5d %5d %5d   %5.1f%%  %s  %-2s  %9d %9d %6d\n",
-	    psl->qName, pslScore(psl), psl->qStart+1, psl->qEnd, psl->qSize,
+	    tempQ, pslScore(psl), psl->qStart+1, psl->qEnd, psl->qSize,
 	    100.0 - pslCalcMilliBad(psl, TRUE) * 0.1,
-	    temp, psl->strand, psl->tStart+1, psl->tEnd,
+	    tempT, psl->strand, psl->tStart+1, psl->tEnd,
 	    psl->tEnd - psl->tStart);
 	}
     printf("</PRE>\n");
     puts("<P style=\"text-align:right\"><SMALL><A HREF=\"../FAQ/FAQblat.html#blat1b\">Missing a match?</A></SMALL></P>\n");
     puts("</DIV>\n");
     }
 pslFreeList(&pslList);
 
 }
 
 void trimUniq(bioSeq *seqList)
 /* Check that all seq's in list have a unique name.  Try and
  * abbreviate longer sequence names. */
 {
 struct hash *hash = newHash(0);
@@ -1581,60 +1609,60 @@
 );
 puts("<TR><TD COLSPAN=5 WIDTH=\"100%\">\n"); 
 puts("<BR><B>File Upload:</B> ");
 puts("Rather than pasting a sequence, you can choose to upload a text file containing "
 	 "the sequence.<BR>");
 puts("Upload sequence: <INPUT TYPE=FILE NAME=\"seqFile\">");
 puts(" <INPUT TYPE=SUBMIT Name=Submit VALUE=\"submit file\"><P>\n");
 printf("%s", 
 "<P>Only DNA sequences of 25,000 or fewer bases and protein or translated \n"
 "sequence of 10000 or fewer letters will be processed.  Up to 25 sequences\n"
 "can be submitted at the same time. The total limit for multiple sequence\n"
 "submissions is 50,000 bases or 25,000 letters.\n</P>\n");
 
 printf("%s", 
 "<P>The <b>Search ALL</b> checkbox above the Genome drop-down list allows you to search the\n"
-"genomes of the default assemblies for all of our organisms.\n"
+"genomes of the default assemblies for all of our organisms. It also searches any attached hubs' blat servers.\n"
 "This shows you which organisms have the highest homology with your query sequence.\n"
 "The results are ordered so that the organism whose best alignment has the most hits is at the top,\n"
 "and shows the best region found.\n"
 "It makes quick approximate alignments based only on the raw hits,\n"
 "which are a perfectly matching short sub-sequence of a fixed size: \n"
 "11 for DNA and 4 for protein. \n"
 "The entire alignment, including mismatches and gaps, must score 20 \n"
 "or higher in order to appear in the BLAT output.\n"
-"Having only 2 hits is quite low, and will often yield no BLAT results.\n"
-"Click the link to see the full BLAT output for that organism.\n</P>\n");
+"Having too few hits will often yield no BLAT results.\n"
+"Click the Assembly column link on the results page to see the full BLAT output for that organism.\n</P>\n");
 
 if (hgPcrOk(db))
     printf("<P>For locating PCR primers, use <A HREF=\"../cgi-bin/hgPcr?db=%s\">In-Silico PCR</A>"
            " for best results instead of BLAT.</P>", db);
 puts("</TD></TR></TABLE>\n");
 
 
 
 printf("</FORM>\n");
 
 webNewSection("About BLAT");
 printf( 
 "<P>BLAT on DNA is designed to\n"
 "quickly find sequences of 95%% and greater similarity of length 25 bases or\n"
 "more.  It may miss more divergent or shorter sequence alignments.  It will find\n"
 "perfect sequence matches of 20 bases.\n"
 "BLAT on proteins finds sequences of 80%% and greater similarity of length 20 amino\n"
 "acids or more.  In practice DNA BLAT works well on primates, and protein\n"
-"blat on land vertebrates."
+"BLAT on land vertebrates."
 );
 
 
 printf("%s",
 "\n</P><P>BLAT is not BLAST.  DNA BLAT works by keeping an index of the entire genome\n"
 "in memory.  The index consists of all overlapping 11-mers stepping by 5 except for\n"
 "those heavily involved in repeats.  The index takes up about\n"
 "2 gigabytes of RAM.  RAM can be further reduced to less than 1 GB by increasing step size to 11.\n"
 "The genome itself is not kept in memory, allowing\n"
 "BLAT to deliver high performance on a reasonably priced Linux box.\n"
 "The index is used to find areas of probable homology, which are then\n"
 "loaded into memory for a detailed alignment. Protein BLAT works in a similar\n"
 "manner, except with 4-mers rather than 11-mers.  The protein index takes a little\n"
 "more than 2 gigabytes.</P>\n"
 "<P>BLAT was written by <A HREF=\"mailto:kent@soe.ucsc.edu\">Jim Kent</A>.\n"
@@ -1709,40 +1737,60 @@
 	}
     }
 }
 
 void changeMaxGenePositionToPositiveStrandCoords(struct genomeHits *gH)
 /* convert negative strand coordinates to positive strand coordinates if TStrand=='-' */
 {
 for (;gH; gH = gH->next)
     {
     if (gH->hide)
 	continue;
     if (gH->error)
 	continue;
     if (gH->maxGeneTStrand == '-')  // convert to pos strand coordinates
 	{
+	int chromSize = 0;  // 0 means "size not found"; tested after the lookup below
+	char temp[1024];  // error message text, copied into gH->networkErrMsg on failure
+	safef(temp, sizeof temp, "%s", "");  // start with an empty message
+	if (trackHubDatabase(gH->db))  // hub db: look the chromosome up via trackHubMaybeChromInfo() instead of SQL
+	    {
+
+	    struct chromInfo *ci = trackHubMaybeChromInfo(gH->db, gH->maxGeneChrom);
+	    if (ci)
+		chromSize = ci->size;
+	    else
+		{
+		warn("chromosome %s missing from %s .2bit (%s).", gH->maxGeneChrom, gH->db, gH->genome);
+		safef(temp, sizeof temp, "chromosome %s missing from %s .2bit.", gH->maxGeneChrom, gH->db);
+		}
+	    }
+	else  // non-hub db: read the size from the db's chromInfo table
+	    {
 	    struct sqlConnection *conn = hAllocConn(gH->db);
 	    char query[256];
 	    sqlSafef(query, sizeof query, "select size from chromInfo where chrom='%s'", gH->maxGeneChrom);
-	int chromSize = sqlQuickNum(conn, query);
+	    chromSize = sqlQuickNum(conn, query);
 	    hFreeConn(&conn);
 	    if (chromSize == 0)
 		{
 		warn("chromosome %s missing from %s.chromInfo (%s)", gH->maxGeneChrom, gH->db, gH->genome);
-	    char temp[256];
 		safef(temp, sizeof temp, "chromosome %s missing from %s.chromInfo", gH->maxGeneChrom, gH->db);
+		}
+	    }
+	if (chromSize == 0)  // the lookup above produced no size: flag this hit as an error
+	    {
 	    gH->error = TRUE;
 	    gH->networkErrMsg = cloneString(temp);
 	    }
 	else
 	    {
 	    gH->maxGeneChromSize = chromSize;
 	    int tempTStart = gH->maxGeneTStart;
 	    gH->maxGeneTStart = chromSize - gH->maxGeneTEnd;
 	    gH->maxGeneTEnd   = chromSize - tempTStart;
 	    }
 	}
     }
 }
 
 void printDebugging()
@@ -1850,35 +1898,38 @@
 	{
 	cartWebStart(cart, db, "ALL Genomes BLAT Results");
 
 	struct dbDb *dbList = hGetBlatIndexedDatabases();
 
 	struct dbDb *this = NULL;
 	char *saveDb = db;
 	char *saveOrg = organism;
 	struct sqlConnection *conn = hConnectCentral();
 	int dbCount = 0;
 	for(this = dbList; this; this = this->next)
 	    {
 	    db = this->name;
 	    organism = hGenome(db);
 
+	    if (!trackHubDatabase(db))  // if not hub db, make sure it is the default assembly.
+		{	    
 		char query[256];
 		sqlSafef(query, sizeof query, "select name from defaultDb where genome='%s'", organism);
 		char *defaultDb = sqlQuickString(conn, query);
 		if (!sameOk(defaultDb, db))
 		    continue;  // skip non-default dbs
+		}
 
 	    blatSeq(skipLeadingSpaces(userSeq), organism, db, dbCount);
 
 	    ++dbCount;
 	    }
 	dbDbFreeList(&dbList);
 	db = saveDb;
 	organism = saveOrg;
 	hDisconnectCentral(&conn);
 
 	// Loop over each org's default assembly
 
 	/* pre-load remote tracks in parallel */
 	int ptMax = atoi(cfgOptionDefault("parallelFetch.threads", "20"));  // default number of threads for parallel fetch.
 	int pfdListCount = 0;
@@ -1963,46 +2014,46 @@
 		    "<th style='text-align:left'>Exons</th>"
 		    "<th style='text-align:left'>Query RC'd</th>"
 		    "<th style='text-align:left'>Type</th>"
 			);
 		    }
 		printf("\n");
 		printf("</TR>\n");
 		}
 
 	    if (gH->hide) // hide weaker of pairs for dna and dnax with reverse-complimented queries.
 		    continue;
 	    printf("<TR>\n");
 	    if (gH->error)
 		{
 		printf("<td>%s</td><td>%s</td><td>%s</td><td></td><td>%s</td><td></td>",
-		    gH->faName, gH->genome, gH->db, gH->networkErrMsg); 
+		    gH->faName, trackHubSkipHubName(gH->genome), trackHubSkipHubName(gH->db), gH->networkErrMsg); 
 		if (debuggingGfResults)
 		    printf("<td>%d</td><td>%s</td><td></td>", 
 		    gH->queryRC, gH->type);
 		printf("\n");
 		}
 	    else
 		{
 		char pos[256];
 		safef(pos, sizeof pos, "%s:%d-%d", gH->maxGeneChrom, gH->maxGeneTStart+1, gH->maxGeneTEnd); // 1-based closed coord
 		if (!gH->maxGeneChrom) // null
 		    pos[0] = 0;  // empty string
 		safef(id, sizeof id, "res%d", idCount);
 		printf("<td>%s</td><td>%s</td>"
 		    "<td><a id=%s href=''>%s</a></td>"
-		    , gH->faName, gH->genome, id, gH->db);
+		    , gH->faName, trackHubSkipHubName(gH->genome), id, trackHubSkipHubName(gH->db));
 
 		printf("<td style='text-align:right'>%d</td><td>%s</td>", gH->maxGeneHits, 
 		    gH->maxGeneChrom ? gH->maxGeneChrom : "");
 
 		if (debuggingGfResults)
 		    {
 		    printf("<td>%s</td><td>%s</td>", pos, gH->maxGeneStrand);
 		    printf( "<td>%d</td><td>%d</td><td>%s</td>", gH->maxGeneExons, gH->queryRC, gH->xType);
 		    }
 
 		printf("\n");
 		jsOnEventByIdF("click", id, 
 		    "document.mainForm.org.value=\"%s\";"  // some have single-quotes in their value.
 		    "document.mainForm.db.value='%s';"
 		    "document.mainForm.submit();"