b31907d700c1fe956e4e4c20e64d91de027d7c84
markd
  Tue May 14 02:03:33 2024 -0700
merge blatHuge implementation

diff --git src/gfServer/gfServer.c src/gfServer/gfServer.c
index 75f57f4..0d765ae 100644
--- src/gfServer/gfServer.c
+++ src/gfServer/gfServer.c
@@ -66,36 +66,35 @@ option globals; start of usage()
 boolean noSimpRepMask = FALSE;
 int repMatch = 1024;    /* Can be overridden from command line. */
 int maxDnaHits = 100;   /* Can be overridden from command line. */
 int maxTransHits = 200; /* Can be overridden from command line. */
 int maxGap = gfMaxGap;
 boolean seqLog = FALSE;
 boolean ipLog = FALSE;
 boolean doMask = FALSE;
 boolean canStop = FALSE;
 char *indexFile = NULL;
 char *genome = NULL;
 char *genomeDataDir = NULL;
 
 int timeout = 90;  // default timeout in seconds
 
-
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
-  "gfServer v %s - Make a server to quickly find where DNA occurs in genome\n"
+  "gfServer v %s - Make a server to quickly find where DNA occurs in genome (%d-bit index)\n"
   "   To set up a server:\n"
   "      gfServer start host port file(s)\n"
   "      where the files are .2bit or .nib format files specified relative to the current directory\n"
   "   To remove a server:\n"
   "      gfServer stop host port\n"
   "   To query a server with DNA sequence:\n"
   "      gfServer query host port probe.fa\n"
   "   To query a server with protein sequence:\n"
   "      gfServer protQuery host port probe.fa\n"
   "   To query a server with translated DNA sequence:\n"
   "      gfServer transQuery host port probe.fa\n"
   "   To query server with PCR primers:\n"
   "      gfServer pcr host port fPrimer rPrimer maxDistance\n"
   "   To process one probe fa file against a .2bit format genome (not starting server):\n"
   "      gfServer direct probe.fa file(s).2bit\n"
@@ -155,37 +154,37 @@ usage() option help text and errAbort argument list
   "                   tile. Default is %d.\n"
   "   -noSimpRepMask  Suppresses simple repeat masking.\n"
   "   -maxDnaHits=N   Maximum number of hits for a DNA query that are sent from the server.\n"
   "                   Default is %d.\n"
   "   -maxTransHits=N Maximum number of hits for a translated query that are sent from the server.\n"
   "                   Default is %d.\n"
   "   -maxNtSize=N    Maximum size of untranslated DNA query sequence.\n"
   "                   Default is %d.\n"
   "   -maxAaSize=N    Maximum size of protein or translated DNA queries.\n"
   "                   Default is %d.\n"
   "   -perSeqMax=file File contains one seq filename (possibly with ':seq' suffix) per line.\n"
   "                   -maxDnaHits will be applied to each filename[:seq] separately: each may\n"
   "                   have at most maxDnaHits/2 hits.  The filename MUST not include the directory.\n"
   "                   Useful for assemblies with many alternate/patch sequences.\n"
   "   -canStop        If set, a quit message will actually take down the server.\n"
-  "   -indexFile      Index file create by `gfServer index'. Saving index can speed up\n"
+  "   -indexFile      Index file created by `gfServer index'. Saving index can speed up\n"
   "                   gfServer startup by two orders of magnitude.  The parameters must\n"
   "                   exactly match the parameters when the file is written or bad things\n"
   "                   will happen.\n"
   "   -timeout=N      Timeout in seconds.\n"
   "                   Default is %d.\n"
-  ,	gfVersion, repMatch, maxDnaHits, maxTransHits, maxNtSize, maxAaSize, timeout
+  ,	gfVersion, GFINDEX_BITS, repMatch, maxDnaHits, maxTransHits, maxNtSize, maxAaSize, timeout
   );
 
 }
 /*
   Note about file(s) specified in the start command:
       The path(s) specified here are sent back exactly as-is
       to clients such as gfClient, hgBlat, webBlat.
       It is intended that relative paths are used.
       Absolute paths starting with '/' tend not to work
       unless the client is on the same machine as the server.
       For use with hgBlat and webBlat, cd to the directory where the file is
       and use the plain file name with no slashes.
         hgBlat will append the path(s) given to dbDb.nibPath.
        webBlat will append the path(s) given to path specified in webBlat.cfg.
       gfClient will append the path(s) given to the seqDir path specified.
@@ -226,31 +225,31 @@ genoFind debug logging (presumably logGenoFind); logGenoFindIndex
 /* debug log the genoFind parameters */
 {
 logDebug("gf->isMapped: %d", gf->isMapped);
 logDebug("gf->maxPat: %d", gf->maxPat);
 logDebug("gf->minMatch: %d", gf->minMatch);
 logDebug("gf->maxGap: %d", gf->maxGap);
 logDebug("gf->tileSize: %d", gf->tileSize);
 logDebug("gf->stepSize: %d", gf->stepSize);
 logDebug("gf->tileSpaceSize: %d", gf->tileSpaceSize);
 logDebug("gf->tileMask: %d", gf->tileMask);
 logDebug("gf->sourceCount: %d", gf->sourceCount);
 logDebug("gf->isPep: %d", gf->isPep);
 logDebug("gf->allowOneMismatch: %d", gf->allowOneMismatch);
 logDebug("gf->noSimpRepMask: %d", gf->noSimpRepMask);
 logDebug("gf->segSize: %d", gf->segSize);
-logDebug("gf->totalSeqSize: %d", gf->totalSeqSize);
+logDebug("gf->totalSeqSize: " GFOFFSET_FMT, gf->totalSeqSize);
 }
 
 void logGenoFindIndex(struct genoFindIndex *gfIdx)
 /* debug log the genoFind parameters in an genoFindIndex */
 {
 logDebug("gfIdx->isTrans: %d", gfIdx->isTrans);
 logDebug("gfIdx->noSimpRepMask: %d", gfIdx->noSimpRepMask);
 if (gfIdx->untransGf != NULL)
     logGenoFind(gfIdx->untransGf);
 else
     logGenoFind(gfIdx->transGf[0][0]);
 }
 
 
 void genoFindDirect(char *probeName, int fileCount, char *seqFiles[])
@@ -355,31 +354,32 @@ DNA query clump reporting (function header precedes this hunk); bounded safef replaces sprintf
 {
 char buf[256];
 struct gfClump *clumpList = NULL, *clump;
 int limit = 1000;
 int clumpCount = 0, hitCount = -1;
 struct lm *lm = lmInit(0);
 
 if (seq->size > gf->tileSize + gf->stepSize + gf->stepSize)
      limit = maxDnaHits;
 clumpList = gfFindClumps(gf, seq, lm, &hitCount);
 if (clumpList == NULL)
     ++missCount;
 for (clump = clumpList; clump != NULL; clump = clump->next)
     {
     struct gfSeqSource *ss = clump->target;
-    sprintf(buf, "%d\t%d\t%s\t%d\t%d\t%d", 
+    assert((clump->qStart < clump->qEnd) && (clump->tStart < clump->tEnd));
+    safef(buf, sizeof(buf), GFOFFSET_FMT "\t" GFOFFSET_FMT "\t%s\t" GFOFFSET_FMT "\t" GFOFFSET_FMT "\t%d",
 	clump->qStart, clump->qEnd, ss->fileName,
 	clump->tStart-ss->start, clump->tEnd-ss->start, clump->hitCount);
     errSendString(connectionHandle, buf);
     ++clumpCount;
     int perSeqCount = -1;
     if (perSeqMaxHash &&
         ((perSeqCount = hashIntValDefault(perSeqMaxHash, ss->fileName, -1)) >= 0))
         {
         if (perSeqCount >= (maxDnaHits / 2))
             break;
         hashIncInt(perSeqMaxHash, ss->fileName);
         }
     else if (--limit < 0)
 	break;
     }
@@ -403,40 +403,41 @@ translated-query clump reporting (function header precedes this hunk); errSendLongString kept
 
 sprintf(buf, "tileSize %d", tileSize);
 errSendString(connectionHandle, buf);
 for (frame = 0; frame < 3; ++frame)
     clumps[frame] = NULL;
 for (isRc = 0; isRc <= 1; ++isRc)
     {
     strand = (isRc ? '-' : '+');
     gfTransFindClumps(transGf[isRc], seq, clumps, lm, &oneHit);
     hitCount += oneHit;
     for (frame = 0; frame < 3; ++frame)
         {
 	int limit = maxTransHits;
 	for (clump = clumps[frame]; clump != NULL; clump = clump->next)
 	    {
+            assert((clump->qStart < clump->qEnd) && (clump->tStart < clump->tEnd));
 	    struct gfSeqSource *ss = clump->target;
-	    sprintf(buf, "%d\t%d\t%s\t%d\t%d\t%d\t%c\t%d", 
+	    sprintf(buf, GFOFFSET_FMT "\t" GFOFFSET_FMT "\t%s\t" GFOFFSET_FMT "\t" GFOFFSET_FMT "\t%d\t%c\t%d", 
 		clump->qStart, clump->qEnd, ss->fileName,
 		clump->tStart-ss->start, clump->tEnd-ss->start, clump->hitCount,
 		strand, frame);
 	    errSendString(connectionHandle, buf);
 	    dyStringClear(dy);
 	    for (hit = clump->hitList; hit != NULL; hit = hit->next)
-	        dyStringPrintf(dy, " %d %d", hit->qStart, hit->tStart - ss->start);
+	        dyStringPrintf(dy, " " GFOFFSET_FMT " " GFOFFSET_FMT, hit->qStart, hit->tStart - ss->start);
 	    errSendLongString(connectionHandle, dy->string);
 	    ++clumpCount;
 	    if (--limit < 0)
 		break;
 	    }
 	gfClumpFreeList(&clumps[frame]);
 	}
     }
 if (clumpCount == 0)
     ++missCount;
 dyStringFree(&dy);
 lmCleanup(&lm);
 logDebug("%lu %d clumps, %d hits", clock1000(), clumpCount, hitCount);
 }
 
 void transTransQuery(struct genoFind *transGf[2][3], struct dnaSeq *seq, 
@@ -458,84 +459,87 @@ trans/trans clump reporting (presumably transTransQuery); pcrQuery
     for (tFrame=0; tFrame<3; ++tFrame)
 	clumps[qFrame][tFrame] = NULL;
 for (isRc = 0; isRc <= 1; ++isRc)
     {
     struct lm *lm = lmInit(0);
     strand = (isRc ? '-' : '+');
     gfTransTransFindClumps(transGf[isRc], t3->trans, clumps, lm, &oneCount);
     hitCount += oneCount;
     for (qFrame = 0; qFrame<3; ++qFrame)
 	{
 	for (tFrame=0; tFrame<3; ++tFrame)
 	    {
 	    int limit = maxTransHits;
 	    for (clump = clumps[qFrame][tFrame]; clump != NULL; clump = clump->next)
 		{
+                assert((clump->qStart < clump->qEnd) && (clump->tStart < clump->tEnd));
 		struct gfSeqSource *ss = clump->target;
-		sprintf(buf, "%d\t%d\t%s\t%d\t%d\t%d\t%c\t%d\t%d", 
+		sprintf(buf, GFOFFSET_FMT "\t" GFOFFSET_FMT "\t%s\t" GFOFFSET_FMT "\t" GFOFFSET_FMT "\t%d\t%c\t%d\t%d", 
 		    clump->qStart, clump->qEnd, ss->fileName,
 		    clump->tStart-ss->start, clump->tEnd-ss->start, clump->hitCount,
 		    strand, qFrame, tFrame);
 		errSendString(connectionHandle, buf);
 		dyStringClear(dy);
 		for (hit = clump->hitList; hit != NULL; hit = hit->next)
 		    {
-		    dyStringPrintf(dy, " %d %d", hit->qStart, hit->tStart - ss->start);
+		    dyStringPrintf(dy, " " GFOFFSET_FMT " " GFOFFSET_FMT, hit->qStart, hit->tStart - ss->start);
 		    }
 		errSendLongString(connectionHandle, dy->string);
 		++clumpCount;
 		if (--limit < 0)
 		    break;
 		}
 	    gfClumpFreeList(&clumps[qFrame][tFrame]);
 	    }
 	}
     lmCleanup(&lm);
     }
 trans3Free(&t3);
 if (clumpCount == 0)
     ++missCount;
 logDebug("%lu %d clumps, %d hits", clock1000(), clumpCount, hitCount);
 }
 
 static void pcrQuery(struct genoFind *gf, char *fPrimer, char *rPrimer, 
 	int maxDistance, int connectionHandle)
 /* Do PCR query and report results down socket. */
 {
 int fPrimerSize = strlen(fPrimer);
 int rPrimerSize = strlen(rPrimer);
 struct gfClump *clumpList, *clump;
 int clumpCount = 0;
 char buf[256];
 
 clumpList = gfPcrClumps(gf, fPrimer, fPrimerSize, rPrimer, rPrimerSize, 0, maxDistance);
 for (clump = clumpList; clump != NULL; clump = clump->next)
     {
+    assert((clump->qStart < clump->qEnd) && (clump->tStart < clump->tEnd));
     struct gfSeqSource *ss = clump->target;
-    safef(buf, sizeof(buf), "%s\t%d\t%d\t+", ss->fileName, 
+    safef(buf, sizeof(buf), "%s\t" GFOFFSET_FMT "\t" GFOFFSET_FMT "\t+", ss->fileName, 
         clump->tStart, clump->tEnd);
     errSendString(connectionHandle, buf);
     ++clumpCount;
     }
 gfClumpFreeList(&clumpList);
 
 clumpList = gfPcrClumps(gf, rPrimer, rPrimerSize, fPrimer, fPrimerSize, 0, maxDistance);
 
 for (clump = clumpList; clump != NULL; clump = clump->next)
     {
+    assert((clump->qStart < clump->qEnd) && (clump->tStart < clump->tEnd));
     struct gfSeqSource *ss = clump->target;
-    safef(buf, sizeof(buf), "%s\t%d\t%d\t-", ss->fileName, 
+    safef(buf, sizeof(buf), "%s\t" GFOFFSET_FMT "\t" GFOFFSET_FMT "\t-", ss->fileName, 
         clump->tStart, clump->tEnd);
     errSendString(connectionHandle, buf);
     ++clumpCount;
     }
 gfClumpFreeList(&clumpList);
 errSendString(connectionHandle, "end");
 logDebug("%lu PCR %s %s %d clumps", clock1000(), fPrimer, rPrimer, clumpCount);
 }
 
 
 static jmp_buf gfRecover;
 static char *ripCord = NULL;	/* A little memory to give back to system
                                  * during error recovery. */
 
 static void gfAbort()