b31907d700c1fe956e4e4c20e64d91de027d7c84
markd
  Tue May 14 02:03:33 2024 -0700
merge blatHuge implementation

diff --git src/jkOwnLib/genoFind.c src/jkOwnLib/genoFind.c
index a4e0e12..4e20111 100644
--- src/jkOwnLib/genoFind.c
+++ src/jkOwnLib/genoFind.c
@@ -6,34 +6,73 @@
 #include <sys/mman.h>
 #include "portable.h"
 #include "obscure.h"
 #include "dnautil.h"
 #include "dnaseq.h"
 #include "nib.h"
 #include "twoBit.h"
 #include "fa.h"
 #include "dystring.h"
 #include "errAbort.h"
 #include "sig.h"
 #include "ooc.h"
 #include "genoFind.h"
 #include "trans3.h"
 #include "binRange.h"
+#include "portable.h"
+#include <sys/mman.h>
 
 static char indexFileMagic[] = "genoFind";
 static char indexFileVerison[] = "1.0";
 
+#undef DEBUG_CLUMP
+#undef DEBUG_HITS
+
+#ifdef DEBUG_CLUMP
+static void dumpClump(struct gfClump *clump, FILE *f)
+/* Print out a clump */
+{
+struct gfSeqSource *target = clump->target;
+char *tName = target->fileName;
+if (tName == NULL) tName = target->seq->name;
+fprintf(f, GFOFFSET_FMT "-" GFOFFSET_FMT "\t%s:" GFOFFSET_FMT "-" GFOFFSET_FMT "\n",
+        clump->qStart, clump->qEnd, tName, clump->tStart, clump->tEnd);
+}
+
+static void dumpClumpList(struct gfClump *clumpList, FILE *f)
+/* Dump list of clumps. */
+{
+fprintf(f, "=== Clumps\n");
+struct gfClump *clump;
+for (clump = clumpList; clump != NULL; clump = clump->next)
+    dumpClump(clump, f);
+fflush(f);
+}
+#endif /* DEBUG_CLUMP */
+
+#ifdef DEBUG_HITS
+static void dumpHits(struct gfHit *hits, FILE *f)
+/* Dump list of hits. */
+{
+fprintf(f, "=== Hits\n");
+for (struct gfHit *hit = hits; hit != NULL; hit = hit->next)
+    fprintf(f, GFOFFSET_FMT " " GFOFFSET_FMT " "  GFOFFSET_FMT "\n",
+            hit->qStart, hit->tStart, hit->diagonal);
+fflush(f);
+}
+#endif /*  DEBUG_HITS */
+
 char *gfSignature()
 /* Return signature that starts each command to gfServer. Helps defend 
  * server from confused clients. */
 {
 static char signature[] = "0ddf270562684f29";
 return signature;
 }
 
 volatile boolean pipeBroke = FALSE;	/* Flag broken pipes here. */
 
 static void gfPipeHandler(int sigNum)
 /* Set broken pipe flag. */
 {
 pipeBroke = TRUE;
 }
@@ -79,59 +118,109 @@
     if (!gf->isMapped)
         {
         freeMem(gf->listSizes);
         freeMem(gf->allocated);
         }
     if ((sources = gf->sources) != NULL)
 	{
 	for (i=0; i<gf->sourceCount; ++i)
 	    bitFree(&sources[i].maskedBits);
 	freeMem(sources);
 	}
     freez(pGenoFind);
     }
 }
 
+/* constants for endList */
+#if GFSERVER_HUGE
+#define ENDLIST_ENTRY_NUM_PARTS 3  // 48-bits endList entry parts
+#else
+#define ENDLIST_ENTRY_NUM_PARTS 5  // 80-bits endList entry parts
+#endif
+
+#define ENDLIST_ENTRY_SIZE (ENDLIST_ENTRY_NUM_PARTS * sizeof(endListPart))
+/* byte size of an entry in endList array */
+
+INLINE endListPart* endListGetEntry(struct genoFind *gf, int iRow, int iCol)
+/* get an entry in the endlist array */
+{
+assert(iCol < gf->listSizes[iRow]);
+bits8 *row = (bits8*)gf->endLists[iRow];
+return (endListPart*)(row + (iCol * ENDLIST_ENTRY_SIZE));
+}
+
+INLINE void endListSetEntryValues(struct genoFind *gf, int iRow, int iCol,
+                                  bits16 tileTail, gfOffset offset)
+/* store values in an endList entry */
+{
+endListPart* entry = endListGetEntry(gf, iRow, iCol);
+entry[0] = tileTail;
+#if GFSERVER_HUGE
+entry[1] = offset >> 32;
+entry[2] = (offset >> 16) & 0xffff;
+entry[3] = offset & 0xffff;
+#else
+entry[1] = offset >> 16;
+entry[2] = offset & 0xffff;
+#endif
+}
+
+INLINE bits16 endListEntryTileTail(endListPart *entry)
+/* get endList tileTail value */
+{
+return entry[0];
+}
+
+INLINE gfOffset endListEntryOffset(endListPart *entry)
+/* get endList genome offset */
+{
+#if GFSERVER_HUGE
+return ((bits64)entry[1] << 32) | ((bits64)entry[2] << 16) | (bits64)entry[3];
+#else
+return (entry[1] << 16) | entry[2];
+#endif
+}
+
 static off_t mustSeekAligned(FILE *f)
 /* seek so that the current offset is 64-bit aligned */
 {
 off_t off = ftell(f);
 if ((off & 0x7) != 0)
     {
     off = (off & ~0x7) + 0x8;
     mustSeek(f, off, SEEK_SET);
     }
 return off;
 }
 
 
 struct genoFindFileHdr
 /* header for genoFind section in file */
 {
     int maxPat;
     int minMatch;
     int maxGap;
     int tileSize;
     int stepSize;
     int tileSpaceSize;
     int tileMask;
     int sourceCount;
     bool isPep;
     bool allowOneMismatch;
     bool noSimpRepMask;
     int segSize;
-    int totalSeqSize;
+    gfOffset totalSeqSize;
 
     off_t sourcesOff;     // offset of sequences sources
     off_t listSizesOff;   // offset of listSizes
     off_t listsOff;       // offset of lists or endLists
     off_t endListsOff;
 
     // Reserved area. These are bytes of zero, so that need fields can be added
     // that default to zero without needed check the version in code.
     bits64 reserved[32];  // vesion 1.0: 32 words
 };
 
 static void genoFindInitHdr(struct genoFind *gf,
                             struct genoFindFileHdr *hdr)
 /* fill in the file header struct from in-memory struct */
 {
@@ -173,40 +262,40 @@
 static void genoFindWriteSource(struct gfSeqSource *ss, FILE *f)
 /* write a gfSeqSource object */
 {
 if ((ss->seq != NULL) || (ss->maskedBits != NULL))
     errAbort("can't write index contained sequences, must used external sequence file");
 
 char *fileName = ss->fileName;
 if (fileName != NULL)
     {
     // don't include directories
     char *s = strrchr(fileName, '/');
     if (s != NULL)
         fileName = s + 1;
     }
 writeStringSafe(f, fileName);
-mustWrite(f, &ss->start, sizeof(bits32));
-mustWrite(f, &ss->end, sizeof(bits32));
+mustWrite(f, &ss->start, sizeof(gfOffset));
+mustWrite(f, &ss->end, sizeof(gfOffset));
 }
 
 static void genoFindReadSource(FILE *f, struct gfSeqSource *ss)
 /* read a gfSeqSource from file */
 {
 ss->fileName = readString(f);
-mustRead(f, &ss->start, sizeof(bits32));
-mustRead(f, &ss->end, sizeof(bits32));
+mustRead(f, &ss->start, sizeof(gfOffset));
+mustRead(f, &ss->end, sizeof(gfOffset));
 }
 
 static off_t genoFindWriteSources(struct genoFind *gf, FILE *f)
 /* write the sources to the file */
 {
 off_t off = mustSeekAligned(f);
 int i;
 for (i = 0; i < gf->sourceCount; i++)
     genoFindWriteSource(gf->sources + i, f);
 return off;
 }
 
 static void genoFindReadSources(FILE *f, off_t off, struct genoFind *gf)
 /* read the sources from the file */
 {
@@ -229,70 +318,70 @@
 static void genoFindMapListSize(void *memMapped, off_t off, struct genoFind *gf)
 /* map the list sizes into memory */
 {
 gf->listSizes = memMapped + off;
 }
 
 static off_t genoFindWriteLists(struct genoFind *gf, FILE *f)
 /* write the lists */
 {
 off_t off = mustSeekAligned(f);
 size_t count = 0;
 int i;
 for (i = 0; i < gf->tileSpaceSize; i++)
     if (gf->listSizes[i] < gf->maxPat)
         count += gf->listSizes[i];
-mustWrite(f, gf->allocated, count*sizeof(bits32));
+mustWrite(f, gf->allocated, count*sizeof(gfOffset));
 return off;
 }
 
 static void genoFindMapLists(void *memMapped, off_t off, struct genoFind *gf)
 /* maps the lists into memory */
 {
 gf->allocated = memMapped + off;
 gf->lists = needHugeZeroedMem(gf->tileSpaceSize * sizeof(gf->lists[0]));
-bits32 *cur = gf->allocated;
+gfOffset *cur = gf->allocated;
 size_t count = 0;
 int i;
 for (i = 0; i < gf->tileSpaceSize; i++)
     {
     if (gf->listSizes[i] < gf->maxPat)
         {
         gf->lists[i] = cur;
         cur += gf->listSizes[i];
         count += gf->listSizes[i];
         }
     }
 }
 
 static off_t genoFindWriteEndLists(struct genoFind *gf, FILE *f)
 /* write the endList */
 {
 off_t off = mustSeekAligned(f);
 size_t count = 0;
 int i;
 for (i = 0; i < gf->tileSpaceSize; i++)
     count += gf->listSizes[i];
-mustWrite(f, gf->allocated, 3*count*sizeof(bits16));
+mustWrite(f, gf->allocated, count * ENDLIST_ENTRY_SIZE);
 return off;
 }
 
 static void genoFindMapEndLists(void *memMapped, off_t off, struct genoFind *gf)
 /* maps the endLists into memory */
 {
 gf->endLists = needHugeZeroedMem(gf->tileSpaceSize * sizeof(gf->endLists[0]));
-bits16 *cur = gf->allocated;
+endListPart* cur = gf->allocated;
 size_t count = 0;
 int i;
 for (i = 0; i < gf->tileSpaceSize; i++)
     {
     gf->endLists[i] = cur;
     cur += 3 * gf->listSizes[i];
     count += gf->listSizes[i];
     }
 }
   
 
 static off_t genoFindWrite(struct genoFind *gf, FILE *f)
 /* write one genoFind structure, return offset */
 {
 off_t hdrOff = mustSeekAligned(f);
@@ -381,45 +470,54 @@
     off_t untransOff;
     off_t transOff[2][3];
 
     // Reserved area. These are bytes of zero, so that need fields can be added
     // that default to zero without needed check the version in code.
     bits64 reserved[32];  // vesion 1.0: 32 words
 };
 
 static void genoFindIndexInitHeader(struct genoFindIndex *gfIdx,
                                     struct genoFindIndexFileHdr* hdr)
 /* fill in the file header struct from in-memory struct */
 {
 zeroBytes(hdr, sizeof(struct genoFindIndexFileHdr));
 safecpy(hdr->magic, sizeof(hdr->magic), indexFileMagic);
 safecpy(hdr->version, sizeof(hdr->version), indexFileVerison);
+#ifdef GFSERVER_HUGE
+hdr->indexAddressSize = 64;
+#else
 hdr->indexAddressSize = 32;
+#endif
 hdr->isTrans = gfIdx->isTrans;
 }
 
 static void genoFindIndexReadHeader(void* memMapped, struct genoFindIndexFileHdr* hdr,
                                     struct genoFindIndex* gfIdx)
 /* fill in the file header from file and valodate */
 {
 *hdr = *((struct genoFindIndexFileHdr*)memMapped);
 if (!sameString(hdr->magic, indexFileMagic))
     errAbort("wrong magic string for index file");
 if (!sameString(hdr->version, indexFileVerison))
     errAbort("unsupported version for index file: %s", hdr->version);
+#ifdef GFSERVER_HUGE
+if (hdr->indexAddressSize != 64)
+    errAbort("compiled for 64-bit index, but not a 64-bit index: %d", hdr->indexAddressSize);
+#else
 if (hdr->indexAddressSize != 32)
-    errAbort("not a 32-bit index: %d", hdr->indexAddressSize);
+    errAbort("compiled for 32-bit index, but not a 32-bit index: %d", hdr->indexAddressSize);
+#endif
 if (hdr->isTrans != gfIdx->isTrans)
     errAbort("index file has isTrans=%d, isTrans=%d requested", hdr->isTrans, gfIdx->isTrans);
 }
 
 static void genoFindIndexWriteTrans(struct genoFindIndex *gfIdx,
                                     struct genoFindIndexFileHdr* hdr,
                                     FILE *f)
 /* write translated indexes */
 {
 int i, j;
 for (i = 0; i < 2; i++)
     for (j = 0; j < 3; j++)
         hdr->transOff[i][j] = genoFindWrite(gfIdx->transGf[i][j], f);
 }
 
@@ -495,31 +593,31 @@
     else
         {
         for (int i = 0; i < 2; i++)
             for (int j = 0; j < 3; j++)
                 genoFindFree(&gfIdx->transGf[i][j]);
         }
     if (gfIdx->memMapped != NULL)
         {
         if (munmap(gfIdx->memMapped, gfIdx->memLength))
             errnoAbort("munmap error");
         }
     freez(pGfIdx);
     }
 }
 
-int gfPowerOf20(int n)
+static int gfPowerOf20(int n)
 /* Return a 20 to the n */
 {
 int res = 20;
 while (--n > 0)
     res *= 20;
 return res;
 }
 
 void gfCheckTileSize(int tileSize, boolean isPep)
 /* Check that tile size is legal.  Abort if not. */
 {
 if (isPep)
     {
     if (tileSize < 3 || tileSize > 8)
 	errAbort("protein tileSize must be between 3 and 8");
@@ -600,46 +698,46 @@
 {
 static boolean initted = FALSE;
 if (!initted)
     {
     int i;
     for (i=0; i<ArraySize(ntLookup); ++i)
         ntLookup[i] = -1;
     ntLookup['a'] = A_BASE_VAL;
     ntLookup['c'] = C_BASE_VAL;
     ntLookup['t'] = T_BASE_VAL;
     ntLookup['g'] = G_BASE_VAL;
     initted = TRUE;
     }
 }
 
-int gfDnaTile(DNA *dna, int n)
+static int gfDnaTile(DNA *dna, int n)
 /* Make a packed DNA n-mer. */
 {
 int tile = 0;
 int i, c;
 for (i=0; i<n; ++i)
     {
     tile <<= 2;
     if ((c = ntLookup[(int)dna[i]]) < 0)
         return -1;
     tile += c;
     }
 return tile;
 }
 
-int gfPepTile(AA *pep, int n)
+static int gfPepTile(AA *pep, int n)
 /* Make up packed representation of translated protein. */
 {
 int tile = 0;
 int aa;
 while (--n >= 0)
     {
     tile *= 20;
     aa = aaVal[(int)(*pep++)];
     if (aa < 0 || aa > 19)
         return -1;
     tile += aa;
     }
 return tile;
 }
 
@@ -658,233 +756,225 @@
 
 initNtLookup();
 for (i=0; i<=lastTile; i += stepSize)
     {
     if ((tile = makeTile(poly, tileHeadSize)) >= 0)
 	{
         if (listSizes[tile] < maxPat)
 	    {
 	    listSizes[tile] += 1;
 	    }
 	}
     poly += stepSize;
     }
 }
 
-static int gfCountTilesInNib(struct genoFind *gf, int stepSize, char *fileName)
+static long long gfCountTilesInNib(struct genoFind *gf, int stepSize, char *fileName)
 /* Count all tiles in nib file.  Returns nib size. */
 {
 FILE *f;
 int tileSize = gf->tileSize;
 int bufSize = tileSize * 16*1024;
 int nibSize, i;
 int endBuf, basesInBuf;
 struct dnaSeq *seq;
 
 printf("Counting tiles in %s\n", fileName);
 nibOpenVerify(fileName, &f, &nibSize);
 for (i=0; i < nibSize; i = endBuf)
     {
     endBuf = i + bufSize;
     if (endBuf >= nibSize) endBuf = nibSize;
     basesInBuf = endBuf - i;
     seq = nibLdPart(fileName, f, nibSize, i, basesInBuf);
     gfCountSeq(gf, seq);
     freeDnaSeq(&seq);
     }
 fclose(f);
 return nibSize;
 }
 
-long long maxTotalBases()
+static bits64 maxTotalBases()
 /* Return maximum bases we can index. */
 {
-long long maxBases = 1024*1024;
-maxBases *= 4*1024;
-return maxBases;
+#ifdef GFSERVER_HUGE
+return 18446744073709551615ULL; // 64-bit index
+#else
+return 4294967295ULL;    // 32-bit index
+#endif
 }
 
-static long long twoBitCheckTotalSize(struct twoBitFile *tbf)
+static bits64 twoBitCheckTotalSize(struct twoBitFile *tbf)
 /* Return total size of sequence in two bit file.  Squawk and
  * die if it's more than 4 gig. */
 {
-long long totalSize = twoBitTotalSize(tbf);
+bits64 totalSize = twoBitTotalSize(tbf);
 if (totalSize > maxTotalBases())
     errAbort("Sorry, can only index up to %lld bases, %s has %lld",
 	maxTotalBases(),  tbf->fileName, totalSize);
 return totalSize;
 }
 
 static void gfCountTilesInTwoBit(struct genoFind *gf, int stepSize,
-	char *fileName, int *retSeqCount, long long *retBaseCount)
+	char *fileName, int *retSeqCount, bits64 *retBaseCount)
 /* Count all tiles in 2bit file.  Returns number of sequences and
  * total size of sequences in file. */
 {
 struct dnaSeq *seq;
 struct twoBitFile *tbf = twoBitOpen(fileName);
 struct twoBitIndex *index;
 int seqCount = 0;
-long long baseCount = twoBitCheckTotalSize(tbf);
+bits64 baseCount = twoBitCheckTotalSize(tbf);
 
 printf("Counting tiles in %s\n", fileName);
 for (index = tbf->indexList; index != NULL; index = index->next)
     {
     seq = twoBitReadSeqFragLower(tbf, index->name, 0, 0);
     gfCountSeq(gf, seq);
     ++seqCount;
     freeDnaSeq(&seq);
     }
 twoBitClose(&tbf);
 *retSeqCount = seqCount;
 *retBaseCount = baseCount;
 }
 
 static int gfAllocLists(struct genoFind *gf)
 /* Allocate index lists and set up list pointers. 
  * Returns size of all lists. */
 {
-int oneCount;
-int count = 0;
+bits64 count = 0;
 int i;
 bits32 *listSizes = gf->listSizes;
-bits32 **lists = gf->lists;
-bits32 *allocated = NULL;
-bits32 maxPat = gf->maxPat;
-int size;
+gfOffset **lists = gf->lists;
+gfOffset *allocated = NULL;
+gfOffset maxPat = gf->maxPat;
 int usedCount = 0, overusedCount = 0;
 int tileSpaceSize = gf->tileSpaceSize;
 
 for (i=0; i<tileSpaceSize; ++i)
     {
     /* If pattern is too much used it's no good to us, ignore. */
-    if ((oneCount = listSizes[i]) < maxPat)
+    if (listSizes[i] < maxPat)
         {
-        count += oneCount;
+        count += listSizes[i];
         usedCount += 1;
         }
     else
         {
         overusedCount += 1;
         }
     }
 if (count > 0)
-    gf->allocated = allocated = needHugeMem(count*sizeof(allocated[0]));
+    gf->allocated = allocated = needHugeMem(count*sizeof(gfOffset));
 for (i=0; i<tileSpaceSize; ++i)
     {
-    if ((size = listSizes[i]) < maxPat)
+    if (listSizes[i] < maxPat)
         {
         lists[i] = allocated;
-        allocated += size;
+        allocated += listSizes[i];
         }
     }
 return count;
 }
 
 static int gfAllocLargeLists(struct genoFind *gf)
 /* Allocate large index lists and set up list pointers. 
  * Returns size of all lists. */
 {
 int count = 0;
 int i;
 bits32 *listSizes = gf->listSizes;
-bits16 **endLists = gf->endLists;
-bits16 *allocated = NULL;
-int size;
+endListPart **endLists = gf->endLists;
+endListPart *allocated = NULL;
 int tileSpaceSize = gf->tileSpaceSize;
 
 for (i=0; i<tileSpaceSize; ++i)
     count += listSizes[i];
 if (count > 0)
-    gf->allocated = allocated = needHugeMem(3*count*sizeof(allocated[0]));
+    gf->allocated = allocated = needHugeMem(count * ENDLIST_ENTRY_SIZE);
 for (i=0; i<tileSpaceSize; ++i)
     {
-    size = listSizes[i];
     endLists[i] = allocated;
-    allocated += 3*size;
+    allocated += ENDLIST_ENTRY_SIZE * listSizes[i];
     }
 return count;
 }
 
 
-static void gfAddSeq(struct genoFind *gf, bioSeq *seq, bits32 offset)
+static void gfAddSeq(struct genoFind *gf, bioSeq *seq, gfOffset offset)
 /* Add all N-mers in seq.  Done after gfCountSeq. */
 {
 char *poly = seq->dna;
 int tileSize = gf->tileSize;
 int stepSize = gf->stepSize;
 int i, lastTile = seq->size - tileSize;
 int (*makeTile)(char *poly, int n) = (gf->isPep ? gfPepTile : gfDnaTile);
 int maxPat = gf->maxPat;
 int tile;
 bits32 *listSizes = gf->listSizes;
-bits32 **lists = gf->lists;
+gfOffset **lists = gf->lists;
 
 initNtLookup();
 for (i=0; i<=lastTile; i += stepSize)
     {
     tile = makeTile(poly, tileSize);
     if (tile >= 0)
         {
 	if (listSizes[tile] < maxPat)
 	    {
 	    lists[tile][listSizes[tile]++] = offset;
 	    }
 	}
     offset += stepSize;
     poly += stepSize;
     }
 }
 
-static void gfAddLargeSeq(struct genoFind *gf, bioSeq *seq, bits32 offset)
+static void gfAddLargeSeq(struct genoFind *gf, bioSeq *seq, gfOffset offset)
 /* Add all N-mers to segmented index.  Done after gfCountSeq. */
 {
 char *poly = seq->dna;
 int tileSize = gf->tileSize;
 int stepSize = gf->stepSize;
 int tileTailSize = gf->segSize;
 int tileHeadSize = tileSize - tileTailSize;
 int i, lastTile = seq->size - tileSize;
 int (*makeTile)(char *poly, int n) = (gf->isPep ? gfPepTile : gfDnaTile);
 int tileHead;
 int tileTail;
 bits32 *listSizes = gf->listSizes;
-bits16 **endLists = gf->endLists;
-bits16 *endList;
 int headCount;
 
 initNtLookup();
 for (i=0; i<=lastTile; i += stepSize)
     {
     tileHead = makeTile(poly, tileHeadSize);
     tileTail = makeTile(poly + tileHeadSize, tileTailSize);
     if (tileHead >= 0 && tileTail >= 0)
         {
-	endList = endLists[tileHead];
 	headCount = listSizes[tileHead]++;
-	endList += headCount * 3;		/* Because have three slots per. */
-	endList[0] = tileTail;
-	endList[1] = (offset >> 16);
-	endList[2] = (offset&0xffff);
+        endListSetEntryValues(gf, i, headCount, tileTail, offset);
 	}
     offset += stepSize;
     poly += stepSize;
     }
 }
 
 
 
-static int gfAddTilesInNib(struct genoFind *gf, char *fileName, bits32 offset,
+static int gfAddTilesInNib(struct genoFind *gf, char *fileName, gfOffset offset,
 	int stepSize)
 /* Add all tiles in nib file.  Returns size of nib file. */
 {
 FILE *f;
 int tileSize = gf->tileSize;
 int bufSize = tileSize * 16*1024;
 int nibSize, i;
 int endBuf, basesInBuf;
 struct dnaSeq *seq;
 
 printf("Adding tiles in %s\n", fileName);
 nibOpenVerify(fileName, &f, &nibSize);
 for (i=0; i < nibSize; i = endBuf)
     {
     endBuf = i + bufSize;
@@ -939,61 +1029,63 @@
 	int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile,
 	boolean allowOneMismatch, int stepSize, boolean noSimpRepMask)
 /* Make index for all nibs and .2bits in list. 
  *      minMatch - minimum number of matching tiles to trigger alignments
  *      maxGap   - maximum deviation from diagonal of tiles
  *      tileSize - size of tile in nucleotides
  *      maxPat   - maximum use of tile to not be considered a repeat
  *      oocFile  - .ooc format file that lists repeat tiles.  May be NULL. 
  *      allowOneMismatch - allow one mismatch in a tile.  
  *      stepSize - space between tiles.  Zero means default (which is tileSize). 
  *      noSimpRepMask - skip simple repeat masking. */
 {
 struct genoFind *gf = gfNewEmpty(minMatch, maxGap, tileSize, stepSize,
 	maxPat, oocFile, FALSE, allowOneMismatch, noSimpRepMask);
 int i;
-bits32 offset = 0, nibSize;
+gfOffset offset = 0, nibSize;
 char *fileName;
 struct gfSeqSource *ss;
-long long totalBases = 0, warnAt = maxTotalBases();
+bits64 totalBases = 0, warnAt = maxTotalBases();
 int totalSeq = 0;
 
 if (allowOneMismatch)
     errAbort("Don't currently support allowOneMismatch in gfIndexNibsAndTwoBits");
 if (stepSize == 0)
     stepSize = gf->tileSize;
 for (i=0; i<fileCount; ++i)
     {
     fileName = fileNames[i];
+    if (!fileExists(fileName))
+        errAbort("Unrecognized genome sequence does not exist %s", fileName);
     if (twoBitIsFile(fileName))
 	{
 	int seqCount;
-	long long baseCount;
+	bits64 baseCount;
         gfCountTilesInTwoBit(gf, stepSize, fileName, &seqCount, &baseCount);
 	totalBases += baseCount;
 	totalSeq += seqCount;
 	}
     else if (nibIsFile(fileName))
 	{
 	totalBases += gfCountTilesInNib(gf, stepSize, fileName);
 	totalSeq += 1;
 	}
     else
-        errAbort("Unrecognized file type %s", fileName);
-    /* Warn if they exceed 4 gig. */
+        errAbort("Unrecognized genome sequence file type %s", fileName);
+    /* Abort if they exceed size, suggest large sequence blat. */
     if (totalBases >= warnAt)
-	errAbort("Exceeding 4 billion bases, sorry gfServer can't handle that.");
+	errAbort("Exceeding 4 billion bases, try large genome gfServer.");
     }
 gfAllocLists(gf);
 gfZeroNonOverused(gf);
 AllocArray(gf->sources, totalSeq);
 gf->sourceCount = totalSeq;
 ss = gf->sources;
 for (i=0; i<fileCount; ++i)
     {
     fileName = fileNames[i];
     if (nibIsFile(fileName))
 	{
 	nibSize = gfAddTilesInNib(gf, fileName, offset, stepSize);
 	ss->fileName = cloneString(findTail(fileName, '/'));
 	ss->start = offset;
 	offset += nibSize;
@@ -1104,31 +1196,31 @@
 for (isRc=0; isRc <= 1; ++isRc)
     {
     if (isRc)
 	reverseComplement(seq->dna, seq->size);
     t3 = trans3New(seq);
     for (frame = 0; frame < 3; ++frame)
 	{
 	struct genoFind *gf = transGf[isRc][frame];
 	gfCountSeq(gf, t3->trans[frame]);
 	}
     trans3Free(&t3);
     }
 }
 
 static void transIndexBothStrands(struct dnaSeq *seq, 
-    struct genoFind *transGf[2][3], bits32 offset[2][3],
+    struct genoFind *transGf[2][3], gfOffset offset[2][3],
     int sourceIx, char *fileName)
 /* Add unmasked tiles on both strands of sequence to
  * index.  As a side effect this will reverse-complement seq. */
 {
 int isRc, frame;
 struct trans3 *t3;
 struct gfSeqSource *ss;
 for (isRc=0; isRc <= 1; ++isRc)
     {
     if (isRc)
 	{
 	reverseComplement(seq->dna, seq->size);
 	}
     t3 = trans3New(seq);
     for (frame = 0; frame < 3; ++frame)
@@ -1141,35 +1233,35 @@
 	offset[isRc][frame] += t3->trans[frame]->size;
 	ss->end = offset[isRc][frame];
 	}
     trans3Free(&t3);
     }
 }
 
 void gfIndexTransNibsAndTwoBits(struct genoFind *transGf[2][3], 
     int fileCount, char *fileNames[], 
     int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile,
     boolean allowOneMismatch, boolean doMask, int stepSize, boolean noSimpRepMask)
 /* Make translated (6 frame) index for all .nib and .2bit files. */
 {
 struct genoFind *gf;
 int i,isRc, frame;
-bits32 offset[2][3];
+gfOffset offset[2][3];
 char *fileName;
 struct dnaSeq *seq;
 int sourceCount = 0;
-long long totalBases = 0, warnAt = maxTotalBases();
+bits64 totalBases = 0, warnAt = maxTotalBases();
 
 if (allowOneMismatch)
     errAbort("Don't currently support allowOneMismatch in gfIndexTransNibsAndTwoBits");
 /* Allocate indices for all reading frames. */
 for (isRc=0; isRc <= 1; ++isRc)
     {
     for (frame = 0; frame < 3; ++frame)
 	{
 	transGf[isRc][frame] = gf = gfNewEmpty(minMatch, maxGap, 
 		tileSize, stepSize, maxPat, oocFile, TRUE, allowOneMismatch, noSimpRepMask);
 	}
     }
 
 /* Mask simple AA repeats (of period 1 and 2). */
 for (isRc = 0; isRc <= 1; ++isRc)
@@ -1261,31 +1353,31 @@
 	gf = transGf[isRc][frame];
 	gf->totalSeqSize = offset[isRc][frame];
 	gfZeroOverused(gf);
 	}
     }
 }
 
 static struct genoFind *gfSmallIndexSeq(struct genoFind *gf, bioSeq *seqList,
 	int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, 
 	boolean isPep, boolean maskUpper)
 /* Make index for all seqs in list. */
 {
 int seqCount = slCount(seqList);
 bioSeq *seq;
 int i;
-bits32 offset = 0;
+gfOffset offset = 0;
 struct gfSeqSource *ss;
 
 if (isPep)
     maskSimplePepRepeat(gf);
 for (seq = seqList; seq != NULL; seq = seq->next)
     gfCountSeq(gf, seq);
 gfAllocLists(gf);
 gfZeroNonOverused(gf);
 if (seqCount > 0)
     AllocArray(gf->sources, seqCount);
 gf->sourceCount = seqCount;
 for (i=0, seq = seqList; i<seqCount; ++i, seq = seq->next)
     {
     gfAddSeq(gf, seq, offset);
     ss = gf->sources+i;
@@ -1297,31 +1389,31 @@
 	ss->maskedBits = maskFromUpperCaseSeq(seq);
     }
 gf->totalSeqSize = offset;
 gfZeroOverused(gf);
 return gf;
 }
 
 static struct genoFind *gfLargeIndexSeq(struct genoFind *gf, bioSeq *seqList,
 	int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, 
 	boolean isPep, boolean maskUpper)
 /* Make index for all seqs in list. */
 {
 int seqCount = slCount(seqList);
 bioSeq *seq;
 int i;
-bits32 offset = 0;
+gfOffset offset = 0;
 struct gfSeqSource *ss;
 
 for (seq = seqList; seq != NULL; seq = seq->next)
     gfCountSeq(gf, seq);
 gfAllocLargeLists(gf);
 gfZeroNonOverused(gf);
 AllocArray(gf->sources, seqCount);
 gf->sourceCount = seqCount;
 for (i=0, seq = seqList; i<seqCount; ++i, seq = seq->next)
     {
     gfAddLargeSeq(gf, seq, offset);
     ss = gf->sources+i;
     ss->seq = seq;
     ss->start = offset;
     offset += seq->size;
@@ -1363,46 +1455,46 @@
     stepSize = tileSize;
 if (gf->segSize > 0)
     {
     gfLargeIndexSeq(gf, seqList, minMatch, maxGap, tileSize, maxPat, oocFile, isPep, maskUpper);
     }
 else
     {
     gfSmallIndexSeq(gf, seqList, minMatch, maxGap, tileSize, maxPat, oocFile, isPep, maskUpper);
     }
 return gf;
 }
 
 static int bCmpSeqSource(const void *vTarget, const void *vRange)
 /* Compare function for binary search of gfSeqSource. */
 {
-const bits32 *pTarget = vTarget;
-bits32 target = *pTarget;
+const gfOffset *pTarget = vTarget;
+gfOffset target = *pTarget;
 const struct gfSeqSource *ss = vRange;
 
 if (target < ss->start) return -1;
 if (target >= ss->end) return 1;
 return 0;
 }
 
-static struct gfSeqSource *findSource(struct genoFind *gf, bits32 targetPos)
+static struct gfSeqSource *findSource(struct genoFind *gf, gfOffset targetPos)
 /* Find source given target position. */
 {
 struct gfSeqSource *ss =  bsearch(&targetPos, gf->sources, gf->sourceCount, 
 	sizeof(gf->sources[0]), bCmpSeqSource);
 if (ss == NULL)
-    errAbort("Couldn't find source for %d", targetPos);
+    errAbort("Couldn't find source for " GFOFFSET_FMT, targetPos);
 return ss;
 }
 
 void gfClumpFree(struct gfClump **pClump)
 /* Free a single clump. */
 {
 struct gfClump *clump;
 if ((clump = *pClump) == NULL)
     return;
 freez(pClump);
 }
 
 void gfClumpFreeList(struct gfClump **pList)
 /* Free a list of dynamically allocated gfClump's */
 {
@@ -1411,31 +1503,31 @@
 for (el = *pList; el != NULL; el = next)
     {
     next = el->next;
     gfClumpFree(&el);
     }
 *pList = NULL;
 }
 
 void gfClumpDump(struct genoFind *gf, struct gfClump *clump, FILE *f)
 /* Print out info on clump */
 {
 struct gfSeqSource *ss = clump->target;
 char *name = ss->fileName;
 
 if (name == NULL) name = ss->seq->name;
-fprintf(f, "%u-%u %s %u-%u, hits %d\n", 
+fprintf(f, GFOFFSET_FMT "-" GFOFFSET_FMT " %s " GFOFFSET_FMT "-" GFOFFSET_FMT ", hits %d\n", 
 	clump->qStart, clump->qEnd, name,
 	clump->tStart - ss->start, clump->tEnd - ss->start,
 	clump->hitCount);
 #ifdef SOMETIMES
 for (hit = clump->hitList; hit != NULL; hit = hit->next)
     fprintf(f, "   q %d, t %d, diag %d\n", hit->qStart, hit->tStart, hit->diagonal);
 #endif
 }
 
 /* Fast sorting routines for sorting gfHits on diagonal. 
  * More or less equivalent to system qsort, but with
  * comparison function inline.  Worth a little tweaking
  * since this is the bottleneck for the whole procedure. */
 
 static void gfHitSort2(struct gfHit **ptArray, int n);
@@ -1626,31 +1718,31 @@
 
 
 static int gfClumpCmpQueryCoverage(const void *va, const void *vb)
 /* Compare to sort based on query coverage. */
 {
 const struct gfClump *a = *((struct gfClump **)va);
 const struct gfClump *b = *((struct gfClump **)vb);
 
 return (b->queryCoverage - a->queryCoverage);
 }
 
 static void findClumpBounds(struct gfClump *clump, int tileSize)
 /* Figure out qStart/qEnd tStart/tEnd from hitList */
 {
 struct gfHit *hit;
-int x;
+gfOffset x;
 hit = clump->hitList;
 if (hit == NULL)
     return;
 clump->qStart = clump->qEnd = hit->qStart;
 clump->tStart = clump->tEnd = hit->tStart;
 for (hit = hit->next; hit != NULL; hit = hit->next)
     {
     x = hit->qStart;
     if (x < clump->qStart) clump->qStart = x;
     if (x > clump->qEnd) clump->qEnd = x;
     x = hit->tStart;
     if (x < clump->tStart) clump->tStart = x;
     if (x > clump->tEnd) clump->tEnd = x;
     }
 clump->tEnd += tileSize;
@@ -1716,31 +1808,31 @@
 	}
     clump->hitList = NULL;	/* We ate up the hit list. */
     gfClumpFree(&clump);
     }
 }
 
 static int gfNearEnough = 300;
 
 static struct gfClump *clumpNear(struct genoFind *gf, struct gfClump *oldClumps, int minMatch)
 /* Go through clump list and make sure hits are also near each other.
  * If necessary divide clumps. */
 {
 struct gfClump *newClumps = NULL, *clump, *nextClump;
 struct gfHit *hit, *nextHit;
 int tileSize = gf->tileSize;
-bits32 lastT;
+gfOffset lastT;
 int nearEnough = (gf->isPep ? gfNearEnough/3 : gfNearEnough);
 
 for (clump = oldClumps; clump != NULL; clump = nextClump)
     {
     struct gfHit *newHits = NULL, *oldHits = clump->hitList;
     int clumpSize = 0;
     clump->hitCount = 0;
     clump->hitList = NULL;	/* Clump no longer owns list. */
     nextClump = clump->next;
     slSort(&oldHits, gfHitCmpTarget);
     lastT = oldHits->tStart;
     for (hit = oldHits; hit != NULL; hit = nextHit)
         {
 	nextHit = hit->next;
 	if (hit->tStart > nearEnough + lastT)
@@ -1782,49 +1874,52 @@
 return newClumps;
 }
 
 static struct gfClump *clumpHits(struct genoFind *gf, struct gfHit *hitList, int minMatch)
 /* Clump together hits according to parameters in gf. */
 {
 struct gfClump *clumpList = NULL, *clump = NULL;
 int maxGap = gf->maxGap;
 struct gfHit *hit, *nextHit, *lastHit = NULL;
 int totalHits = 0, usedHits = 0, clumpCount = 0;
 int tileSize = gf->tileSize;
 int bucketShift = 16;		/* 64k buckets. */
 bits32 bucketSize = (1<<bucketShift);
 int bucketCount = (gf->totalSeqSize >> bucketShift) + 1;
 int nearEnough = (gf->isPep ? gfNearEnough/3 : gfNearEnough);
-bits32 boundary = bucketSize - nearEnough;
+gfOffset boundary = bucketSize - nearEnough;
 int i;
 struct gfHit **buckets = NULL, **pb;
 
+#ifdef DEBUG_HITS
+dumpHits(hitList, stdout);
+#endif
 /* Sort hit list into buckets. */
 AllocArray(buckets, bucketCount);
 for (hit = hitList; hit != NULL; hit = nextHit)
     {
     nextHit = hit->next;
     assert((hit->tStart >> bucketShift) < bucketCount);
     pb = buckets + (hit->tStart >> bucketShift);
     slAddHead(pb, hit);
     }
 
 /* Sort each bucket on diagonal and clump. */
 for (i=0; i<bucketCount; ++i)
     {
     int clumpSize;
-    bits32 maxT;
+    gfOffset maxT;
     struct gfHit *clumpHits;
     pb = buckets + i;
     gfHitSortDiagonal(pb);
     for (hit = *pb; hit != NULL; )
          {
 	 /* Each time through this loop will get info on a clump.  Will only
 	  * actually create clump if it is big enough though. */
 	 clumpSize = 0;
 	 clumpHits = lastHit = NULL;
 	 maxT = 0;
 	 for (; hit != NULL; hit = nextHit)
 	     {
 	     nextHit = hit->next;
 	     if (lastHit != NULL && hit->diagonal - lastHit->diagonal > maxGap)
 		 break;
@@ -1846,134 +1941,130 @@
 	     AllocVar(clump);
 	     slAddHead(&clumpList, clump);
 	     clump->hitCount = clumpSize;
 	     clump->hitList = clumpHits;
 	     usedHits += clumpSize;
 	     ++clumpCount;
 	     }
 	 }
     *pb = NULL;
     boundary += bucketSize;
     }
 clumpList = clumpNear(gf, clumpList, minMatch);
 gfClumpComputeQueryCoverage(clumpList, tileSize);	/* Thanks AG */
 slSort(&clumpList, gfClumpCmpQueryCoverage);
 
-#ifdef DEBUG
-uglyf("Dumping clumps B\n");
-for (clump = clumpList; clump != NULL; clump = clump->next)	/* uglyf */
-    {
-    uglyf(" %d %d %s %d %d (%d hits)\n", clump->qStart, clump->qEnd, clump->target->seq->name,   clump->tStart, clump->tEnd, clump->hitCount);
-    }
+#ifdef DEBUG_CLUMP
+dumpClumpList(clumpList, stdout);
 #endif /* DEBUG */
 freez(&buckets);
 return clumpList;
 }
 
 
 static struct gfHit *gfFastFindDnaHits(struct genoFind *gf, struct dnaSeq *seq, 
-	Bits *qMaskBits,  int qMaskOffset, struct lm *lm, int *retHitCount,
-	struct gfSeqSource *target, int tMin, int tMax)
+	Bits *qMaskBits,  gfOffset qMaskOffset, struct lm *lm, int *retHitCount,
+	struct gfSeqSource *target, gfOffset tMin, gfOffset tMax)
 /* Find hits associated with one sequence. This is is special fast
  * case for DNA that is in an unsegmented index. */
 {
 struct gfHit *hitList = NULL, *hit;
-int size = seq->size;
+gfOffset size = seq->size;
 int tileSizeMinusOne = gf->tileSize - 1;
 int mask = gf->tileMask;
 DNA *dna = seq->dna;
 int i, j;
-bits32 bits = 0;
-bits32 bVal;
+gfOffset bits = 0;
+gfOffset bVal;
 int listSize;
-bits32 qStart, *tList;
+gfOffset qStart, *tList;
 int hitCount = 0;
 
 for (i=0; i<tileSizeMinusOne; ++i)
     {
     bVal = ntValNoN[(int)dna[i]];
     bits <<= 2;
     bits += bVal;
     }
 for (i=tileSizeMinusOne; i<size; ++i)
     {
     bVal = ntValNoN[(int)dna[i]];
     bits <<= 2;
     bits += bVal;
     bits &= mask;
     listSize = gf->listSizes[bits];
     if (listSize != 0)
 	{
 	qStart = i-tileSizeMinusOne;
 	if (qMaskBits == NULL || bitCountRange(qMaskBits, qStart+qMaskOffset, gf->tileSize) == 0)
 	    {
 	    tList = gf->lists[bits];
 	    for (j=0; j<listSize; ++j)
 		{
-		int tStart = tList[j];
+		gfOffset tStart = tList[j];
 		if (target == NULL || 
 			(target == findSource(gf, tStart) && tStart >= tMin && tStart < tMax) ) 
 		    {
 		    lmAllocVar(lm, hit);
 		    hit->qStart = qStart;
 		    hit->tStart = tStart;
 		    hit->diagonal = tStart + size - qStart;
 		    slAddHead(&hitList, hit);
 		    ++hitCount;
 		    }
 		}
 	    }
 	}
     }
 *retHitCount = hitCount;
 return hitList;
 }
 
 static struct gfHit *gfStraightFindHits(struct genoFind *gf, aaSeq *seq, 
-	Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount,
-	struct gfSeqSource *target, int tMin, int tMax)
+	Bits *qMaskBits, gfOffset qMaskOffset, struct lm *lm, int *retHitCount,
+	struct gfSeqSource *target, gfOffset tMin, gfOffset tMax)
 /* Find hits associated with one sequence in a non-segmented
  * index where hits match exactly. */
 {
 struct gfHit *hitList = NULL, *hit;
 int size = seq->size;
 int tileSize = gf->tileSize;
 int lastStart = size - tileSize;
 char *poly = seq->dna;
 int i, j;
 int tile;
 int listSize;
-bits32 qStart, *tList;
+gfOffset qStart, *tList;
 int hitCount = 0;
 int (*makeTile)(char *poly, int n) = (gf->isPep ? gfPepTile : gfDnaTile);
 
 initNtLookup();
 for (i=0; i<=lastStart; ++i)
     {
     tile = makeTile(poly+i, tileSize);
     if (tile < 0)
 	continue;
     listSize = gf->listSizes[tile];
     if (listSize > 0)
 	{
 	qStart = i;
 	if (qMaskBits == NULL || bitCountRange(qMaskBits, qStart+qMaskOffset, tileSize) == 0)
 	    {
 	    tList = gf->lists[tile];
 	    for (j=0; j<listSize; ++j)
 		{
-		int tStart = tList[j];
+		gfOffset tStart = tList[j];
 		if (target == NULL || 
 			(target == findSource(gf, tStart) && tStart >= tMin && tStart < tMax) ) 
 		    {
 		    lmAllocVar(lm,hit);
 		    hit->qStart = qStart;
 		    hit->tStart = tStart;
 		    hit->diagonal = tStart + size - qStart;
 		    slAddHead(&hitList, hit);
 		    ++hitCount;
 		    }
 		}
 	    }
 	}
     }
 *retHitCount = hitCount;
@@ -1982,31 +2073,31 @@
 
 static struct gfHit *gfStraightFindNearHits(struct genoFind *gf, aaSeq *seq, 
 	Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount,
 	struct gfSeqSource *target, int tMin, int tMax)
 /* Find hits associated with one sequence in a non-segmented
  * index where hits can mismatch in one letter. */
 {
 struct gfHit *hitList = NULL, *hit;
 int size = seq->size;
 int tileSize = gf->tileSize;
 int lastStart = size - tileSize;
 char *poly = seq->dna;
 int i, j;
 int tile;
 int listSize;
-bits32 qStart, *tList;
+gfOffset qStart, *tList;
 int hitCount = 0;
 int varPos, varVal;	/* Variable position. */
 int (*makeTile)(char *poly, int n); 
 int alphabetSize;
 char oldChar, zeroChar;
 int *seqValLookup;
 int posMul, avoid;
 
 initNtLookup();
 if (gf->isPep)
     {
     makeTile = gfPepTile;
     alphabetSize = 20;
     zeroChar = 'A';
     seqValLookup = aaVal;
@@ -2082,55 +2173,56 @@
 	Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount,
 	struct gfSeqSource *target, int tMin, int tMax)
 /* Find hits associated with one sequence in general case in a segmented
  * index. */
 {
 struct gfHit *hitList = NULL, *hit;
 int size = seq->size;
 int tileSize = gf->tileSize;
 int tileTailSize = gf->segSize;
 int tileHeadSize = gf->tileSize - tileTailSize;
 int lastStart = size - tileSize;
 char *poly = seq->dna;
 int i, j;
 int tileHead, tileTail;
 int listSize;
-bits32 qStart;
-bits16 *endList;
+gfOffset qStart;
+endListPart *endList;
 int hitCount = 0;
 int (*makeTile)(char *poly, int n) = (gf->isPep ? gfPepTile : gfDnaTile);
 
 
 initNtLookup();
 for (i=0; i<=lastStart; ++i)
     {
     if (qMaskBits == NULL || bitCountRange(qMaskBits, i+qMaskOffset, tileSize) == 0)
 	{
 	tileHead = makeTile(poly+i, tileHeadSize);
 	if (tileHead < 0)
 	    continue;
 	tileTail = makeTile(poly+i+tileHeadSize, tileTailSize);
 	if (tileTail < 0)
 	    continue;
 	listSize = gf->listSizes[tileHead];
 	qStart = i;
 	endList = gf->endLists[tileHead];
 	for (j=0; j<listSize; ++j)
 	    {
-	    if (endList[0] == tileTail)
+            endListPart *entry = endListGetEntry(gf, tileHead, j);
+ 	    if (endListEntryTileTail(entry) == tileTail)
 		{
-		int tStart = (endList[1]<<16) + endList[2];
+		int tStart = endListEntryOffset(entry);
 		if (target == NULL || 
 			(target == findSource(gf, tStart) 
 			&& tStart >= tMin && tStart < tMax) ) 
 		    {
 		    lmAllocVar(lm,hit);
 		    hit->qStart = qStart;
 		    hit->tStart = tStart;
 		    hit->diagonal = tStart + size - qStart;
 		    slAddHead(&hitList, hit);
 		    ++hitCount;
 		    }
 		}
 	    endList += 3;
 	    }
 	}
@@ -2143,32 +2235,32 @@
 	aaSeq *seq, Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount,
 	struct gfSeqSource *target, int tMin, int tMax)
 /* Find hits associated with one sequence in a segmented
  * index where one mismatch is allowed. */
 {
 struct gfHit *hitList = NULL, *hit;
 int size = seq->size;
 int tileSize = gf->tileSize;
 int tileTailSize = gf->segSize;
 int tileHeadSize = gf->tileSize - tileTailSize;
 int lastStart = size - tileSize;
 char *poly = seq->dna;
 int i, j;
 int tileHead, tileTail;
 int listSize;
-bits32 qStart;
-bits16 *endList;
+gfOffset qStart;
+endListPart *endList;
 int hitCount = 0;
 int varPos, varVal;	/* Variable position. */
 int (*makeTile)(char *poly, int n); 
 int alphabetSize;
 char oldChar, zeroChar;
 int headPosMul, tailPosMul, avoid;
 boolean modTail;
 int *seqValLookup;
 
 
 initNtLookup();
 if (gf->isPep)
     {
     makeTile = gfPepTile;
     alphabetSize = 20;
@@ -2200,36 +2292,36 @@
 
 	    /* Avoid checking the unmodified tile multiple times. */
 	    if (varPos == 0)
 		avoid = -1;
 	    else
 		avoid = seqValLookup[(int)oldChar];
 
 	    if (tileHead >= 0 && tileTail >= 0)
 		{
 		for (varVal=0; varVal<alphabetSize; ++varVal)
 		    {
 		    if (varVal != avoid)
 			{
 			listSize = gf->listSizes[tileHead];
 			qStart = i;
-			endList = gf->endLists[tileHead];
 			for (j=0; j<listSize; ++j)
 			    {
-			    if (endList[0] == tileTail)
+                            endListPart *entry = endListGetEntry(gf, tileHead, j);
+			    if (endListEntryTileTail(entry) == tileTail)
 				{
-				int tStart = (endList[1]<<16) + endList[2];
+				int tStart = endListEntryOffset(entry);
 				if (target == NULL || 
 					(target == findSource(gf, tStart) 
 					&& tStart >= tMin && tStart < tMax) ) 
 				    {
 				    lmAllocVar(lm,hit);
 				    hit->qStart = qStart;
 				    hit->tStart = tStart;
 				    hit->diagonal = tStart + size - qStart;
 				    slAddHead(&hitList, hit);
 				    ++hitCount;
 				    }
 				}
 			    endList += 3;
 			    }
 			}
@@ -2283,49 +2375,30 @@
 	if (gf->allowOneMismatch)
 	    {
 	    hitList = gfSegmentedFindNearHits(gf, seq, qMaskBits, qMaskOffset, lm, retHitCount,
 		target, tMin, tMax);
 	    }
 	else
 	    {
 	    hitList = gfSegmentedFindHits(gf, seq, qMaskBits, qMaskOffset, lm, retHitCount,
 		target, tMin, tMax);
 	    }
 	}
     }
 return hitList;
 }
 
-#ifdef DEBUG
-static void dumpClump(struct gfClump *clump, FILE *f)
-/* Print out a clump */
-{
-struct gfSeqSource *target = clump->target;
-char *tName = target->fileName;
-if (tName == NULL) tName = target->seq->name;
-fprintf(f, "%d-%d\t%s:%d-%d\n", clump->qStart, clump->qEnd, tName, clump->tStart, clump->tEnd);
-}
-
-static void dumpClumpList(struct gfClump *clumpList, FILE *f)
-/* Dump list of clumps. */
-{
-struct gfClump *clump;
-for (clump = clumpList; clump != NULL; clump = clump->next)
-    dumpClump(clump, f);
-}
-#endif /* DEBUG */
-
 struct gfClump *gfFindClumpsWithQmask(struct genoFind *gf, bioSeq *seq, 
 	Bits *qMaskBits, int qMaskOffset,
 	struct lm *lm, int *retHitCount)
 /* Find clumps associated with one sequence soft-masking seq according to qMaskBits */
 {
 struct gfClump *clumpList = NULL;
 struct gfHit *hitList;
 int minMatch = gf->minMatch;
 
 #ifdef OLD	/* stepSize makes this obsolete. */
 if (seq->size < gf->tileSize * (gf->minMatch+1))
      minMatch = 1;
 #endif /* OLD */
 
 hitList =  gfFindHitsWithQmask(gf, seq, qMaskBits, qMaskOffset, lm,
@@ -2495,56 +2568,57 @@
     {
     while (--count >= 0)
 	{
 	if (sameString(source->seq->name, name))
 	    return source;
 	source += 1;
 	}
     }
 return NULL;
 }
 
 static void mergeAdd(struct binKeeper *bk, int start, int end, struct gfSeqSource *src)
 /* Add interval to bin-keeper, merging with any existing overlapping
  * intervals. */
 {
+assert((start >= 0) && (end > start));
 struct binElement *iEl, *iList = binKeeperFind(bk, start, end);
 for (iEl = iList; iEl != NULL; iEl = iEl->next)
     {
     if (iEl->start < start)
         start = iEl->start;
     if (iEl->end > end)
         end = iEl->end;
     binKeeperRemove(bk, iEl->start, iEl->end, src);
     }
 slFreeList(&iList);
 binKeeperAdd(bk, start, end, src);
 }
 
 static struct gfClump *pcrClumps(struct genoFind *gf, char *fPrimer, int fPrimerSize, 
 	char *rPrimer, int rPrimerSize, int minDistance, int maxDistance)
 /* Find possible PCR hits.  The fPrimer and rPrimer are on the same strand. */
 {
 struct gfClump *clumpList = NULL;
 int tileSize = gf->tileSize;
 int fTile;
 int fTileCount = fPrimerSize - tileSize;
 int *rTiles, rTile;
 int rTileCount = rPrimerSize - tileSize;
 int fTileIx,rTileIx,fPosIx,rPosIx;
-bits32 *fPosList, fPos, *rPosList, rPos;
-int fPosListSize, rPosListSize;
+gfOffset *fPosList, fPos, *rPosList, rPos;
+gfOffset fPosListSize, rPosListSize;
 struct hash *targetHash = newHash(0);
 
 /* Build up array of all tiles in reverse primer. */
 initNtLookup();
 AllocArray(rTiles, rTileCount);
 for (rTileIx = 0; rTileIx<rTileCount; ++rTileIx)
     {
     rTiles[rTileIx] = gfDnaTile(rPrimer + rTileIx, tileSize);
     if (rTiles[rTileIx] == -1)
         errAbort("Bad char in reverse primer sequence: %s", rPrimer);
     }
 
 /* Loop through all tiles in forward primer. */
 for (fTileIx=0; fTileIx<fTileCount; ++fTileIx)
     {
@@ -2562,31 +2636,31 @@
 		rTile = rTiles[rTileIx];
 		rPosListSize = gf->listSizes[rTile];
 		rPosList = gf->lists[rTile];
 		for (rPosIx=0; rPosIx < rPosListSize; ++rPosIx)
 		    {
 		    rPos = rPosList[rPosIx];
 		    if (rPos > fPos)
 		        {
 			int distance = rPos - fPos;
 			if (distance >= minDistance && distance <= maxDistance)
 			    {
 			    struct gfSeqSource *target = findSource(gf, fPos);
 			    if (rPos < target->end)
 			        {
 				struct binKeeper *bk;
-				int tStart = target->start;
+				gfOffset tStart = target->start;
 				char *tName = target->fileName;
 				if (tName == NULL)
 				    tName = target->seq->name;
 				if ((bk = hashFindVal(targetHash, tName)) == NULL)
 				    {
 				    bk = binKeeperNew(0, target->end - tStart);
 				    hashAdd(targetHash, tName, bk);
 				    }
 				mergeAdd(bk, fPos - tStart, rPos - tStart, target);
 				}
 			    }
 			}
 		    }
 		}
 	    }
@@ -2681,15 +2755,16 @@
     else if (tileSize == 9)
 	repMatch = 64*256;
     else if (tileSize == 8)
 	repMatch = 256*256;
     else if (tileSize == 7)
 	repMatch = 1024*256;
     else if (tileSize == 6)
 	repMatch = 4*1024*256;
     else
         internalErr();
     }
 repMatch *= tileSize;
 repMatch /= stepSize;
 return repMatch;
 }
+