b31907d700c1fe956e4e4c20e64d91de027d7c84
markd
  Tue May 14 02:03:33 2024 -0700
merge blatHuge implementation

diff --git src/inc/genoFind.h src/inc/genoFind.h
index 81ec2ad..ac0bd79 100644
--- src/inc/genoFind.h
+++ src/inc/genoFind.h
@@ -1,17 +1,18 @@
 /* genoFind.h - Interface to modules for fast finding of sequence
- * matches. */
+ * matches. Compile with -DGFSERVER_HUGE defined to get 64-bit indexes.
+ */
 /* Copyright 2001-2002 Jim Kent.  All rights reserved. */
 
 #ifndef GENOFIND_H
 #define GENOFIND_H
 
 #ifndef DNASEQ_H
 #include "dnaseq.h"
 #endif
 
 #ifndef FUZZYFIND_H
 #include "fuzzyFind.h"
 #endif
 
 #ifndef HASH_H
 #include "hash.h"
@@ -52,110 +53,127 @@
     char *hostName;   // need when reconnecting
     int port;
     boolean isDynamic;  // is this a dynamic server?
     char *genome;   // genome name for dynamic server
     char *genomeDataDir; // genome data directory for dynamic server
 };
 
 enum gfConstants {
     gfMinMatch = 2,
     gfMaxGap = 2,
     gfTileSize = 11,
     gfMaxTileUse = 1024,
     gfPepMaxTileUse = 30000,
 };
 
+#ifdef GFSERVER_HUGE
+typedef bits64 gfOffset;  /* offset/size of genome sequences */
+#define GFINDEX_BITS 64
+#define GFOFFSET_FMT "%lld"
+#else
+typedef bits32 gfOffset;  /* offset/size of genome sequences */
+#define GFINDEX_BITS 32
+#define GFOFFSET_FMT "%d"
+#endif
+
 struct gfSeqSource
 /* Where a block of sequence comes from. */
     {
     struct gfSeqSource *next;
     char *fileName;	/* Name of file. */
     bioSeq *seq;	/* Sequences.  Usually either this or fileName is NULL. */
-    bits32 start,end;	/* Position within merged sequence. */
+    gfOffset start,end;	/* Position within merged sequence. */
     Bits *maskedBits;	/* If non-null contains repeat-masking info. */
     };
 
 struct gfHit
 /* A genoFind hit. */
    {
    struct gfHit *next;
-   bits32 qStart;		/* Where it hits in query. */
-   bits32 tStart;		/* Where it hits in target. */
-   bits32 diagonal;		/* tStart + qSize - qStart. */
+   gfOffset qStart;		/* Where it hits in query. */
+   gfOffset tStart;		/* Where it hits in target. */
+   gfOffset diagonal;		/* tStart + qSize - qStart. */
    };
 
 /* gfHits are free'd with simple freeMem or slFreeList. */
 
 struct gfClump
 /* A clump of hits. */
 /* Note: for clumps from regular (blat) queries, tStart and tEnd include 
  * target->start, but for clumps from gfPcrClumps(), tStart and tEnd have 
  * already had target->start subtracted.  So tStart and tEnd in PCR clumps 
  * are relative to that target sequence (not the collection of all target 
  * sequences). */
     {
     struct gfClump *next;	/* Next clump. */
-    bits32 qStart, qEnd;	/* Position in query. */
+    gfOffset qStart, qEnd;	/* Position in query. */
     struct gfSeqSource *target;	/* Target source sequence. */
-    bits32 tStart, tEnd;	/* Position in target. */
+    gfOffset tStart, tEnd;	/* Position in target. */
     int hitCount;		/* Number of hits. */
     struct gfHit *hitList;	/* List of hits. Not allocated here. */
     int queryCoverage;		/* Number of bases covered in query (thx AG!) */
     };
 
 void gfClumpFree(struct gfClump **pClump);
 /* Free a single clump. */
 
 void gfClumpFreeList(struct gfClump **pList);
 /* Free a list of dynamically allocated gfClump's */
 
+typedef bits16 endListPart;  // endList structure (below) is packed into 3 or 5 16-bit values
+
+
 struct genoFind
 /* An index of all K-mers in the genome.  
  * WARNING: MUST MODIFY CODE TO STORE/LOAD INDEX TO FILES IF THIS STRUCTURE IS
  * MODIFIED!!!
+ *
+ * The endList structure in the index is a more complex list for each N-mer.
+ * The width of each endList row is in listSizes.  Each entry begins with the
+ * packed last few letters of the tile, then the offset in the genome.  This
+ * would be a struct but that would take 8 bytes instead of 6, or nearly an
+ * extra gigabyte of RAM for the 32-bit index.
+ * 
+ * The data is packed into an array to optimize layout, and functions are used
+ * to access it.
+ *     index   lastLetters   genomeOffset  entrySize
+ *     32-bit  16-bits       32-bits       48-bits
+ *     64-bit  16-bits       64-bits       80-bits
  */
 {
-    boolean isMapped;                    /* is this a mapped file? */
-    int maxPat;                          /* Max # of times pattern can occur
+    boolean isMapped;                    /* is this a mapped file? */    int maxPat;                          /* Max # of times pattern can occur
                                           * before it is ignored. */
     int minMatch;                        /* Minimum number of tile hits needed
                                           * to trigger a clump hit. */
     int maxGap;                          /* Max gap between tiles in a clump. */
     int tileSize;			 /* Size of each N-mer. */
     int stepSize;			 /* Spacing between N-mers. */
     int tileSpaceSize;                   /* Number of N-mer values. */
     int tileMask;			 /* 1-s for each N-mer. */
     int sourceCount;			 /* Count of source files. */
     bool isPep;			 	 /* Is a peptide. */
     bool allowOneMismatch;		 /* Allow a single mismatch? */
     bool noSimpRepMask;			 /* Dis-Allow simple repeat masking. */
     int segSize;			 /* Index is segmented if non-zero. */
-    bits32 totalSeqSize;		 /* Total size of all sequences. */
+    gfOffset totalSeqSize;		 /* Total size of all sequences. */
     struct gfSeqSource *sources;         /* List of sequence sources. */
     bits32 *listSizes;                   /* Size of list for each N-mer */
     void *allocated;                     /* Storage space for all lists. */
-    bits32 **lists;                      /* A list for each N-mer. Used if
-                                          * isSegmented is false. */
-    bits16 **endLists;                   /* A more complex list for each N-mer.
-                                          * Used if isSegmented is true.
-					  * Values come in groups of threes.
-					  * The first is the packed last few
-					  * letters of the tile.  The next two
-					  * are the offset in the genome.  This
-					  * would be a struct but that would take
-					  * 8 bytes instead of 6, or nearly an
-					  * extra gigabyte of RAM. */
+    gfOffset **lists;                    /* A list for each N-mer. Used if
+                                          * segSize is zero. */
+    endListPart **endLists;              /* A more complex list for each N-mer.
+                                          * Used if segSize is non-zero. */
     };
 
 
 void genoFindFree(struct genoFind **pGenoFind);
 /* Free up a genoFind index. */
 
 struct gfSeqSource *gfFindNamedSource(struct genoFind *gf, char *name);
 /* Find target of given name.  Return NULL if none. */
 
 struct genoFindIndex
 /* container for genoFind indexes, sorting either an untranslated index on six translated indexes.
  * these can be created in memory or saved to a file to quickly mmap */
 {
     void *memMapped;     /* memory mapped if non-NULL, with amount allocated */
     size_t memLength;
@@ -442,19 +460,19 @@
    int minScore, struct gfOutput *out);
 /* Chop up query into pieces, align each in translated space, and stitch back
  * together again as nucleotides. */
 
 struct gfClump *gfPcrClumps(struct genoFind *gf, 
         char *fPrimer, int fPrimerSize, char *rPrimer, int rPrimerSize,
 	int minDistance, int maxDistance);
 /* Find possible PCR hits.  The fPrimer and rPrimer are on opposite strands.
  * Note: unlike clumps from other query functions, PCR clumps from this 
  * function have already had clump->target->start subtracted from 
  * clump->tStart and clump->tEnd so that the coords are relative to that 
  * target sequence (not the collection of all target sequences). */
 
 #define MAXSINGLEPIECESIZE 5000 /* maximum size of a single piece */
 
-#define gfVersion "38x1"	/* Current BLAT version number */
+#define gfVersion "39x1"	/* Current BLAT version number */
 
 #endif /* GENOFIND_H */