b31907d700c1fe956e4e4c20e64d91de027d7c84 markd Tue May 14 02:03:33 2024 -0700 merge blatHuge implementation diff --git src/inc/genoFind.h src/inc/genoFind.h index 81ec2ad..ac0bd79 100644 --- src/inc/genoFind.h +++ src/inc/genoFind.h @@ -1,17 +1,18 @@ /* genoFind.h - Interface to modules for fast finding of sequence - * matches. */ + * matches. Compile with -DGFSERVER_HUGE defined to get 64-bit indexes. + */ /* Copyright 2001-2002 Jim Kent. All rights reserved. */ #ifndef GENOFIND_H #define GENOFIND_H #ifndef DNASEQ_H #include "dnaseq.h" #endif #ifndef FUZZYFIND_H #include "fuzzyFind.h" #endif #ifndef HASH_H #include "hash.h" @@ -52,110 +53,127 @@ char *hostName; // need when reconnecting int port; boolean isDynamic; // is this a dynamic server? char *genome; // genome name for dynamic server char *genomeDataDir; // genome data directory for dynamic server }; enum gfConstants { gfMinMatch = 2, gfMaxGap = 2, gfTileSize = 11, gfMaxTileUse = 1024, gfPepMaxTileUse = 30000, }; +#ifdef GFSERVER_HUGE +typedef bits64 gfOffset; /* offset/size of genome sequences */ +#define GFINDEX_BITS 64 +#define GFOFFSET_FMT "%lld" +#else +typedef bits32 gfOffset; /* offset/size of genome sequences */ +#define GFINDEX_BITS 32 +#define GFOFFSET_FMT "%d" +#endif + struct gfSeqSource /* Where a block of sequence comes from. */ { struct gfSeqSource *next; char *fileName; /* Name of file. */ bioSeq *seq; /* Sequences. Usually either this or fileName is NULL. */ - bits32 start,end; /* Position within merged sequence. */ + gfOffset start,end; /* Position within merged sequence. */ Bits *maskedBits; /* If non-null contains repeat-masking info. */ }; struct gfHit /* A genoFind hit. */ { struct gfHit *next; - bits32 qStart; /* Where it hits in query. */ - bits32 tStart; /* Where it hits in target. */ - bits32 diagonal; /* tStart + qSize - qStart. */ + gfOffset qStart; /* Where it hits in query. */ + gfOffset tStart; /* Where it hits in target. */ + gfOffset diagonal; /* tStart + qSize - qStart. 
*/ }; /* gfHits are free'd with simple freeMem or slFreeList. */ struct gfClump /* A clump of hits. */ /* Note: for clumps from regular (blat) queries, tStart and tEnd include * target->start, but for clumps from gfPcrClumps(), tStart and tEnd have * already had target->start subtracted. So tStart and tEnd in PCR clumps * are relative to that target sequence (not the collection of all target * sequences). */ { struct gfClump *next; /* Next clump. */ - bits32 qStart, qEnd; /* Position in query. */ + gfOffset qStart, qEnd; /* Position in query. */ struct gfSeqSource *target; /* Target source sequence. */ - bits32 tStart, tEnd; /* Position in target. */ + gfOffset tStart, tEnd; /* Position in target. */ int hitCount; /* Number of hits. */ struct gfHit *hitList; /* List of hits. Not allocated here. */ int queryCoverage; /* Number of bases covered in query (thx AG!) */ }; void gfClumpFree(struct gfClump **pClump); /* Free a single clump. */ void gfClumpFreeList(struct gfClump **pList); /* Free a list of dynamically allocated gfClump's */ +typedef bits16 endListPart; // endList structure (below) is packed into 3 or 5 16-bit values + + struct genoFind /* An index of all K-mers in the genome. * WARNING: MUST MODIFY CODE TO STORE/LOAD INDEX TO FILES IF THIS STRUCTURE IS * MODIFIED!!! + * + * The endList structure in the index is a more complex list for each N-mer. + * The width of each endList row is in listSizes. Each entry begins with the + * packed last few letters of the tile, followed by the offset in the genome. This + * would be a struct but that would take 8 bytes instead of 6, or nearly an + * extra gigabyte of RAM for the 32-bit index. + * + * The data is packed into an array to optimize space, and functions are used + * to access it. The layout of each entry is: + * index lastLetters genomeOffset entrySize + * 32-bit 16-bits 32-bits 48-bits + * 64-bit 16-bits 64-bits 80-bits */ { - boolean isMapped; /* is this a mapped file? 
*/ - int maxPat; /* Max # of times pattern can occur + boolean isMapped; /* is this a mapped file? */ int maxPat; /* Max # of times pattern can occur * before it is ignored. */ int minMatch; /* Minimum number of tile hits needed * to trigger a clump hit. */ int maxGap; /* Max gap between tiles in a clump. */ int tileSize; /* Size of each N-mer. */ int stepSize; /* Spacing between N-mers. */ int tileSpaceSize; /* Number of N-mer values. */ int tileMask; /* 1-s for each N-mer. */ int sourceCount; /* Count of source files. */ bool isPep; /* Is a peptide. */ bool allowOneMismatch; /* Allow a single mismatch? */ bool noSimpRepMask; /* Dis-Allow simple repeat masking. */ int segSize; /* Index is segmented if non-zero. */ - bits32 totalSeqSize; /* Total size of all sequences. */ + gfOffset totalSeqSize; /* Total size of all sequences. */ struct gfSeqSource *sources; /* List of sequence sources. */ bits32 *listSizes; /* Size of list for each N-mer */ void *allocated; /* Storage space for all lists. */ - bits32 **lists; /* A list for each N-mer. Used if - * isSegmented is false. */ - bits16 **endLists; /* A more complex list for each N-mer. - * Used if isSegmented is true. - * Values come in groups of threes. - * The first is the packed last few - * letters of the tile. The next two - * are the offset in the genome. This - * would be a struct but that would take - * 8 bytes instead of 6, or nearly an - * extra gigabyte of RAM. */ + gfOffset **lists; /* A list for each N-mer. Used if + * segSize is zero. */ + endListPart **endLists; /* A more complex list for each N-mer. + * Used if segSize is non-zero. */ }; void genoFindFree(struct genoFind **pGenoFind); /* Free up a genoFind index. */ struct gfSeqSource *gfFindNamedSource(struct genoFind *gf, char *name); /* Find target of given name. Return NULL if none. */ struct genoFindIndex /* container for genoFind indexes, sorting either an untranslated index on six translated indexes. 
* these can be created in memory or saved to a file to quickly mmap */ { void *memMapped; /* memory mapped if non-NULL, with amount allocated */ size_t memLength; @@ -442,19 +460,19 @@ int minScore, struct gfOutput *out); /* Chop up query into pieces, align each in translated space, and stitch back * together again as nucleotides. */ struct gfClump *gfPcrClumps(struct genoFind *gf, char *fPrimer, int fPrimerSize, char *rPrimer, int rPrimerSize, int minDistance, int maxDistance); /* Find possible PCR hits. The fPrimer and rPrimer are on opposite strands. * Note: unlike clumps from other query functions, PCR clumps from this * function have already had clump->target->start subtracted from * clump->tStart and clump->tEnd so that the coords are relative to that * target sequence (not the collection of all target sequences). */ #define MAXSINGLEPIECESIZE 5000 /* maximum size of a single piece */ -#define gfVersion "38x1" /* Current BLAT version number */ +#define gfVersion "39x1" /* Current BLAT version number */ #endif /* GENOFIND_H */