ef2b0974644900d17bdc4e0a2c8056e995832282 markd Thu Jan 16 11:14:03 2020 -0800 Revert "Initial pass at 64bit blat index" This reverts commit 85f6ef8de85a89781978add2ed98fb158718cd40. diff --git src/inc/genoFind.h src/inc/genoFind.h index 7b8a7f8..f74c209 100644 --- src/inc/genoFind.h +++ src/inc/genoFind.h @@ -29,110 +29,106 @@ #include "bits.h" #endif #ifndef AXT_H #include "axt.h" #endif enum gfConstants { gfMinMatch = 2, gfMaxGap = 2, gfTileSize = 11, gfMaxTileUse = 1024, gfPepMaxTileUse = 30000, }; -typedef bits64 gfOffset; /* offset/size of genome sequences */ - struct gfSeqSource /* Where a block of sequence comes from. */ { struct gfSeqSource *next; char *fileName; /* Name of file. */ bioSeq *seq; /* Sequences. Usually either this or fileName is NULL. */ - gfOffset start,end; /* Position within merged sequence. */ + bits32 start,end; /* Position within merged sequence. */ Bits *maskedBits; /* If non-null contains repeat-masking info. */ }; struct gfHit /* A genoFind hit. */ { struct gfHit *next; - gfOffset qStart; /* Where it hits in query. */ - gfOffset tStart; /* Where it hits in target. */ - gfOffset diagonal; /* tStart + qSize - qStart. */ + bits32 qStart; /* Where it hits in query. */ + bits32 tStart; /* Where it hits in target. */ + bits32 diagonal; /* tStart + qSize - qStart. */ }; /* gfHits are free'd with simple freeMem or slFreeList. */ struct gfClump /* A clump of hits. */ /* Note: for clumps from regular (blat) queries, tStart and tEnd include * target->start, but for clumps from gfPcrClumps(), tStart and tEnd have * already had target->start subtracted. So tStart and tEnd in PCR clumps * are relative to that target sequence (not the collection of all target * sequences). */ { struct gfClump *next; /* Next clump. */ - gfOffset qStart, qEnd; /* Position in query. */ + bits32 qStart, qEnd; /* Position in query. */ struct gfSeqSource *target; /* Target source sequence. */ - gfOffset tStart, tEnd; /* Position in target. */ + bits32 tStart, tEnd; /* Position in target. */ int hitCount; /* Number of hits. */ struct gfHit *hitList; /* List of hits. Not allocated here. */ int queryCoverage; /* Number of bases covered in query (thx AG!) */ }; void gfClumpFree(struct gfClump **pClump); /* Free a single clump. */ void gfClumpFreeList(struct gfClump **pList); /* Free a list of dynamically allocated gfClump's */ -struct endList -/* A more complex list for each N-mer. Used if isSegmented is true. */ -{ - int tileTail: 16; /* The first is the packed last few * letters of the tile */ - gfOffset offset: 48; /* offset in genome */ -}; - - - struct genoFind /* An index of all K-mers in the genome. */ { int maxPat; /* Max # of times pattern can occur * before it is ignored. */ int minMatch; /* Minimum number of tile hits needed * to trigger a clump hit. */ int maxGap; /* Max gap between tiles in a clump. */ int tileSize; /* Size of each N-mer. */ int stepSize; /* Spacing between N-mers. */ int tileSpaceSize; /* Number of N-mer values. */ int tileMask; /* 1-s for each N-mer. */ int sourceCount; /* Count of source files. */ struct gfSeqSource *sources; /* List of sequence sources. */ bool isPep; /* Is a peptide. */ bool allowOneMismatch; /* Allow a single mismatch? */ bool noSimpRepMask; /* Dis-Allow simple repeat masking. */ int segSize; /* Index is segmented if non-zero. */ - gfOffset totalSeqSize; /* Total size of all sequences. */ - gfOffset *listSizes; /* Size of list for each N-mer */ + bits32 totalSeqSize; /* Total size of all sequences. */ + bits32 *listSizes; /* Size of list for each N-mer */ void *allocated; /* Storage space for all lists. */ - gfOffset **lists; /* A list for each N-mer. Used if + bits32 **lists; /* A list for each N-mer. Used if * isSegmented is false. */ - struct endList **endLists; /* A more complex list for each N-mer. - * Used if isSegmented is true. */ + bits16 **endLists; /* A more complex list for each N-mer. + * Used if isSegmented is true. + * Values come in groups of threes. + * The first is the packed last few + * letters of the tile. The next two + * are the offset in the genome. This + * would be a struct but that would take + * 8 bytes instead of 6, or nearly an + * extra gigabyte of RAM. */ }; void genoFindFree(struct genoFind **pGenoFind); /* Free up a genoFind index. */ struct gfSeqSource *gfFindNamedSource(struct genoFind *gf, char *name); /* Find target of given name. Return NULL if none. */ /* --- Stuff for saving results ---- */ struct gfOutput /* A polymorphic object to help us write many file types. */ { struct gfOutput *next;