0ede364a2dcb452681933d5a3579b4f05d90a245 markd Sun Jul 5 15:16:27 2020 -0700 fixed bug where totalSeqSize was not stored in index, now hgBlat works diff --git src/jkOwnLib/genoFind.c src/jkOwnLib/genoFind.c index f475290..a2e40d1 100644 --- src/jkOwnLib/genoFind.c +++ src/jkOwnLib/genoFind.c @@ -44,31 +44,31 @@ signal(SIGPIPE, gfPipeHandler); } int gfReadMulti(int sd, void *vBuf, size_t size) /* Read in until all is read or there is an error. */ { char *buf = vBuf; size_t totalRead = 0; int oneRead; while (totalRead < size) { oneRead = read(sd, buf + totalRead, size - totalRead); if (oneRead < 0) { - perror("Couldn't finish large read"); + errAbort("Couldn't finish large read"); return oneRead; } else if (oneRead == 0) /* Avoid an infinite loop when the client closed the socket. */ break; totalRead += oneRead; } return totalRead; } void genoFindFree(struct genoFind **pGenoFind) /* Free up a genoFind index. */ { struct genoFind *gf = *pGenoFind; @@ -105,73 +105,80 @@ struct genoFindFileHdr /* header for genoFind section in file */ { int maxPat; int minMatch; int maxGap; int tileSize; int stepSize; int tileSpaceSize; int tileMask; int sourceCount; bool isPep; bool allowOneMismatch; bool noSimpRepMask; int segSize; + int totalSeqSize; off_t sourcesOff; // offset of sequences sources off_t listSizesOff; // offset of listSizes off_t listsOff; // offset of lists or endLists off_t endListsOff; + + // Reserved area. This is really padding, as all structures are accessed via + // offsets, so fields can be added without decreasing padding as long as the + // reserved is not consumed. bits64 reserved[32]; // vesion 1.0: 32 }; static void genoFindInitHdr(struct genoFind *gf, struct genoFindFileHdr *hdr) /* fill in the file header struct from in-memory struct */ { zeroBytes(hdr, sizeof(struct genoFindFileHdr)); hdr->maxPat = gf->maxPat; hdr->minMatch = gf->minMatch; hdr->maxGap = gf->maxGap; hdr->tileSize = gf->tileSize; hdr->stepSize = gf->stepSize; hdr->tileSpaceSize = gf->tileSpaceSize; hdr->tileMask = gf->tileMask; hdr->sourceCount = gf->sourceCount; hdr->isPep = gf->isPep; hdr->allowOneMismatch = gf->allowOneMismatch; hdr->noSimpRepMask = gf->noSimpRepMask; hdr->segSize = gf->segSize; +hdr->totalSeqSize = gf->totalSeqSize; } static void genoFindReadHdr(struct genoFindFileHdr *hdr, struct genoFind *gf) /* fill in the in-memory struct from file header struct */ { gf->maxPat = hdr->maxPat; gf->minMatch = hdr->minMatch; gf->maxGap = hdr->maxGap; gf->tileSize = hdr->tileSize; gf->stepSize = hdr->stepSize; gf->tileSpaceSize = hdr->tileSpaceSize; gf->tileMask = hdr->tileMask; gf->sourceCount = hdr->sourceCount; gf->isPep = hdr->isPep; gf->allowOneMismatch = hdr->allowOneMismatch; gf->noSimpRepMask = hdr->noSimpRepMask; gf->segSize = hdr->segSize; +gf->totalSeqSize = hdr->totalSeqSize; } static void genoFindWriteSource(struct gfSeqSource *ss, FILE *f) /* write a gfSeqSource object */ { if ((ss->seq != NULL) || (ss->maskedBits != NULL)) errAbort("can't write index contained sequences, must used external sequence file"); char *fileName = ss->fileName; if (fileName != NULL) { // don't include directories char *s = strrchr(fileName, '/'); if (s != NULL) fileName = s + 1; @@ -359,30 +366,34 @@ maxGap, tileSize, repMatch, oocFile, allowOneMismatch, stepSize, noSimpRepMask); return gfIdx; } struct genoFindIndexFileHdr /* header for genoFind section in file */ { char magic[32]; char version[32]; bits32 indexAddressSize; // 32 or 64 bit, compile time. boolean isTrans; // offsets to data, only one is filed in based on being translated or note off_t untransOff; off_t transOff[2][3]; + + // Reserved area. This is really padding, as all structures are accessed via + // offsets, so fields can be added without decreasing padding as long as the + // reserved is not consumed. bits64 reserved[32]; // vesion 1.0: 32 }; static void genoFindIndexInitHeader(struct genoFindIndex *gfIdx, struct genoFindIndexFileHdr* hdr) /* fill in the file header struct from in-memory struct */ { zeroBytes(hdr, sizeof(struct genoFindIndexFileHdr)); safecpy(hdr->magic, sizeof(hdr->magic), indexFileMagic); safecpy(hdr->version, sizeof(hdr->version), indexFileVerison); hdr->indexAddressSize = 32; hdr->isTrans = gfIdx->isTrans; } static void genoFindIndexReadHeader(void* memMapped, struct genoFindIndexFileHdr* hdr, @@ -1756,30 +1767,31 @@ int totalHits = 0, usedHits = 0, clumpCount = 0; int tileSize = gf->tileSize; int bucketShift = 16; /* 64k buckets. */ bits32 bucketSize = (1<<bucketShift); int bucketCount = (gf->totalSeqSize >> bucketShift) + 1; int nearEnough = (gf->isPep ? gfNearEnough/3 : gfNearEnough); bits32 boundary = bucketSize - nearEnough; int i; struct gfHit **buckets = NULL, **pb; /* Sort hit list into buckets. */ AllocArray(buckets, bucketCount); for (hit = hitList; hit != NULL; hit = nextHit) { nextHit = hit->next; + assert((hit->tStart >> bucketShift) < bucketCount); pb = buckets + (hit->tStart >> bucketShift); slAddHead(pb, hit); } /* Sort each bucket on diagonal and clump. */ for (i=0; i<bucketCount; ++i) { int clumpSize; bits32 maxT; struct gfHit *clumpHits; pb = buckets + i; gfHitSortDiagonal(pb); for (hit = *pb; hit != NULL; ) { /* Each time through this loop will get info on a clump. Will only