bd1404d9126eb1575ed20098204038364ea90308 kent Tue Mar 5 10:04:23 2013 -0800 Big bed extra indexes seem to work now. Code still needs a little polish and testing. diff --git src/lib/bigBed.c src/lib/bigBed.c index 1108797..1bf3609 100644 --- src/lib/bigBed.c +++ src/lib/bigBed.c @@ -190,45 +190,30 @@ summarySize, summary); } boolean bigBedSummaryArray(struct bbiFile *bbi, char *chrom, bits32 start, bits32 end, enum bbiSummaryType summaryType, int summarySize, double *summaryValues) /* Fill in summaryValues with data from indicated chromosome range in bigBed file. * Be sure to initialize summaryValues to a default value, which will not be touched * for regions without data in file. (Generally you want the default value to either * be 0.0 or nan("") depending on the application.) Returns FALSE if no data * at that position. */ { return bbiSummaryArray(bbi, chrom, start, end, bigBedCoverageIntervals, summaryType, summarySize, summaryValues); } -#ifdef OLD -void bigBedAttachNameIndex(struct bbiFile *bbi) -/* Attach name index part of bbiFile to bbi */ -{ -if (bbi->nameBpt == NULL) - { - if (bbi->nameIndexOffset == 0) - errAbort("%s has no name index", bbi->fileName); - udcSeek(bbi->udc, bbi->nameIndexOffset); - bbi->nameBpt = bptFileAttach(bbi->fileName, bbi->udc); - } -uglyAbort("bigBedAttachNameIndex() - no can do"); -} -#endif /* OLD */ - struct offsetSize /* Simple file offset and file size. */ { bits64 offset; bits64 size; }; static int cmpOffsetSizeRef(const void *va, const void *vb) /* Compare to sort slRef pointing to offsetSize. Sort is kind of hokey, * but guarantees all items that are the same will be next to each other * at least, which is all we care about. */ { const struct slRef *a = *((struct slRef **)va); const struct slRef *b = *((struct slRef **)vb); return memcmp(a->val, b->val, sizeof(struct offsetSize)); @@ -294,62 +279,83 @@ { char *name = names[nameIx]; struct slRef *oneList = bptFileFindMultiple(index, name, strlen(name), sizeof(struct offsetSize)); blockList = slCat(oneList, blockList); } /* Create nonredundant list of blocks. */ struct fileOffsetSize *fosList = fosFromRedundantBlockList(&blockList, bbi->isSwapped); /* Clean up and resturn result. */ slRefFreeListAndVals(&blockList); return fosList; } -typedef boolean (*BbFirstWordMatch)(char *line, void *target); +typedef boolean (*BbFirstWordMatch)(char *line, int fieldIx, void *target); /* A function that returns TRUE if first word in tab-separated line matches target. */ +static void extractField(char *line, int fieldIx, char **retField, int *retFieldSize) +/* Go through tab separated line and figure out start and size of given field. */ +{ +int i; +fieldIx -= 3; /* Skip over chrom/start/end, which are not in line. */ +for (i=0; iisSwapped; for (fos = fosList; fos != NULL; fos = fos->next) { /* Read in raw data */ udcSeek(bbi->udc, fos->offset); char *rawData = needLargeMem(fos->size); udcRead(bbi->udc, rawData, fos->size); /* Optionally uncompress data, and set data pointer to uncompressed version. */ char *uncompressedData = NULL; @@ -373,80 +379,81 @@ /* Read next record into local variables. */ while (blockPt < blockEnd) { bits32 chromIx = memReadBits32(&blockPt, isSwapped); bits32 s = memReadBits32(&blockPt, isSwapped); bits32 e = memReadBits32(&blockPt, isSwapped); int c; dyStringClear(dy); while ((c = *blockPt++) >= 0) { if (c == 0) break; dyStringAppendC(dy, c); } - if ((*matcher)(dy->string, target)) + if ((*matcher)(dy->string, fieldIx, target)) { lmAllocVar(lm, interval); interval->start = s; interval->end = e; interval->rest = cloneString(dy->string); interval->chromId = chromIx; slAddHead(&intervalList, interval); } } /* Clean up temporary buffers. */ dyStringFree(&dy); freez(&uncompressedData); freez(&rawData); } slReverse(&intervalList); return intervalList; } struct bigBedInterval *bigBedNameQuery(struct bbiFile *bbi, struct bptFile *index, - char *name, struct lm *lm) + int fieldIx, char *name, struct lm *lm) /* Return list of intervals matching file. These intervals will be allocated out of lm. */ { struct fileOffsetSize *fosList = bigBedChunksMatchingName(bbi, index, name); struct bigBedInterval *intervalList = bigBedIntervalsMatchingName(bbi, fosList, - bbFirstWordMatchesName, name, lm); + bbWordMatchesName, fieldIx, name, lm); slFreeList(&fosList); return intervalList; } struct bigBedInterval *bigBedMultiNameQuery(struct bbiFile *bbi, struct bptFile *index, - char **names, int nameCount, struct lm *lm) -/* Fetch all records matching any of the names. Return list is allocated out of lm. */ + int fieldIx, char **names, int nameCount, struct lm *lm) +/* Fetch all records matching any of the names. Using given index on given field. + * Return list is allocated out of lm. */ { /* Set up name index and get list of chunks that match any of our names. */ struct fileOffsetSize *fosList = bigBedChunksMatchingNames(bbi, index, names, nameCount); /* Create hash of all names. */ struct hash *hash = newHash(0); int nameIx; for (nameIx=0; nameIx < nameCount; ++nameIx) hashAdd(hash, names[nameIx], NULL); /* Get intervals where name matches hash target. */ struct bigBedInterval *intervalList = bigBedIntervalsMatchingName(bbi, fosList, - bbFirstWordIsInHash, hash, lm); + bbWordIsInHash, fieldIx, hash, lm); /* Clean up and return results. */ slFreeList(&fosList); hashFree(&hash); return intervalList; } void bigBedIntervalListToBedFile(struct bbiFile *bbi, struct bigBedInterval *intervalList, FILE *f) /* Write out big bed interval list to bed file, looking up chromosome. */ { char chromName[bbi->chromBpt->keySize+1]; int lastChromId = -1; struct bigBedInterval *interval; for (interval = intervalList; interval != NULL; interval = interval->next) { @@ -475,50 +482,44 @@ } char *bigBedAutoSqlText(struct bbiFile *bbi) /* Get autoSql text if any associated with file. Do a freeMem of this when done. */ { if (bbi->asOffset == 0) return NULL; struct udcFile *f = bbi->udc; udcSeek(f, bbi->asOffset); return udcReadStringAndZero(f); } struct asObject *bigBedAs(struct bbiFile *bbi) /* Get autoSql object definition if any associated with file. */ { -struct asObject *as = bbi->cachedAs; -if (as != NULL) - return as; if (bbi->asOffset == 0) return NULL; char *asText = bigBedAutoSqlText(bbi); -bbi->cachedAs = as = asParseText(asText); +struct asObject *as = asParseText(asText); freeMem(asText); return as; } struct asObject *bigBedAsOrDefault(struct bbiFile *bbi) // Get asObject associated with bigBed - if none exists in file make it up from field counts. { -struct asObject *as = bbi->cachedAs; -if (as != NULL) - return as; -as = bigBedAs(bbi); +struct asObject *as = bigBedAs(bbi); if (as == NULL) - bbi->cachedAs = as = asParseText(bedAsDef(bbi->definedFieldCount, bbi->fieldCount)); + as = asParseText(bedAsDef(bbi->definedFieldCount, bbi->fieldCount)); return as; } struct asObject *bigBedFileAsObjOrDefault(char *fileName) // Get asObject associated with bigBed file, or the default. { struct bbiFile *bbi = bigBedFileOpen(fileName); if (bbi) { struct asObject *as = bigBedAsOrDefault(bbi); bbiFileClose(&bbi); return as; } return NULL; } @@ -571,43 +572,48 @@ struct asObject *as = bigBedAsOrDefault(bbi); /* Make list of field names out of list of field numbers */ struct slName *nameList = NULL; for (intEl = intList; intEl != NULL; intEl = intEl->next) { struct asColumn *col = slElementFromIx(as->columnList, intEl->val); if (col == NULL) { warn("Inconsistent bigBed file %s", bbi->fileName); internalErr(); } slNameAddHead(&nameList, col->name); } +asObjectFree(&as); return nameList; } -struct bptFile *bigBedOpenExtraIndex(struct bbiFile *bbi, char *fieldName) -/* Return index associated with fieldName. Aborts if no such index. */ +struct bptFile *bigBedOpenExtraIndex(struct bbiFile *bbi, char *fieldName, int *retFieldIx) +/* Return index associated with fieldName. Aborts if no such index. Optionally return + * index in a row of this field. */ { struct udcFile *udc = bbi->udc; boolean isSwapped = bbi->isSwapped; struct asObject *as = bigBedAsOrDefault(bbi); struct asColumn *col = asColumnFind(as, fieldName); if (col == NULL) errAbort("No field %s in %s", fieldName, bbi->fileName); int colIx = slIxFromElement(as->columnList, col); +if (retFieldIx != NULL) + *retFieldIx = colIx; +asObjectFree(&as); /* See if we have any extra indexes, and if so seek to there. */ bits64 offset = bbi->extraIndexListOffset; if (offset == 0) errAbort("%s has no indexes", bbi->fileName); udcSeek(udc, offset); /* Go through each extra index and see if it's a match */ int i; for (i=0; iextraIndexCount; ++i) { bits16 type = udcReadBits16(udc, isSwapped); bits16 fieldCount = udcReadBits16(udc, isSwapped); bits64 fileOffset = udcReadBits64(udc, isSwapped); udcSeekCur(udc, 4); // skip over reserved bits