54406b80d5d435970989acff7b22dd6146c6b411 braney Sat Jan 22 15:24:59 2022 -0800 adding chrom alias support to big files diff --git src/lib/bbiRead.c src/lib/bbiRead.c index f517a34..2c7fadc 100644 --- src/lib/bbiRead.c +++ src/lib/bbiRead.c @@ -65,41 +65,42 @@ if (isSwapped) { magic = byteSwap32(magic); if (magic != sig) return FALSE; } else { if (magic != sig) return FALSE; } return TRUE; } -struct bbiFile *bbiFileOpen(char *fileName, bits32 sig, char *typeName) -/* Open up big wig or big bed file. */ +struct bbiFile *bbiFileOpenAlias(char *fileName, bits32 sig, char *typeName, struct hash *aliasHash) +/* Open up big wig or big bed file using a chrom alias hash if non-NULL */ { /* This code needs to agree with code in two other places currently - bigBedFileCreate, * and bigWigFileCreate. I'm thinking of refactoring to share at least between * bigBedFileCreate and bigWigFileCreate. It'd be great so it could be structured * so that it could send the input in one chromosome at a time, and send in the zoom * stuff only after all the chromosomes are done. This'd potentially reduce the memory * footprint by a factor of 2 or 4. Still, for now it works. -JK */ struct bbiFile *bbi; AllocVar(bbi); +bbi->aliasHash = aliasHash; bbi->fileName = cloneString(fileName); struct udcFile *udc = bbi->udc = udcFileOpen(fileName, udcDefaultDir()); /* Read magic number at head of file and use it to see if we are proper file type, and * see if we are byte-swapped. */ bits32 magic; boolean isSwapped = FALSE; udcMustRead(udc, &magic, sizeof(magic)); if (magic != sig) { magic = byteSwap32(magic); isSwapped = TRUE; if (magic != sig) errAbort("%s is not a %s file", fileName, typeName); } @@ -138,72 +139,110 @@ if (bbi->extensionOffset != 0) { udcSeek(udc, bbi->extensionOffset); bbi->extensionSize = udcReadBits16(udc, isSwapped); bbi->extraIndexCount = udcReadBits16(udc, isSwapped); bbi->extraIndexListOffset = udcReadBits64(udc, isSwapped); } /* Attach B+ tree of chromosome names and ids. */ udcSeek(udc, bbi->chromTreeOffset); bbi->chromBpt = bptFileAttach(fileName, udc); return bbi; } +struct bbiFile *bbiFileOpen(char *fileName, bits32 sig, char *typeName) +/* Open up big wig or big bed file. */ +{ +return bbiFileOpenAlias(fileName, sig, typeName, NULL); +} + void bbiFileClose(struct bbiFile **pBwf) /* Close down a big wig/big bed file. */ { struct bbiFile *bwf = *pBwf; if (bwf != NULL) { cirTreeFileDetach(&bwf->unzoomedCir); slFreeList(&bwf->levelList); slFreeList(&bwf->levelList); bptFileDetach(&bwf->chromBpt); udcFileClose(&bwf->udc); freeMem(bwf->fileName); freez(pBwf); } } static void chromIdSizeHandleSwapped(boolean isSwapped, struct bbiChromIdSize *idSize) /* Swap bytes in chromosome Id and Size as needed. */ { if (isSwapped) { idSize->chromId = byteSwap32(idSize->chromId); idSize->chromSize = byteSwap32(idSize->chromSize); } } -struct fileOffsetSize *bbiOverlappingBlocks(struct bbiFile *bbi, struct cirTreeFile *ctf, - char *chrom, bits32 start, bits32 end, bits32 *retChromId) -/* Fetch list of file blocks that contain items overlapping chromosome range. */ +static struct bbiChromIdSize *getChromIdSize(struct bbiFile *bbi, char *chrom) +/* Return idSize for given chrom, using chrom alias if available. */ +{ +struct bbiChromIdSize *idSize; +AllocVar(idSize); + +// first look for the given chrom name +if (!bptFileFind(bbi->chromBpt, chrom, strlen(chrom), idSize, sizeof(idSize))) { -struct bbiChromIdSize idSize; -if (!bptFileFind(bbi->chromBpt, chrom, strlen(chrom), &idSize, sizeof(idSize))) + if (bbi->aliasHash) + { + // didn't find chrom name, but have an alias hash. Try the aliases + struct hashEl *hel = hashLookup(bbi->aliasHash, chrom); + + while(hel) + { + char *alias = hel->val; + if (bptFileFind(bbi->chromBpt, alias, strlen(alias), idSize, sizeof(idSize))) + break; + + hel = hashLookupNext(hel); + } + } + else { // if chrom is not found and the chrom starts with "chr", try without "chr" - if (!startsWith("chr", chrom) || !bptFileFind(bbi->chromBpt, &chrom[3], strlen(chrom) - 3, &idSize, sizeof(idSize))) + if (!startsWith("chr", chrom) || !bptFileFind(bbi->chromBpt, &chrom[3], strlen(chrom) - 3, idSize, sizeof(idSize))) return NULL; } -chromIdSizeHandleSwapped(bbi->isSwapped, &idSize); + } +chromIdSizeHandleSwapped(bbi->isSwapped, idSize); + +return idSize; +} + +struct fileOffsetSize *bbiOverlappingBlocks(struct bbiFile *bbi, struct cirTreeFile *ctf, + char *chrom, bits32 start, bits32 end, bits32 *retChromId) +/* Fetch list of file blocks that contain items overlapping chromosome range. */ +{ +struct bbiChromIdSize *idSize = getChromIdSize(bbi, chrom); + +if (idSize == NULL) + return NULL; + if (retChromId != NULL) - *retChromId = idSize.chromId; -return cirTreeFindOverlappingBlocks(ctf, idSize.chromId, start, end); + *retChromId = idSize->chromId; +return cirTreeFindOverlappingBlocks(ctf, idSize->chromId, start, end); } struct chromNameCallbackContext /* Some stuff that the bPlusTree traverser needs for context. */ { struct bbiChromInfo *list; /* The list we are building. */ boolean isSwapped; /* Need to byte-swap things? */ }; static void chromNameCallback(void *context, void *key, int keySize, void *val, int valSize) /* Callback that captures chromInfo from bPlusTree. */ { struct chromNameCallbackContext *c = context; struct bbiChromInfo *info; struct bbiChromIdSize *idSize = val; @@ -218,35 +257,36 @@ struct bbiChromInfo *bbiChromList(struct bbiFile *bbi) /* Return list of chromosomes. */ { struct chromNameCallbackContext context; context.list = NULL; context.isSwapped = bbi->isSwapped; bptFileTraverse(bbi->chromBpt, &context, chromNameCallback); slReverse(&context.list); return context.list; } bits32 bbiChromSize(struct bbiFile *bbi, char *chrom) /* Return chromosome size, or 0 if no such chromosome in file. */ { -struct bbiChromIdSize idSize; -if (!bptFileFind(bbi->chromBpt, chrom, strlen(chrom), &idSize, sizeof(idSize))) +struct bbiChromIdSize *idSize = getChromIdSize(bbi, chrom); + +if (idSize == NULL) return 0; -chromIdSizeHandleSwapped(bbi->isSwapped, &idSize); -return idSize.chromSize; + +return idSize->chromSize; } void bbiChromInfoFree(struct bbiChromInfo **pInfo) /* Free up one chromInfo */ { struct bbiChromInfo *info = *pInfo; if (info != NULL) { freeMem(info->name); freez(pInfo); } } void bbiChromInfoFreeList(struct bbiChromInfo **pList) /* Free a list of dynamically allocated bbiChromInfo's */ @@ -491,35 +531,36 @@ maxVal = sum->maxVal; if (minVal > sum->minVal) minVal = sum->minVal; } } if (countFactor > 0) validCount = normalizeCount(el, countFactor, minVal, maxVal, sumData, sumSquares); } return validCount; } static int bbiChromId(struct bbiFile *bbi, char *chrom) /* Return chromosome Id */ { -struct bbiChromIdSize idSize; -if (!bptFileFind(bbi->chromBpt, chrom, strlen(chrom), &idSize, sizeof(idSize))) +struct bbiChromIdSize *idSize = getChromIdSize(bbi, chrom); + +if (idSize == NULL) return -1; -chromIdSizeHandleSwapped(bbi->isSwapped, &idSize); -return idSize.chromId; + +return idSize->chromId; } static boolean bbiSummaryArrayFromZoom(struct bbiZoomLevel *zoom, struct bbiFile *bbi, char *chrom, bits32 start, bits32 end, int summarySize, struct bbiSummaryElement *summary) /* Look up region in index and get data at given zoom level. Summarize this data * in the summary array. */ { boolean result = FALSE; int chromId = bbiChromId(bbi, chrom); if (chromId < 0) return FALSE; struct bbiSummary *sum, *sumList = bbiSummariesInRegion(zoom, bbi, chromId, start, end); if (sumList != NULL) {