4898794edd81be5285ea6e544acbedeaeb31bf78 max Tue Nov 23 08:10:57 2021 -0800 Fixing pointers to README file for license in all source code files. refs #27614 diff --git src/lib/bigBed.c src/lib/bigBed.c index b09326a..276e274 100644 --- src/lib/bigBed.c +++ src/lib/bigBed.c @@ -1,710 +1,710 @@ /* bigBed - interface to binary file with bed-style values (that is a bunch of * possibly overlapping regions. */ /* Copyright (C) 2013 The Regents of the University of California - * See README in this or parent directory for licensing information. */ + * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "hash.h" #include "linefile.h" #include "obscure.h" #include "dystring.h" #include "rangeTree.h" #include "cirTree.h" #include "bPlusTree.h" #include "basicBed.h" #include "asParse.h" #include "zlibFace.h" #include "sig.h" #include "udc.h" #include "bbiFile.h" #include "bigBed.h" struct bbiFile *bigBedFileOpen(char *fileName) /* Open up big bed file. */ { return bbiFileOpen(fileName, bigBedSig, "big bed"); } boolean bigBedFileCheckSigs(char *fileName) /* check file signatures at beginning and end of file */ { return bbiFileCheckSigs(fileName, bigBedSig, "big bed"); } struct bigBedInterval *bigBedIntervalQuery(struct bbiFile *bbi, char *chrom, bits32 start, bits32 end, int maxItems, struct lm *lm) /* Get data for interval. Return list allocated out of lm. Set maxItems to maximum * number of items to return, or to 0 for all items. */ { struct bigBedInterval *el, *list = NULL; int itemCount = 0; bbiAttachUnzoomedCir(bbi); // Find blocks with padded start and end to make sure we include zero-length insertions: bits32 paddedStart = (start > 0) ? start-1 : start; bits32 paddedEnd = end+1; bits32 chromId; struct fileOffsetSize *blockList = bbiOverlappingBlocks(bbi, bbi->unzoomedCir, chrom, paddedStart, paddedEnd, &chromId); struct fileOffsetSize *block, *beforeGap, *afterGap; struct udcFile *udc = bbi->udc; boolean isSwapped = bbi->isSwapped; /* Set up for uncompression optionally. */ char *uncompressBuf = NULL; if (bbi->uncompressBufSize > 0) uncompressBuf = needLargeMem(bbi->uncompressBufSize); char *mergedBuf = NULL; for (block = blockList; block != NULL; ) { /* Find contigious blocks and read them into mergedBuf. */ fileOffsetSizeFindGap(block, &beforeGap, &afterGap); bits64 mergedOffset = block->offset; bits64 mergedSize = beforeGap->offset + beforeGap->size - mergedOffset; udcSeek(udc, mergedOffset); mergedBuf = needLargeMem(mergedSize); udcMustRead(udc, mergedBuf, mergedSize); char *blockBuf = mergedBuf; /* Loop through individual blocks within merged section. */ for (;block != afterGap; block = block->next) { /* Uncompress if necessary. */ char *blockPt, *blockEnd; if (uncompressBuf) { blockPt = uncompressBuf; int uncSize = zUncompress(blockBuf, block->size, uncompressBuf, bbi->uncompressBufSize); blockEnd = blockPt + uncSize; } else { blockPt = blockBuf; blockEnd = blockPt + block->size; } while (blockPt < blockEnd) { /* Read next record into local variables. */ bits32 chr = memReadBits32(&blockPt, isSwapped); bits32 s = memReadBits32(&blockPt, isSwapped); bits32 e = memReadBits32(&blockPt, isSwapped); /* calculate length of rest of bed fields */ int restLen = strlen(blockPt); /* If we're actually in range then copy it into a new element and add to list. */ if (chr == chromId && ((s < end && e > start) // Make sure to include zero-length insertion elements at start or end: || (s == e && (s == end || e == start)))) { ++itemCount; if (maxItems > 0 && itemCount > maxItems) break; lmAllocVar(lm, el); el->start = s; el->end = e; if (restLen > 0) el->rest = lmCloneStringZ(lm, blockPt, restLen); el->chromId = chromId; slAddHead(&list, el); } // move blockPt pointer to end of previous bed blockPt += restLen + 1; } if (maxItems > 0 && itemCount > maxItems) break; blockBuf += block->size; } if (maxItems > 0 && itemCount > maxItems) break; freez(&mergedBuf); } freez(&mergedBuf); freeMem(uncompressBuf); slFreeList(&blockList); slReverse(&list); return list; } int bigBedIntervalToRow(struct bigBedInterval *interval, char *chrom, char *startBuf, char *endBuf, char **row, int rowSize) /* Convert bigBedInterval into an array of chars equivalent to what you'd get by * parsing the bed file. The startBuf and endBuf are used to hold the ascii representation of * start and end. Note that the interval->rest string will have zeroes inserted as a side effect. */ { int fieldCount = 3; sprintf(startBuf, "%u", interval->start); sprintf(endBuf, "%u", interval->end); row[0] = chrom; row[1] = startBuf; row[2] = endBuf; if (!isEmpty(interval->rest)) { int wordCount = chopByChar(cloneString(interval->rest), '\t', row+3, rowSize-3); fieldCount += wordCount; } return fieldCount; } static struct bbiInterval *bigBedCoverageIntervals(struct bbiFile *bbi, char *chrom, bits32 start, bits32 end, struct lm *lm) /* Return intervals where the val is the depth of coverage. */ { /* Get list of overlapping intervals */ struct bigBedInterval *bi, *biList = bigBedIntervalQuery(bbi, chrom, start, end, 0, lm); if (biList == NULL) return NULL; /* Make a range tree that collects coverage. */ struct rbTree *rangeTree = rangeTreeNew(); for (bi = biList; bi != NULL; bi = bi->next) rangeTreeAddToCoverageDepth(rangeTree, bi->start, bi->end); struct range *range, *rangeList = rangeTreeList(rangeTree); /* Convert rangeList to bbiInterval list. */ struct bbiInterval *bwi, *bwiList = NULL; for (range = rangeList; range != NULL; range = range->next) { lmAllocVar(lm, bwi); bwi->start = range->start; if (bwi->start < start) bwi->start = start; bwi->end = range->end; if (bwi->end > end) bwi->end = end; bwi->val = ptToInt(range->val); slAddHead(&bwiList, bwi); } slReverse(&bwiList); /* Clean up and go home. */ rangeTreeFree(&rangeTree); return bwiList; } boolean bigBedSummaryArrayExtended(struct bbiFile *bbi, char *chrom, bits32 start, bits32 end, int summarySize, struct bbiSummaryElement *summary) /* Get extended summary information for summarySize evenly spaced elements into * the summary array. */ { return bbiSummaryArrayExtended(bbi, chrom, start, end, bigBedCoverageIntervals, summarySize, summary); } boolean bigBedSummaryArray(struct bbiFile *bbi, char *chrom, bits32 start, bits32 end, enum bbiSummaryType summaryType, int summarySize, double *summaryValues) /* Fill in summaryValues with data from indicated chromosome range in bigBed file. * Be sure to initialize summaryValues to a default value, which will not be touched * for regions without data in file. (Generally you want the default value to either * be 0.0 or nan("") depending on the application.) Returns FALSE if no data * at that position. */ { return bbiSummaryArray(bbi, chrom, start, end, bigBedCoverageIntervals, summaryType, summarySize, summaryValues); } struct offsetSize /* Simple file offset and file size. */ { bits64 offset; bits64 size; }; static int cmpOffsetSizeRef(const void *va, const void *vb) /* Compare to sort slRef pointing to offsetSize. Sort is kind of hokey, * but guarantees all items that are the same will be next to each other * at least, which is all we care about. */ { const struct slRef *a = *((struct slRef **)va); const struct slRef *b = *((struct slRef **)vb); return memcmp(a->val, b->val, sizeof(struct offsetSize)); } static struct fileOffsetSize *fosFromRedundantBlockList(struct slRef **pBlockList, boolean isSwapped) /* Convert from list of references to offsetSize format to list of fileOffsetSize * format, while removing redundancy. Sorts *pBlockList as a side effect. */ { /* Sort input so it it easy to uniquify. */ slSort(pBlockList, cmpOffsetSizeRef); struct slRef *blockList = *pBlockList; /* Make new fileOffsetSize for each unique offsetSize. */ struct fileOffsetSize *fosList = NULL, *fos; struct offsetSize lastOffsetSize = {0,0}; struct slRef *blockRef; for (blockRef = blockList; blockRef != NULL; blockRef = blockRef->next) { if (memcmp(&lastOffsetSize, blockRef->val, sizeof(lastOffsetSize)) != 0) { memcpy(&lastOffsetSize, blockRef->val, sizeof(lastOffsetSize)); AllocVar(fos); if (isSwapped) { fos->offset = byteSwap64(lastOffsetSize.offset); fos->size = byteSwap64(lastOffsetSize.size); } else { fos->offset = lastOffsetSize.offset; fos->size = lastOffsetSize.size; } slAddHead(&fosList, fos); } } slReverse(&fosList); return fosList; } static struct fileOffsetSize *bigBedChunksMatchingName(struct bbiFile *bbi, struct bptFile *index, char *name) /* Get list of file chunks that match name. Can slFreeList this when done. */ { struct slRef *blockList = bptFileFindMultiple(index, name, strlen(name), sizeof(struct offsetSize)); struct fileOffsetSize *fosList = fosFromRedundantBlockList(&blockList, bbi->isSwapped); slRefFreeListAndVals(&blockList); return fosList; } static struct fileOffsetSize *bigBedChunksMatchingNames(struct bbiFile *bbi, struct bptFile *index, char **names, int nameCount) /* Get list of file chunks that match any of the names. Can slFreeList this when done. */ { /* Go through all names and make a blockList that includes all blocks with any hit to any name. * Many of these blocks will occur multiple times. */ struct slRef *blockList = NULL; int nameIx; for (nameIx = 0; nameIx < nameCount; ++nameIx) { char *name = names[nameIx]; struct slRef *oneList = bptFileFindMultiple(index, name, strlen(name), sizeof(struct offsetSize)); blockList = slCat(oneList, blockList); } /* Create nonredundant list of blocks. */ struct fileOffsetSize *fosList = fosFromRedundantBlockList(&blockList, bbi->isSwapped); /* Clean up and resturn result. */ slRefFreeListAndVals(&blockList); return fosList; } typedef boolean (*BbFirstWordMatch)(char *line, int fieldIx, void *target); /* A function that returns TRUE if first word in tab-separated line matches target. */ static void extractField(char *line, int fieldIx, char **retField, int *retFieldSize) /* Go through tab separated line and figure out start and size of given field. */ { int i; fieldIx -= 3; /* Skip over chrom/start/end, which are not in line. */ for (i=0; i<fieldIx; ++i) { line = strchr(line, '\t'); if (line == NULL) { warn("Not enough fields in extractField of %s", line); internalErr(); } line += 1; } char *end = strchr(line, '\t'); if (end == NULL) end = line + strlen(line); *retField = line; *retFieldSize = end - line; } static boolean bbWordMatchesName(char *line, int fieldIx, void *target) /* Return true if first word of line is same as target, which is just a string. */ { char *name = target; int fieldSize; char *field; extractField(line, fieldIx, &field, &fieldSize); return strlen(name) == fieldSize && memcmp(name, field, fieldSize) == 0; } static boolean bbWordIsInHash(char *line, int fieldIx, void *target) /* Return true if first word of line is same as target, which is just a string. */ { int fieldSize; char *field; extractField(line, fieldIx, &field, &fieldSize); char fieldString[fieldSize+1]; memcpy(fieldString, field, fieldSize); fieldString[fieldSize] = 0; /* Return boolean value that reflects whether we found it in hash */ struct hash *hash = target; return hashLookup(hash, fieldString) != NULL; } static struct bigBedInterval *bigBedIntervalsMatchingName(struct bbiFile *bbi, struct fileOffsetSize *fosList, BbFirstWordMatch matcher, int fieldIx, void *target, struct lm *lm) /* Return list of intervals inside of sectors of bbiFile defined by fosList where the name * matches target somehow. */ { struct bigBedInterval *interval, *intervalList = NULL; struct fileOffsetSize *fos; boolean isSwapped = bbi->isSwapped; for (fos = fosList; fos != NULL; fos = fos->next) { /* Read in raw data */ udcSeek(bbi->udc, fos->offset); char *rawData = needLargeMem(fos->size); udcRead(bbi->udc, rawData, fos->size); /* Optionally uncompress data, and set data pointer to uncompressed version. */ char *uncompressedData = NULL; char *data = NULL; int dataSize = 0; if (bbi->uncompressBufSize > 0) { data = uncompressedData = needLargeMem(bbi->uncompressBufSize); dataSize = zUncompress(rawData, fos->size, uncompressedData, bbi->uncompressBufSize); } else { data = rawData; dataSize = fos->size; } /* Set up for "memRead" routines to more or less treat memory block like file */ char *blockPt = data, *blockEnd = data + dataSize; struct dyString *dy = dyStringNew(32); // Keep bits outside of chrom/start/end here /* Read next record into local variables. */ while (blockPt < blockEnd) { bits32 chromIx = memReadBits32(&blockPt, isSwapped); bits32 s = memReadBits32(&blockPt, isSwapped); bits32 e = memReadBits32(&blockPt, isSwapped); int c; dyStringClear(dy); // TODO - can simplify this probably just to for (;;) {if ((c = *blockPt++) == 0) ... while ((c = *blockPt++) >= 0) { if (c == 0) break; dyStringAppendC(dy, c); } if ((*matcher)(dy->string, fieldIx, target)) { lmAllocVar(lm, interval); interval->start = s; interval->end = e; interval->rest = cloneString(dy->string); interval->chromId = chromIx; slAddHead(&intervalList, interval); } } /* Clean up temporary buffers. */ dyStringFree(&dy); freez(&uncompressedData); freez(&rawData); } slReverse(&intervalList); return intervalList; } struct bigBedInterval *bigBedNameQuery(struct bbiFile *bbi, struct bptFile *index, int fieldIx, char *name, struct lm *lm) /* Return list of intervals matching file. These intervals will be allocated out of lm. */ { struct fileOffsetSize *fosList = bigBedChunksMatchingName(bbi, index, name); struct bigBedInterval *intervalList = bigBedIntervalsMatchingName(bbi, fosList, bbWordMatchesName, fieldIx, name, lm); slFreeList(&fosList); return intervalList; } struct bigBedInterval *bigBedMultiNameQuery(struct bbiFile *bbi, struct bptFile *index, int fieldIx, char **names, int nameCount, struct lm *lm) /* Fetch all records matching any of the names. Using given index on given field. * Return list is allocated out of lm. */ { /* Set up name index and get list of chunks that match any of our names. */ struct fileOffsetSize *fosList = bigBedChunksMatchingNames(bbi, index, names, nameCount); /* Create hash of all names. */ struct hash *hash = newHash(0); int nameIx; for (nameIx=0; nameIx < nameCount; ++nameIx) hashAdd(hash, names[nameIx], NULL); /* Get intervals where name matches hash target. */ struct bigBedInterval *intervalList = bigBedIntervalsMatchingName(bbi, fosList, bbWordIsInHash, fieldIx, hash, lm); /* Clean up and return results. */ slFreeList(&fosList); hashFree(&hash); return intervalList; } void bigBedIntervalListToBedFile(struct bbiFile *bbi, struct bigBedInterval *intervalList, FILE *f) /* Write out big bed interval list to bed file, looking up chromosome. */ { char chromName[bbi->chromBpt->keySize+1]; int lastChromId = -1; struct bigBedInterval *interval; for (interval = intervalList; interval != NULL; interval = interval->next) { bbiCachedChromLookup(bbi, interval->chromId, lastChromId, chromName, sizeof(chromName)); lastChromId = interval->chromId; fprintf(f, "%s\t%u\t%u\t%s\n", chromName, interval->start, interval->end, interval->rest); } } int bigBedIntervalToRowLookupChrom(struct bigBedInterval *interval, struct bigBedInterval *prevInterval, struct bbiFile *bbi, char *chromBuf, int chromBufSize, char *startBuf, char *endBuf, char **row, int rowSize) /* Convert bigBedInterval to array of chars equivalend to what you'd get by parsing the * bed file. If you already know what chromosome the interval is on use the simpler * bigBedIntervalToRow. This one will look up the chromosome based on the chromId field * of the interval, which is relatively time consuming. To avoid doing this unnecessarily * pass in a non-NULL prevInterval, and if the chromId is the same on prevInterval as this, * it will avoid the lookup. The chromBufSize should be at greater or equal to * bbi->chromBpt->keySize+1. The startBuf and endBuf are used to hold the ascii representation of * start and end, and should be 16 bytes. Note that the interval->rest string will have zeroes * inserted as a side effect. Returns number of fields in row. */ { int lastChromId = (prevInterval == NULL ? -1 : prevInterval->chromId); bbiCachedChromLookup(bbi, interval->chromId, lastChromId, chromBuf, chromBufSize); return bigBedIntervalToRow(interval, chromBuf, startBuf, endBuf, row, rowSize); } char *bigBedAutoSqlText(struct bbiFile *bbi) /* Get autoSql text if any associated with file. Do a freeMem of this when done. */ { if (bbi->asOffset == 0) return NULL; struct udcFile *f = bbi->udc; udcSeek(f, bbi->asOffset); return udcReadStringAndZero(f); } struct asObject *bigBedAs(struct bbiFile *bbi) /* Get autoSql object definition if any associated with file. */ { if (bbi->asOffset == 0) return NULL; char *asText = bigBedAutoSqlText(bbi); struct asObject *as = asParseText(asText); freeMem(asText); return as; } struct asObject *bigBedAsOrDefault(struct bbiFile *bbi) // Get asObject associated with bigBed - if none exists in file make it up from field counts. { struct asObject *as = bigBedAs(bbi); if (as == NULL) as = asParseText(bedAsDef(bbi->definedFieldCount, bbi->fieldCount)); return as; } struct asObject *bigBedFileAsObjOrDefault(char *fileName) // Get asObject associated with bigBed file, or the default. { struct bbiFile *bbi = bigBedFileOpen(fileName); if (bbi) { struct asObject *as = bigBedAsOrDefault(bbi); bbiFileClose(&bbi); return as; } return NULL; } struct slName *bbFieldNames(struct bbiFile *bbi) /* Get list of fields in bigBed */ { if (!bbi) return NULL; struct asObject *as = bigBedAs(bbi); if (!as) return NULL; struct asColumn *col; struct slName *colNames = NULL; for (col = as->columnList; col; col = col->next) slNameAddHead(&colNames, col->name); asObjectFree(&as); slReverse(&colNames); return colNames; } int bbFieldIndex(struct bbiFile *bbi, char* fieldName) /* return the index of a given field */ { if (fieldName==NULL) return -1; struct asObject *as = bigBedAsOrDefault(bbi); if (as == NULL) return -1; // search for field name, return index if found struct asColumn *col = as->columnList; int ix = 0; for (;col != NULL;col=col->next, ix+=1) if (sameString(col->name, fieldName)) return ix; return -1; } int bbExtraFieldIndex(struct bbiFile *bbi, char* fieldName) /* return the index of a given extra field */ { if (fieldName==NULL) return 0; struct asObject *as = bigBedAsOrDefault(bbi); if (as == NULL) return 0; // search for field name, return index if found struct asColumn *col = as->columnList; int ix = 0; for (;col != NULL;col=col->next, ix+=1) if (sameString(col->name, fieldName)) return max(ix-3, 0); // never return a negative value return 0; } bits64 bigBedItemCount(struct bbiFile *bbi) /* Return total items in file. */ { udcSeek(bbi->udc, bbi->unzoomedDataOffset); return udcReadBits64(bbi->udc, bbi->isSwapped); } struct slName *bigBedListExtraIndexes(struct bbiFile *bbi) /* Return list of names of extra indexes beyond primary chrom:start-end one" */ { struct udcFile *udc = bbi->udc; boolean isSwapped = bbi->isSwapped; /* See if we have any extra indexes, and if so seek to there. */ bits64 offset = bbi->extraIndexListOffset; if (offset == 0) return NULL; udcSeek(udc, offset); /* Construct list of field that are being indexed. List is list of * field numbers within asObj. */ int i; struct slInt *intList = NULL, *intEl; for (i=0; i<bbi->extraIndexCount; ++i) { bits16 fieldCount; udcReadBits16(udc, isSwapped); // type fieldCount = udcReadBits16(udc, isSwapped); udcSeekCur(udc, sizeof(bits64)); // skip over fileOffset udcSeekCur(udc, 4); // skip over reserved bits if (fieldCount == 1) { bits16 fieldId = udcReadBits16(udc, isSwapped); udcSeekCur(udc, 2); // skip over reserved bits intEl = slIntNew(fieldId); slAddHead(&intList, intEl); } else { warn("Not yet understanding indexes on multiple fields at once."); internalErr(); } } /* Now have to make an asObject to find out name that corresponds to this field. */ struct asObject *as = bigBedAsOrDefault(bbi); /* Make list of field names out of list of field numbers */ struct slName *nameList = NULL; for (intEl = intList; intEl != NULL; intEl = intEl->next) { struct asColumn *col = slElementFromIx(as->columnList, intEl->val); if (col == NULL) { warn("Inconsistent bigBed file %s", bbi->fileName); internalErr(); } slNameAddHead(&nameList, col->name); } asObjectFree(&as); return nameList; } struct bptFile *bigBedOpenExtraIndex(struct bbiFile *bbi, char *fieldName, int *retFieldIx) /* Return index associated with fieldName. Aborts if no such index. Optionally return * index in a row of this field. */ { struct udcFile *udc = bbi->udc; boolean isSwapped = bbi->isSwapped; struct asObject *as = bigBedAsOrDefault(bbi); struct asColumn *col = asColumnFind(as, fieldName); if (col == NULL) errAbort("No field %s in %s", fieldName, bbi->fileName); int colIx = slIxFromElement(as->columnList, col); if (retFieldIx != NULL) *retFieldIx = colIx; asObjectFree(&as); /* See if we have any extra indexes, and if so seek to there. */ bits64 offset = bbi->extraIndexListOffset; if (offset == 0) errAbort("%s has no indexes", bbi->fileName); udcSeek(udc, offset); /* Go through each extra index and see if it's a match */ int i; for (i=0; i<bbi->extraIndexCount; ++i) { bits16 type = udcReadBits16(udc, isSwapped); bits16 fieldCount = udcReadBits16(udc, isSwapped); bits64 fileOffset = udcReadBits64(udc, isSwapped); udcSeekCur(udc, 4); // skip over reserved bits if (type != 0) { warn("Don't understand type %d", type); internalErr(); } if (fieldCount == 1) { bits16 fieldId = udcReadBits16(udc, isSwapped); udcSeekCur(udc, 2); // skip over reserved bits if (fieldId == colIx) { udcSeek(udc, fileOffset); struct bptFile *bpt = bptFileAttach(bbi->fileName, udc); return bpt; } } else { warn("Not yet understanding indexes on multiple fields at once."); internalErr(); } } errAbort("%s is not indexed in %s", fieldName, bbi->fileName); return NULL; }