e70152e44cc66cc599ff6b699eb8adc07f3e656a kent Sat May 24 21:09:34 2014 -0700 Adding Copyright NNNN Regents of the University of California to all files I believe with reasonable certainty were developed under UCSC employ or as part of Genome Browser copyright assignment. diff --git src/hg/lib/rangeTreeFile.c src/hg/lib/rangeTreeFile.c index 02b07d8..36d0ff9 100644 --- src/hg/lib/rangeTreeFile.c +++ src/hg/lib/rangeTreeFile.c @@ -1,747 +1,750 @@ /* rangeTree - This module is a way of keeping track of * non-overlapping ranges (half-open intervals). It is * based on the self-balancing rbTree code. Use it in * place of a bitmap when the total number of ranges * is significantly smaller than the number of bits would * be. * Beware the several static/global variables which can be * changed by various function calls. */ +/* Copyright (C) 2014 The Regents of the University of California + * See README in this or parent directory for licensing information. */ + #include "common.h" #include "limits.h" #include "localmem.h" #include "rangeTree.h" #include "rangeTreeFile.h" struct range rangeReadOne(FILE *f, boolean isSwapped) /* Returns a single range from the file */ { struct range r = {NULL, 0, 0, NULL}; r.start = readBits32(f, isSwapped); r.end = r.start + readBits32(f, isSwapped); return r; } void rangeWriteOne(struct range *r, FILE *f) /* Write out one range structure to binary file f. * This only writes start and size. */ { bits32 start = r->start; bits32 size = r->end-r->start; writeOne(f, start); writeOne(f, size); } void rangeReadArray(FILE *f, struct rangeStartSize *r, int n, boolean isSwapped) /* Read 'n' elements of range array (start,size) from file 'f'. */ { mustRead(f, r, n*sizeof(*r)); if (isSwapped) { while (n) { r->start = byteSwap32(r->start); r->size = byteSwap32(r->size); --n; ++r; } } } void rangeWriteArray(struct rangeStartSize *r, int n, FILE *f) /* Write 'n' elements of range array (start,size) to file 'f'. */ { mustWrite(f, r, n*sizeof(*r)); } void rangeWriteArrayToBed(char *chrom, struct rangeStartSize *r, int n, boolean withId, boolean mergeAdjacent, FILE *f) /* Write 'n' elements of range array (start,size) to file 'f' in bed format. * If withId then adds an id in the name field in the format 'chrom.N' . * If mergeAdjacent then any ranges which are adjacent, and would otherwise appear * on multiple bed lines, are merged into a single bed line. */ { int i, id = 0; int start, end; if (n==0) return; start = r[0].start; end = r[0].start+r[0].size; for (i=1 ; i<n ; ++i) { if (end > r[i].start) errAbort("overlapping bed records in chrom %s (start,end=%d) (start=%d,end=%d)\n", chrom, end, r[i].start, r[i].start+r[i].size); if (mergeAdjacent && end == r[i].start) /* keep the previous start and dont write bed */ { end = r[i].start+r[i].size; } else /* print the previous range and update */ { if (withId) fprintf(f, "%s\t%d\t%d\t%s.%d\n", chrom, start, end, chrom, ++id); else fprintf(f, "%s\t%d\t%d\n", chrom, start, end); start = r[i].start; end = r[i].start+r[i].size; } } /* print the last item */ if (withId) fprintf(f, "%s\t%d\t%d\t%s.%d\n", chrom, start, end, chrom, ++id); else fprintf(f, "%s\t%d\t%d\n", chrom, start, end); } unsigned rangeArraySize(struct rangeStartSize *r, int n) /* calculate the total size of the array */ { unsigned size = 0; int i; for ( i = 0 ; i<n ; ++i ) { size += r[i].size; } return size; } void rangeReadWriteN(FILE *inF, int n, boolean isSwapped, FILE *outF) /* Read 'n' ranges in from file 'inF' and write them to file 'outF'. * Reads and writes ranges one at a time. */ { int i; struct range r; for (i=0 ; i<n ; ++i ) { r = rangeReadOne(inF, isSwapped); rangeWriteOne(&r, outF); } } struct range rangeReadOneWithVal(FILE *f, boolean isSwapped, void *(*valReadOne)(FILE *f, boolean isSwapped)) /* Read one range structure from binary file f, including range val. * Returns start, end. * valWriteOne should point to a function which writes the value (not called if null). * Returns val if valWriteOne is not null */ { struct range r = {NULL, 0, 0, NULL}; r.start = readBits32(f, isSwapped); r.end = r.start + readBits32(f, isSwapped); if (valReadOne) r.val = valReadOne(f, isSwapped); return r; } void rangeWriteOneWithVal(struct range *r, FILE *f, void (*valWriteOne)(void *val, FILE *f)) /* Write out one range structure to binary file f. * Writes start and size. * valWriteOne should point to a function which writes the value (not called if null). */ { rangeWriteOne(r, f); if (valWriteOne) valWriteOne(r->val, f); } /* Some file-globals to be used by rangeTreeWriteFile, rangeTreeWriteFileWithVal, rangeTreeSizeInFileWithVal */ static FILE *tempF = NULL; static void (*tempFuncWriteNodesWithVal)(void *val, FILE *f) = NULL; static int tempValSize = 0; static int (*tempFuncRangeValSize)(void *val) = NULL; static void rangeWriteFile(void *item) /* Utility function to pass file handle. See rangeTreeWriteOne() below. * Not thread-safe. */ { rangeWriteOne(item, tempF); } static void rangeWriteFileWithVal(void *item) /* Utility function to pass file handle and func ptr. See rangeTreeWriteOneWithVal() below. * Not thread-safe. */ { rangeWriteOneWithVal(item, tempF, tempFuncWriteNodesWithVal); } static void rangeValSize(void *val) /* Utility function to calculate binary file size of range val. * Not thread-safe. */ { tempValSize += tempFuncRangeValSize(val); } void rangeArrayToTree(struct rbTree *rt, struct rangeStartSize r[], int numNodes) /* Reads numNodes ranges from the range array 'r' and adds them to rangeTree rt. * Does not read range val. */ { int i; for (i = 0 ; i<numNodes ; ++i) { rangeTreeAdd(rt, r[i].start, r[i].start+r[i].size); } } void rangeTreeReadNodes(FILE *f, struct rbTree *rt, int numNodes, boolean isSwapped) /* Reads numNodes ranges from the file and adds them to rangeTree rt. * Does not read range val. */ { struct range r; int i; for (i = 0 ; i<numNodes ; ++i) { r = rangeReadOne(f, isSwapped); rangeTreeAdd(rt, r.start, r.end); } } void rangeTreeWriteNodes(struct rbTree *tree, FILE *f) /* Write out one rangeTree structure to binary file f. * Note this does not include the name, which is stored only in index. * Ranges are written in start sequence (depth-first tree traversal). * Writes start and size but not val. * Not thread-safe. */ { tempF = f; rbTreeTraverse(tree, rangeWriteFile); } static void *assignVal(void *existing, void *new) /* Local helper function to assign range val in case of reading rangeTree * with guaranteed non-overlapping ranges on disk (no merging of values would occur). * Simply assignes new value and ignores null existing value. * Produces error if existing value is non-null. */ { if (existing != NULL) errAbort("assignVal found existing val (%p) where NULL expected\n", existing); return new; } void rangeTreeReadNodesWithVal(FILE *f, struct rbTree *rt, int numNodes, boolean isSwapped, void *(*valReadOne)(FILE *f, boolean isSwapped)) /* Reads numNodes ranges from the file and adds them to rangeTree rt. * If rt contains no nodes, and since rangeTree was saved to disk * implying its ranges are already non-overlapping, it is safe * to use a mergeVal function which simply assigns the stored value * to the range since the existing val must always be null. * Produces an error if the rt contains nodes already. */ { if (rt->n > 0) errAbort("Range tree already contains %d ranges. Use rangeTreeReadOneWithValMerge().\n", rt->n); rangeTreeReadNodesWithValMerge(f, rt, numNodes, isSwapped, valReadOne, assignVal); } void rangeTreeReadNodesWithValMerge(FILE *f, struct rbTree *rt, int numNodes, boolean isSwapped, void *(*valReadOne)(FILE *f, boolean isSwapped), void *(*mergeVals)(void *existing, void *new) ) /* Reads numNodes ranges from the file and adds them to rangeTree rt. * Reads range values using valReadOne function, if function is non-null. * Input rangeTree rt could already have nodes, so mergeVals is called * to merge values read from disk to values in the tree. */ { struct range r; int i; for (i = 0 ; i<numNodes ; ++i) { r = rangeReadOneWithVal(f, isSwapped, valReadOne); rangeTreeAddVal(rt, r.start, r.end, r.val, mergeVals); } } void rangeTreeWriteNodesWithVal(struct rbTree *tree, FILE *f, void (*valWriteOne)(void *val, FILE *f)) /* Write out one rangeTree structure to binary file f. * Note this does not include the name, which is stored only in index. * Ranges are written in start sequence (depth-first tree traversal). * Writes start and size. * valWriteOne should be a function which writes the range val. Not called if null. * Not thread-safe. */ { tempF = f; tempFuncWriteNodesWithVal = valWriteOne; rbTreeTraverse(tree, rangeWriteFileWithVal); } int rangeTreeSizeInFile(struct rbTree *tree) /* Returns size of rangeTree written in binary file format. * Includes start and size. * Does not include val. */ { return sizeof(bits32)*2*tree->n; } int rangeTreeSizeInFileWithVal(struct rbTree *tree, int (*rangeValSizeInFile)(void *val)) /* Returns size of rangeTree written in binary file format. * Includes start, size, and val. * rangeValSizeInFile should refer to a function which calculates the size of the val * in a binary file. Not called if null. */ { tempValSize = 0; tempFuncRangeValSize = rangeValSizeInFile; rbTreeTraverse(tree, rangeValSize); return rangeTreeSizeInFile(tree) + tempValSize; } unsigned rangeArrayIntersection(struct rangeStartSize *r1, struct rangeStartSize *r2, int n1, int n2, struct rangeStartSize **pRange, boolean saveMem, struct bed **pBed, char *chrom, struct lm *lm, int *n) /* Create intersection of array of 'n1' ranges (start,size) in r1 with * 'n2' ranges (start,size) in r2. * If pRange is not null, the ranges are saved into an allocated array 'r' * and then returned in *pRange. * Note that the ranges in *pRange are as stored on disk (start,size), * not as in the rangeTree (start,end). * If saveMem is true, tries to shrink size of array r before returning it. * Free array with freez(&r). * If pBed is not null, the ranges are saved as a list of bed records * and returned in *pBed. The bed->chrom field is set to chrom. Memory * for bed records is allocated from lm. Free using lmCleanup. * The number of merged ranges is stored in n. * If no ranges, returns NULL in *pRange and *pBed. * Returns total size of ranges in intersection or zero if none. */ { int r1end, r2end, i = 0; unsigned totSize = 0; struct rangeStartSize *r = NULL; int rSize = max(n1,n2); /* max size needed is larger of two */ struct bed *bList = NULL; if (pBed && !lm) errAbort("specify localmem when requesting bed output"); if (pBed && !chrom) errAbort("specify chrom when requesting bed output"); if (pRange) { AllocArray(r, rSize); } /* loop around merging while we have ranges in both lists */ while ( n1 > 0 && n2 > 0) { verbose(2,"loop n1=%d n2=%d\n", n1, n2); r1end = r1->start+r1->size; r2end = r2->start+r2->size; /* check which should be merged or output */ if ( r1end <= r2->start ) /* r1 is upstream, get next one */ { --n1; ++r1; } else if (r2end <= r1->start) /* r2 is upstream, get next one */ { --n2; ++r2; } else /* Overlap: take the max(start) and min(end) as the intersection */ /* Overlap: keep the one with rightmost end, and extend its start position * to the left if need be. Note that the next element from this rangeTree * will not overlap. * Read the next element from the other rangeTree as it may overlap. * If they both end on the same base, output the largest of the two * and read in the next from each rangeTree. */ { if (r1end > r2end) /* 1 is rightmost, get highest start, fix size, and read next r2 */ { int start = max(r1->start,r2->start); int l = r2end-start; if(pBed) { struct bed *b; lmAllocVar(lm, b); b->chrom = chrom; b->chromStart = start; b->chromEnd = r2end; slAddHead(&bList, b); } if (r) { r[i].start = start; r[i].size = l; } totSize += l; ++i; --n2; ++r2; } else if (r2end > r1end) /* 2 is rightmost, set lowest start, fix size, and read next r1 */ { int start = max(r1->start,r2->start); int l = r1end-start; if(pBed) { struct bed *b; lmAllocVar(lm, b); b->chrom = chrom; b->chromStart = start; b->chromEnd = r1end; slAddHead(&bList, b); } if (r) { r[i].start = start; r[i].size = l; } totSize += l; ++i; --n1; ++r1; } else /* 1 and 2 both end on same base, write out smallest one, and read both */ { int start = max(r1->start,r2->start); int l = r1end-start; if(pBed) { struct bed *b; lmAllocVar(lm, b); b->chrom = chrom; b->chromStart = start; b->chromEnd = r1end; slAddHead(&bList, b); } if (r) { r[i].start = start; r[i].size = l; } totSize += l; ++i; --n1; ++r1; --n2; ++r2; } } } *n = i; if (r && *n == 0) /* if no results, free this memory */ freez(&r); else if (r && saveMem && *n < rSize/2) /* we seriously overestimated mem reqts so free it up */ { struct rangeStartSize *r0 = CloneArray(r, *n); freez(&r); r = r0; } if (pBed) { slReverse(&bList); *pBed = bList; } if (pRange) *pRange = r; return totSize; } unsigned rangeArrayUnion(struct rangeStartSize *r1, struct rangeStartSize *r2, int n1, int n2, struct rangeStartSize **pRange, boolean saveMem, struct bed **pBed, char *chrom, struct lm *lm, int *n) /* Create union of array of 'n1' ranges (start,size) in r1 with * 'n2' ranges (start,size) in r2. * If pRange is not null, the ranges are saved into an allocated array 'r' * and then returned in *pRange. * Note that the ranges in *pRange are as stored on disk (start,size), * not as in the rangeTree (start,end). * If saveMem is true, tries to shrink size of array r before returning it. * Free array with freez(&r). * If pBed is not null, the ranges are saved as a list of bed records * and returned in *pBed. The bed->chrom field is set to chrom. Memory * for bed records is allocated from lm. Free using lmCleanup. * The number of merged ranges is stored in n. * If no ranges, returns NULL in *pRange and *pBed. * Returns total size of ranges in intersection or zero if none. */ { int r1end, r2end, i = 0; unsigned totSize = 0; struct rangeStartSize *r = NULL; int rSize = n1 + n2; /* worst case these dont overlap at all */ struct bed *bList = NULL; if (pBed && !lm) errAbort("specify localmem when requesting bed output"); if (pBed && !chrom) errAbort("specify chrom when requesting bed output"); if (pRange) { AllocArray(r, rSize); } /* loop around merging while we have ranges in both lists */ while ( n1 > 0 && n2 > 0) { r1end = r1->start+r1->size; r2end = r2->start+r2->size; /* check which should be merged or output */ if ( r1end <= r2->start ) /* r1 is upstream, write it and get next one */ { if(pBed) { struct bed *b; lmAllocVar(lm, b); b->chrom = chrom; b->chromStart = r1->start; b->chromEnd = r1->start+r1->size; slAddHead(&bList, b); } if (r) { r[i].start = r1->start; r[i].size = r1->size; } totSize += r1->size; ++i; --n1; ++r1; } else if (r2end <= r1->start) /* r2 is upstream, write it and get next one */ { if(pBed) { struct bed *b; lmAllocVar(lm, b); b->chrom = chrom; b->chromStart = r2->start; b->chromEnd = r2->start+r2->size; slAddHead(&bList, b); } if (r) { r[i].start = r2->start; r[i].size = r2->size; } totSize += r2->size; ++i; --n2; ++r2; } else /* Overlap: keep the one with rightmost end, and extend its start position * to the left if need be. Note that the next element from this rangeTree * will not overlap. * Read the next element from the other rangeTree as it may overlap. * If they both end on the same base, output the largest of the two * and read in the next from each rangeTree. */ { if (r1end > r2end) /* 1 is rightmost, set lowest start, fix size, and read next r2 */ { if (r2->start < r1->start) { r1->start = r2->start; r1->size = r1end - r1->start; } --n2; ++r2; } else if (r2end > r1end) /* 2 is rightmost, set lowest start, fix size, and read next r1 */ { if (r1->start < r2->start) { r2->start = r1->start; r2->size = r2end - r2->start; } --n1; ++r1; } else /* 1 and 2 both end on same base, write out left-most one */ { if (r1->start <= r2->start) { if(pBed) { struct bed *b; lmAllocVar(lm, b); b->chrom = chrom; b->chromStart = r1->start; b->chromEnd = r1->start+r1->size; slAddHead(&bList, b); } if (r) { r[i].start = r1->start; r[i].size = r1->size; } totSize += r1->size; } else { if(pBed) { struct bed *b; lmAllocVar(lm, b); b->chrom = chrom; b->chromStart = r2->start; b->chromEnd = r2->start+r2->size; slAddHead(&bList, b); } if (r) { r[i].start = r2->start; r[i].size = r2->size; } totSize += r2->size; } ++i; --n1; ++r1; --n2; ++r2; } } } /* write out any remaining nodes that are either in n1 or n2 */ while (n1) { if(pBed) { struct bed *b; lmAllocVar(lm, b); b->chrom = chrom; b->chromStart = r1->start; b->chromEnd = r1->start+r1->size; slAddHead(&bList, b); } if (r) { r[i].start = r1->start; r[i].size = r1->size; } totSize += r1->size; ++i; --n1; ++r1; } while (n2) { if(pBed) { struct bed *b; lmAllocVar(lm, b); b->chrom = chrom; b->chromStart = r2->start; b->chromEnd = r2->start+r2->size; slAddHead(&bList, b); } if (r) { r[i].start = r2->start; r[i].size = r2->size; } totSize += r2->size; ++i; --n2; ++r2; } if (i == 0) /* if no results, free this memory */ freez(&r); else if (saveMem && i < rSize/2) /* we seriously overestimated mem reqts so free it up */ { struct rangeStartSize *r0 = CloneArray(r, i); freez(&r); r = r0; } *n = i; if (pBed) { slReverse(&bList); *pBed = bList; } if (pRange) *pRange = r; return totSize; } unsigned rangeArrayUnionToFile(struct rangeStartSize *r1, struct rangeStartSize *r2, int n1, int n2, FILE *of, int *n) /* Create union of array of 'n1' ranges (start,size) in r1 with * 'n2' ranges in r2, writing them to output file 'of' and returning * the number of merged ranges written in 'n'. * Note that the ranges are as stored on disk (start,size) * not as in the rangeTree (start,end). * Writes the ranges one-by-one. * Returns total size of ranges in merged file. */ { int r1end, r2end; unsigned size = 0; *n = 0; /* loop around merging while we have ranges in both lists */ while ( n1 > 0 && n2 > 0) { r1end = r1->start+r1->size; r2end = r2->start+r2->size; /* check which should be merged or output */ if ( r1end <= r2->start ) /* r1 is upstream, write it and get next one */ { if (of) mustWrite(of, r1, sizeof(*r1)); size += r1->size; ++(*n); --n1; ++r1; } else if (r2end <= r1->start) /* r2 is upstream, write it and get next one */ { if (of) mustWrite(of, r2, sizeof(*r2)); size += r2->size; ++(*n); --n2; ++r2; } else /* Overlap: keep the one with rightmost end, and extend its start position * to the left if need be. Note that the next element from this rangeTree * will not overlap. * Read the next element from the other rangeTree as it may overlap. * If they both end on the same base, output the largest of the two * and read in the next from each rangeTree. */ { if (r1end > r2end) /* 1 is rightmost, set lowest start, fix size, and read next r2 */ { if (r2->start < r1->start) { r1->start = r2->start; r1->size = r1end - r1->start; } --n2; ++r2; } else if (r2end > r1end) /* 2 is rightmost, set lowest start, fix size, and read next r1 */ { if (r1->start < r2->start) { r2->start = r1->start; r2->size = r2end - r2->start; } --n1; ++r1; } else /* 1 and 2 both end on same base, write out left-most one */ { if (r1->start <= r2->start) { if (of) mustWrite(of, r1, sizeof(*r1)); size += r1->size; } else { if (of) mustWrite(of, r2, sizeof(*r2)); size += r2->size; } ++(*n); --n1; ++r1; --n2; ++r2; } } } /* write out any remaining nodes that are either in n1 or n2 */ while (n1) { if (of) mustWrite(of, r1, sizeof(*r1)); size += r1->size; ++(*n); --n1; ++r1; } while (n2) { if (of) mustWrite(of, r2, sizeof(*r2)); size += r2->size; ++(*n); --n2; ++r2; } return size; }