3ff9f3d461ea3fc6d42923658c9ea6bf319cbd7b kent Mon Mar 4 21:36:27 2013 -0800 Starting move from only one extra index (on the name field) being allowed to allowing multiple extra indexes. Also defining a header extension block for the file since the header is running out of space. diff --git src/lib/bbiWrite.c src/lib/bbiWrite.c index 811d116..aa9730e 100644 --- src/lib/bbiWrite.c +++ src/lib/bbiWrite.c @@ -120,62 +120,84 @@ } void bbiChromUsageFreeList(struct bbiChromUsage **pList) /* free a list of bbiChromUsage structures */ { struct bbiChromUsage *el, *next; for (el = *pList; el != NULL; el = next) { next = el->next; bbiChromUsageFree(&el); } *pList = NULL; } +int bbExIndexMakerMaxIndexField(struct bbExIndexMaker *eim) +/* Return the maximum field we have to index. */ +{ +int maxIx = 0; +int i; +for (i=0; i<eim->indexCount; ++i) + { + int ix = eim->indexFields[i]; + if (ix > maxIx) + maxIx = ix; + } +return maxIx; +} + +void bbExIndexMakerUpdateMaxFieldSize(struct bbExIndexMaker *eim, char **row) +/* Fold in information about row into bbExIndexMaker into eim->maxFieldSize */ +{ +int i; +for (i=0; i<eim->indexCount; ++i) + { + int rowIx = eim->indexFields[i]; + int size = strlen(row[rowIx]); + if (size > eim->maxFieldSize[i]) + eim->maxFieldSize[i] = size; + } +} + struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, struct hash *chromSizesHash, - int *retMinDiff, double *retAveSize, bits64 *retBedCount, int *retMaxNameSize) -/* Go through bed file and collect chromosomes and statistics. */ + struct bbExIndexMaker *eim, int *retMinDiff, double *retAveSize, bits64 *retBedCount) +/* Go through bed file and collect chromosomes and statistics. If eim parameter is non-NULL + * collect max field sizes there too. */ { -char *row[4]; -int maxRowSize = (retMaxNameSize != NULL ? 4 : 3); +int maxRowSize = (eim == NULL ? 3 : bbExIndexMakerMaxIndexField(eim) + 1); +char *row[maxRowSize]; struct hash *uniqHash = hashNew(0); struct bbiChromUsage *usage = NULL, *usageList = NULL; int lastStart = -1; bits32 id = 0; bits64 totalBases = 0, bedCount = 0; int minDiff = BIGNUM; -int maxNameSize = 0; lineFileRemoveInitialCustomTrackLines(lf); for (;;) { int rowSize = lineFileChopNext(lf, row, maxRowSize); if (rowSize == 0) break; lineFileExpectAtLeast(lf, maxRowSize, rowSize); char *chrom = row[0]; int start = lineFileNeedNum(lf, row, 1); int end = lineFileNeedNum(lf, row, 2); - if (rowSize > 3) - { - char *name = row[3]; - int nameSize = strlen(name); - if (nameSize > maxNameSize) - maxNameSize = nameSize; - } + if (eim != NULL) + bbExIndexMakerUpdateMaxFieldSize(eim, row); if (start > end) { errAbort("end (%d) before start (%d) line %d of %s", end, start, lf->lineIx, lf->fileName); } ++bedCount; totalBases += (end - start); if (usage == NULL || differentString(usage->name, chrom)) { if (hashLookup(uniqHash, chrom)) { errAbort("%s is not sorted at line %d. Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.", lf->fileName, lf->lineIx); } hashAdd(uniqHash, chrom, NULL); @@ -198,32 +220,30 @@ int diff = start - lastStart; if (diff < minDiff) { if (diff < 0) errAbort("%s is not sorted at line %d. Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.", lf->fileName, lf->lineIx); minDiff = diff; } } lastStart = start; } slReverse(&usageList); *retMinDiff = minDiff; *retAveSize = (double)totalBases/bedCount; *retBedCount = bedCount; -if (retMaxNameSize != NULL) - *retMaxNameSize = maxNameSize; freeHash(&uniqHash); return usageList; } int bbiCountSectionsNeeded(struct bbiChromUsage *usageList, int itemsPerSlot) /* Count up number of sections needed for data. */ { struct bbiChromUsage *usage; int count = 0; for (usage = usageList; usage != NULL; usage = usage->next) { int countOne = (usage->itemCount + itemsPerSlot - 1)/itemsPerSlot; count += countOne; verbose(2, "%s %d, %d blocks of %d\n", usage->name, usage->itemCount, countOne, itemsPerSlot); }