916f0b7ad3bfd70ae91ecdb4c30ccfc67094a015 braney Sun May 24 14:06:04 2015 -0700 make sure that chroms are sorted in a case-sensitive manner in bedToBigBed because otherwise the indexes don't work #15400 diff --git src/lib/bbiWrite.c src/lib/bbiWrite.c index e36f591..2f27699 100644 --- src/lib/bbiWrite.c +++ src/lib/bbiWrite.c @@ -163,65 +163,64 @@ { int rowIx = eim->indexFields[i]; int size = strlen(row[rowIx]); if (size > eim->maxFieldSize[i]) eim->maxFieldSize[i] = size; } } struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, struct hash *chromSizesHash, struct bbExIndexMaker *eim, int *retMinDiff, double *retAveSize, bits64 *retBedCount) /* Go through bed file and collect chromosomes and statistics. If eim parameter is non-NULL * collect max field sizes there too. */ { int maxRowSize = (eim == NULL ? 3 : bbExIndexMakerMaxIndexField(eim) + 1); char *row[maxRowSize]; -struct hash *uniqHash = hashNew(0); struct bbiChromUsage *usage = NULL, *usageList = NULL; int lastStart = -1; bits32 id = 0; bits64 totalBases = 0, bedCount = 0; int minDiff = BIGNUM; lineFileRemoveInitialCustomTrackLines(lf); for (;;) { int rowSize = lineFileChopNext(lf, row, maxRowSize); if (rowSize == 0) break; lineFileExpectAtLeast(lf, maxRowSize, rowSize); char *chrom = row[0]; int start = lineFileNeedNum(lf, row, 1); int end = lineFileNeedNum(lf, row, 2); if (eim != NULL) bbExIndexMakerUpdateMaxFieldSize(eim, row); if (start > end) { errAbort("end (%d) before start (%d) line %d of %s", end, start, lf->lineIx, lf->fileName); } ++bedCount; totalBases += (end - start); if (usage == NULL || differentString(usage->name, chrom)) { - if (hashLookup(uniqHash, chrom)) + /* make sure chrom names are sorted in ASCII order */ + if ((usage != NULL) && strcmp(usage->name, chrom) > 0) { - errAbort("%s is not sorted at line %d. Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.", + errAbort("%s is not case-sensitive sorted at line %d. Please use \"sort -k1,1 -k2,2n\" with LC_COLLATE=C, or bedSort and try again.", lf->fileName, lf->lineIx); } - hashAdd(uniqHash, chrom, NULL); struct hashEl *chromHashEl = hashLookup(chromSizesHash, chrom); if (chromHashEl == NULL) errAbort("%s is not found in chromosome sizes file", chrom); int chromSize = ptToInt(chromHashEl->val); AllocVar(usage); usage->name = cloneString(chrom); usage->id = id++; usage->size = chromSize; slAddHead(&usageList, usage); lastStart = -1; } if (end > usage->size) errAbort("End coordinate %d bigger than %s size of %d line %d of %s", end, usage->name, usage->size, lf->lineIx, lf->fileName); usage->itemCount += 1; if (lastStart >= 0) @@ -232,31 +231,30 @@ if (diff < 0) errAbort("%s is not sorted at line %d. Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.", lf->fileName, lf->lineIx); minDiff = diff; } } lastStart = start; } slReverse(&usageList); double aveSize = 0; if (bedCount > 0) aveSize = (double)totalBases/bedCount; *retMinDiff = minDiff; *retAveSize = aveSize; *retBedCount = bedCount; -freeHash(&uniqHash); return usageList; } int bbiCalcResScalesAndSizes(int aveSize, int resScales[bbiMaxZoomLevels], int resSizes[bbiMaxZoomLevels]) /* Fill in resScales with amount to zoom at each level, and zero out resSizes based * on average span. Returns the number of zoom levels we actually will use. */ { int resTryCount = bbiMaxZoomLevels, resTry; int resIncrement = bbiResIncrement; int minZoom = 10; int res = aveSize; if (res < minZoom) res = minZoom; for (resTry = 0; resTry < resTryCount; ++resTry)