dd6685988d7600e805dcf71b943d806dd10289cf braney Sat Sep 30 17:05:23 2023 -0700 relax the constraint on sorting for bedToBigBed and bedGraphToBigWig. diff --git src/lib/bbiWrite.c src/lib/bbiWrite.c index 374f920..23e5e8e 100644 --- src/lib/bbiWrite.c +++ src/lib/bbiWrite.c @@ -45,58 +45,71 @@ { const struct bbiChromInfo *a = (const struct bbiChromInfo *)va; const struct bbiChromInfo *b = (const struct bbiChromInfo *)vb; return strcmp(a->name, b->name); } void bbiWriteChromInfo(struct bbiChromUsage *usageList, int blockSize, FILE *f) /* Write out information on chromosomes to file. */ { int chromCount = slCount(usageList); struct bbiChromUsage *usage; /* Allocate and fill in array from list. */ struct bbiChromInfo *chromInfoArray = NULL; +struct bbiChromUsage **usageArray = NULL; int maxChromNameSize = 0; if (chromCount > 0) { AllocArray(chromInfoArray, chromCount); + AllocArray(usageArray, chromCount); int i; for (i=0, usage = usageList; i<chromCount; ++i, usage = usage->next) { char *chromName = usage->name; int len = strlen(chromName); if (len > maxChromNameSize) maxChromNameSize = len; chromInfoArray[i].name = chromName; chromInfoArray[i].id = usage->id; chromInfoArray[i].size = usage->size; + usageArray[i] = usage; } /* Sort so the b-Tree actually works. 
*/ qsort(chromInfoArray, chromCount, sizeof(chromInfoArray[0]), bbiChromInfoCmp); + /* Now we remap the chromId's so they reflect the order in the bTree */ + for (i=0, usage = usageList; i<chromCount; ++i, usage = usage->next) + { + if ( usageArray[chromInfoArray[i].id]->id != i) + { + usageArray[chromInfoArray[i].id]->id = i; + chromInfoArray[i].id = i; + } + } } /* Write chromosome bPlusTree */ int chromBlockSize = min(blockSize, chromCount); bptFileBulkIndexToOpenFile(chromInfoArray, sizeof(chromInfoArray[0]), chromCount, chromBlockSize, bbiChromInfoKey, maxChromNameSize, bbiChromInfoVal, sizeof(chromInfoArray[0].id) + sizeof(chromInfoArray[0].size), f); freeMem(chromInfoArray); +freeMem(usageArray); } void bbiWriteFloat(FILE *f, float val) /* Write out floating point val to file. Mostly to convert from double... */ { writeOne(f, val); } struct hash *bbiChromSizesFromFile(char *fileName) /* Read two column file into hash keyed by chrom. */ { struct hash *hash = hashNew(0); struct lineFile *lf = netLineFileOpen(fileName); char *row[2]; while (lineFileRow(lf, row)) @@ -172,64 +185,64 @@ } struct bbiChromUsage *bbiChromUsageFromBedFileInternal(struct lineFile *lf, bbiChromSizeFunc chromSizeFunc, void *chromSizeClosure, struct bbExIndexMaker *eim, int *retMinDiff, double *retAveSize, bits64 *retBedCount, boolean tabSep) /* Go through bed file and collect chromosomes and statistics. If eim parameter is non-NULL * collect max field sizes there too. */ { int maxRowSize = (eim == NULL ? 
3 : bbExIndexMakerMaxIndexField(eim) + 1); char *row[maxRowSize]; struct bbiChromUsage *usage = NULL, *usageList = NULL; int lastStart = -1; bits32 id = 0; bits64 totalBases = 0, bedCount = 0; int minDiff = BIGNUM; +struct hash *usedHash = newHash(0); lineFileRemoveInitialCustomTrackLines(lf); for (;;) { int rowSize = 0; if (tabSep) rowSize = lineFileChopCharNext(lf, '\t', row, maxRowSize); else rowSize = lineFileChopNext(lf, row, maxRowSize); if (rowSize == 0) break; lineFileExpectAtLeast(lf, maxRowSize, rowSize); char *chrom = row[0]; int start = lineFileNeedNum(lf, row, 1); int end = lineFileNeedNum(lf, row, 2); if (eim != NULL) bbExIndexMakerUpdateMaxFieldSize(eim, row); if (start > end) { errAbort("end (%d) before start (%d) line %d of %s", end, start, lf->lineIx, lf->fileName); } ++bedCount; totalBases += (end - start); if (usage == NULL || differentString(usage->name, chrom)) { - /* make sure chrom names are sorted in ASCII order */ - if ((usage != NULL) && strcmp(usage->name, chrom) > 0) + if (hashLookup(usedHash, chrom)) { - errAbort("%s is not case-sensitive sorted at line %d. Please use \"LC_ALL=C sort -k1,1 -k2,2n\" or bedSort and try again.", - lf->fileName, lf->lineIx); + errAbort("Error: All data for each sequence needs to be sorted together in file %s. Found sequence named %s not in single block on line %d. Please use \"LC_ALL=C sort -k1,1 -k2,2n\" or bedSort and try again.", lf->fileName, chrom, lf->lineIx); } + hashStore(usedHash, chrom); int chromSize = (*chromSizeFunc)(chromSizeClosure, chrom, lf->lineIx); if (chromSize == 0) errAbort("%s is not found in chromosome sizes file", chrom); AllocVar(usage); usage->name = cloneString(chrom); usage->id = id++; usage->size = chromSize; slAddHead(&usageList, usage); lastStart = -1; } if (end > usage->size) errAbort("End coordinate %d bigger than %s size of %d line %d of %s", end, usage->name, usage->size, lf->lineIx, lf->fileName); usage->itemCount += 1; if (lastStart >= 0)