4a73fd653634412cc9ad0fc17aab320c3fb30be2 max Fri Aug 24 17:42:31 2018 -0700 making the wigToBigWig -clip option more tolerant to hg38 chroms with non-Hiram names, no redmine diff --git src/lib/bwgCreate.c src/lib/bwgCreate.c index 01fe4a7..7f22244 100644 --- src/lib/bwgCreate.c +++ src/lib/bwgCreate.c @@ -349,30 +349,51 @@ /* Fill in section and add it to list. */ struct bwgSection *section; lmAllocVar(lm, section); section->chrom = chrom; section->start = packed[0].start; section->end = packed[sectionSize-1].start + span; section->type = bwgTypeVariableStep; section->items.variableStepPacked = packed; section->itemSpan = span; section->itemCount = sectionSize; slAddHead(pSectionList, section); } lmCleanup(&lmLocal); } +static bits32 getChromSize(struct hash *chromSizeHash, char *chrom, boolean clipDontDie) +/* return size of chrom or BIGNUM if hash is NULL. errAbort if not found, unless clipDontDie */ +{ +int chromSize = 0; +if (chromSizeHash) { + chromSize = hashIntValDefault(chromSizeHash, chrom, -1); + if (chromSize==-1) { + warn("chromosome %s is not in chrom sizes file", chrom); + + if (clipDontDie) + chromSize = BIGNUM; + else + noWarnAbort(); + } +} +else + chromSize = BIGNUM; + +return chromSize; +} + static unsigned parseUnsignedVal(struct lineFile *lf, char *var, char *val) /* Return val as an integer, printing error message if it's not. */ { char c = val[0]; if (!isdigit(c)) errAbort("Expecting numerical value for %s, got %s, line %d of %s", var, val, lf->lineIx, lf->fileName); return sqlUnsigned(val); } static void parseSteppedSection(struct lineFile *lf, boolean clipDontDie, struct hash *chromSizeHash, char *initialLine, struct lm *lm, int itemsPerSlot, struct bwgSection **pSectionList) /* Parse out a variableStep or fixedStep section and add it to list, breaking it up as need be. */ { @@ -408,31 +429,33 @@ span = parseUnsignedVal(lf, var, val); else if (sameString(var, "step")) step = parseUnsignedVal(lf, var, val); else if (sameString(var, "start")) { start = parseUnsignedVal(lf, var, val); } else errAbort("Unknown setting %s=%s line %d of %s", var, val, lf->lineIx, lf->fileName); } /* Check that we have all that are required and no more, and call type-specific routine to parse * rest of section. */ if (chrom == NULL) errAbort("Missing chrom= setting line %d of %s\n", lf->lineIx, lf->fileName); -bits32 chromSize = (chromSizeHash ? hashIntVal(chromSizeHash, chrom) : BIGNUM); + +bits32 chromSize = getChromSize(chromSizeHash, chrom, clipDontDie); + if (start > chromSize) { warn("line %d of %s: chromosome %s has %u bases, but item starts at %u", lf->lineIx, lf->fileName, chrom, chromSize, start); if (!clipDontDie) noWarnAbort(); } if (type == bwgTypeFixedStep) { if (start == 0) errAbort("Missing start= setting line %d of %s\n", lf->lineIx, lf->fileName); if (step == 0) errAbort("Missing step= setting line %d of %s\n", lf->lineIx, lf->fileName); if (span == 0) span = step; @@ -490,31 +513,31 @@ break; } /* Parse out our line and make sure it has exactly 4 columns. */ char *words[5]; int wordCount = chopLine(line, words); lineFileExpectWords(lf, 4, wordCount); /* Get chromosome. */ char *chromName = words[0]; chrom = hashFindVal(chromHash, chromName); if (chrom == NULL) { lmAllocVar(chromHash->lm, chrom); hashAddSaveName(chromHash, chromName, chrom, &chrom->name); - chrom->size = (chromSizeHash ? hashIntVal(chromSizeHash, chromName) : BIGNUM); + chrom->size = getChromSize(chromSizeHash, chromName, clipDontDie); slAddHead(&chromList, chrom); } /* Convert to item and add to chromosome list. */ lmAllocVar(lm, item); item->start = lineFileNeedNum(lf, words, 1); item->end = lineFileNeedNum(lf, words, 2); item->val = lineFileNeedDouble(lf, words, 3); /* Do sanity checking on coordinates. */ if (item->start > item->end) errAbort("bedGraph error: start (%u) after end line (%u) %d of %s.", item->start, item->end, lf->lineIx, lf->fileName); if (item->end > chrom->size) { @@ -573,31 +596,31 @@ section->end = endItem->end; section->type = bwgTypeBedGraph; section->items.bedGraphList = startItem; section->itemCount = sectionSize; slAddHead(pSectionList, section); } } /* Free up hash, no longer needed. Free's chromList as a side effect since chromList is in * hash's memory. */ hashFree(&chromHash); chromList = NULL; } void bwgMakeChromInfo(struct bwgSection *sectionList, struct hash *chromSizeHash, - int *retChromCount, struct bbiChromInfo **retChromArray, + boolean clipDontDie, int *retChromCount, struct bbiChromInfo **retChromArray, int *retMaxChromNameSize) /* Fill in chromId field in sectionList. Return array of chromosome name/ids. * The chromSizeHash is keyed by name, and has int values. */ { /* Build up list of unique chromosome names. */ struct bwgSection *section; char *chromName = ""; int chromCount = 0; int maxChromNameSize = 0; struct slRef *uniq, *uniqList = NULL; for (section = sectionList; section != NULL; section = section->next) { if (!sameString(section->chrom, chromName)) { chromName = section->chrom; @@ -607,79 +630,79 @@ if (len > maxChromNameSize) maxChromNameSize = len; } section->chromId = chromCount-1; } slReverse(&uniqList); /* Allocate and fill in results array. */ struct bbiChromInfo *chromArray; AllocArray(chromArray, chromCount); int i; for (i = 0, uniq = uniqList; i < chromCount; ++i, uniq = uniq->next) { chromArray[i].name = uniq->val; chromArray[i].id = i; - chromArray[i].size = hashIntVal(chromSizeHash, uniq->val); + chromArray[i].size = getChromSize(chromSizeHash, uniq->val, clipDontDie); } /* Clean up, set return values and go home. */ slFreeList(&uniqList); *retChromCount = chromCount; *retChromArray = chromArray; *retMaxChromNameSize = maxChromNameSize; } static int bwgStrcmp (const void * A, const void * B) { char * stringA = *((char **) A); char * stringB = *((char **) B); return strcmp(stringA, stringB); } void bwgMakeAllChromInfo(struct bwgSection *sectionList, struct hash *chromSizeHash, - int *retChromCount, struct bbiChromInfo **retChromArray, + boolean clipDontDie, int *retChromCount, struct bbiChromInfo **retChromArray, int *retMaxChromNameSize) /* Fill in chromId field in sectionList. Return array of chromosome name/ids. * The chromSizeHash is keyed by name, and has int values. */ { /* Build up list of unique chromosome names. */ int maxChromNameSize = 0; /* Get list of values */ int chromCount = chromSizeHash->elCount; char ** chromName, ** chromNames; AllocArray(chromNames, chromCount); chromName = chromNames; struct hashEl* el; struct hashCookie cookie = hashFirst(chromSizeHash); for (el = hashNext(&cookie); el; el = hashNext(&cookie)) { *chromName = el->name; if (strlen(el->name) > maxChromNameSize) maxChromNameSize = strlen(el->name); chromName++; } qsort(chromNames, chromCount, sizeof(char *), bwgStrcmp); /* Allocate and fill in results array. */ struct bbiChromInfo *chromArray; AllocArray(chromArray, chromCount); int i; for (i = 0; i < chromCount; ++i) { chromArray[i].name = chromNames[i]; chromArray[i].id = i; - chromArray[i].size = hashIntVal(chromSizeHash, chromNames[i]); + chromArray[i].size = getChromSize(chromSizeHash, chromNames[i], clipDontDie); } // Assign IDs to sections: struct bwgSection *section; char *name = ""; bits32 chromId = 0; for (section = sectionList; section != NULL; section = section->next) { if (!sameString(section->chrom, name)) { for (i = 0; i < chromCount; ++i) { if (sameString(section->chrom, chromArray[i].name)) { section->chromId = i; @@ -944,61 +967,61 @@ bits64 reduction = reductionAmounts[0] = presetReductions[0]; reduceSummaries[0] = bwgReduceSectionList(sectionList, chromInfoArray, presetReductions[0]); for (i=1; i<REDUCTION_COUNT; i++) { reduction = reductionAmounts[i] = presetReductions[i]; reduceSummaries[i] = bbiReduceSummaryList(reduceSummaries[i-1], chromInfoArray, reduction); } *summaryCount = REDUCTION_COUNT; } void bwgCreate(struct bwgSection *sectionList, struct hash *chromSizeHash, int blockSize, int itemsPerSlot, boolean doCompress, boolean keepAllChromosomes, - boolean fixedSummaries, char *fileName) + boolean fixedSummaries, boolean clipDontDie, char *fileName) /* Create a bigWig file out of a sorted sectionList. */ { bits64 sectionCount = slCount(sectionList); FILE *f = mustOpen(fileName, "wb"); bits32 sig = bigWigSig; bits16 version = bbiCurrentVersion; bits16 summaryCount = 0; bits16 reserved16 = 0; bits32 reserved32 = 0; bits64 reserved64 = 0; bits64 dataOffset = 0, dataOffsetPos; bits64 indexOffset = 0, indexOffsetPos; bits64 chromTreeOffset = 0, chromTreeOffsetPos; bits64 totalSummaryOffset = 0, totalSummaryOffsetPos; bits32 uncompressBufSize = 0; bits64 uncompressBufSizePos; struct bbiSummary *reduceSummaries[10]; bits32 reductionAmounts[10]; bits64 reductionDataOffsetPos[10]; bits64 reductionDataOffsets[10]; bits64 reductionIndexOffsets[10]; int i; /* Figure out chromosome ID's. */ struct bbiChromInfo *chromInfoArray; int chromCount, maxChromNameSize; if (keepAllChromosomes) - bwgMakeAllChromInfo(sectionList, chromSizeHash, &chromCount, &chromInfoArray, &maxChromNameSize); + bwgMakeAllChromInfo(sectionList, chromSizeHash, clipDontDie, &chromCount, &chromInfoArray, &maxChromNameSize); else - bwgMakeChromInfo(sectionList, chromSizeHash, &chromCount, &chromInfoArray, &maxChromNameSize); + bwgMakeChromInfo(sectionList, chromSizeHash, clipDontDie, &chromCount, &chromInfoArray, &maxChromNameSize); if (fixedSummaries) bwgComputeFixedSummaries(sectionList, reduceSummaries, &summaryCount, chromInfoArray, reductionAmounts); else bwgComputeDynamicSummaries(sectionList, reduceSummaries, &summaryCount, chromInfoArray, chromCount, reductionAmounts, doCompress); /* Write fixed header. */ writeOne(f, sig); writeOne(f, version); writeOne(f, summaryCount); chromTreeOffsetPos = ftell(f); writeOne(f, chromTreeOffset); dataOffsetPos = ftell(f); writeOne(f, dataOffset); indexOffsetPos = ftell(f); @@ -1205,31 +1228,31 @@ char *outName) /* Convert ascii format wig file (in fixedStep, variableStep or bedGraph format) * to binary big wig format. */ { /* This code needs to agree with code in two other places currently - bigBedFileCreate, * and bbiFileOpen. I'm thinking of refactoring to share at least between * bigBedFileCreate and bigWigFileCreate. It'd be great so it could be structured * so that it could send the input in one chromosome at a time, and send in the zoom * stuff only after all the chromosomes are done. This'd potentially reduce the memory * footprint by a factor of 2 or 4. Still, for now it works. -JK */ struct hash *chromSizeHash = bbiChromSizesFromFile(chromSizes); struct lm *lm = lmInit(0); struct bwgSection *sectionList = bwgParseWig(inName, clipDontDie, chromSizeHash, itemsPerSlot, lm); if (sectionList == NULL) errAbort("%s is empty of data", inName); -bwgCreate(sectionList, chromSizeHash, blockSize, itemsPerSlot, compress, keepAllChromosomes, fixedSummaries, outName); +bwgCreate(sectionList, chromSizeHash, blockSize, itemsPerSlot, compress, keepAllChromosomes, fixedSummaries, clipDontDie, outName); lmCleanup(&lm); } void bigWigFileCreate( char *inName, /* Input file in ascii wiggle format. */ char *chromSizes, /* Two column tab-separated file: <chromosome> <size>. */ int blockSize, /* Number of items to bundle in r-tree. 1024 is good. */ int itemsPerSlot, /* Number of items in lowest level of tree. 512 is good. */ boolean clipDontDie, /* If TRUE then clip items off end of chrom rather than dying. */ boolean compress, /* If TRUE then compress data. */ char *outName) /* Convert ascii format wig file (in fixedStep, variableStep or bedGraph format) * to binary big wig format. */ { bigWigFileCreateEx( inName, chromSizes, blockSize, itemsPerSlot, clipDontDie,