4a73fd653634412cc9ad0fc17aab320c3fb30be2 max Fri Aug 24 17:42:31 2018 -0700 making the wigToBigWig -clip option more tolerant to hg38 chroms with non-Hiram names, no redmine diff --git src/lib/bwgCreate.c src/lib/bwgCreate.c index 01fe4a7..7f22244 100644 --- src/lib/bwgCreate.c +++ src/lib/bwgCreate.c @@ -349,30 +349,51 @@ /* Fill in section and add it to list. */ struct bwgSection *section; lmAllocVar(lm, section); section->chrom = chrom; section->start = packed[0].start; section->end = packed[sectionSize-1].start + span; section->type = bwgTypeVariableStep; section->items.variableStepPacked = packed; section->itemSpan = span; section->itemCount = sectionSize; slAddHead(pSectionList, section); } lmCleanup(&lmLocal); } +static bits32 getChromSize(struct hash *chromSizeHash, char *chrom, boolean clipDontDie) +/* return size of chrom or BIGNUM if hash is NULL. errAbort if not found, unless clipDontDie */ +{ +int chromSize = 0; +if (chromSizeHash) { + chromSize = hashIntValDefault(chromSizeHash, chrom, -1); + if (chromSize==-1) { + warn("chromosome %s is not in chrom sizes file", chrom); + + if (clipDontDie) + chromSize = BIGNUM; + else + noWarnAbort(); + } +} +else + chromSize = BIGNUM; + +return chromSize; +} + static unsigned parseUnsignedVal(struct lineFile *lf, char *var, char *val) /* Return val as an integer, printing error message if it's not. */ { char c = val[0]; if (!isdigit(c)) errAbort("Expecting numerical value for %s, got %s, line %d of %s", var, val, lf->lineIx, lf->fileName); return sqlUnsigned(val); } static void parseSteppedSection(struct lineFile *lf, boolean clipDontDie, struct hash *chromSizeHash, char *initialLine, struct lm *lm, int itemsPerSlot, struct bwgSection **pSectionList) /* Parse out a variableStep or fixedStep section and add it to list, breaking it up as need be. */ { @@ -408,31 +429,33 @@ span = parseUnsignedVal(lf, var, val); else if (sameString(var, "step")) step = parseUnsignedVal(lf, var, val); else if (sameString(var, "start")) { start = parseUnsignedVal(lf, var, val); } else errAbort("Unknown setting %s=%s line %d of %s", var, val, lf->lineIx, lf->fileName); } /* Check that we have all that are required and no more, and call type-specific routine to parse * rest of section. */ if (chrom == NULL) errAbort("Missing chrom= setting line %d of %s\n", lf->lineIx, lf->fileName); -bits32 chromSize = (chromSizeHash ? hashIntVal(chromSizeHash, chrom) : BIGNUM); + +bits32 chromSize = getChromSize(chromSizeHash, chrom, clipDontDie); + if (start > chromSize) { warn("line %d of %s: chromosome %s has %u bases, but item starts at %u", lf->lineIx, lf->fileName, chrom, chromSize, start); if (!clipDontDie) noWarnAbort(); } if (type == bwgTypeFixedStep) { if (start == 0) errAbort("Missing start= setting line %d of %s\n", lf->lineIx, lf->fileName); if (step == 0) errAbort("Missing step= setting line %d of %s\n", lf->lineIx, lf->fileName); if (span == 0) span = step; @@ -490,31 +513,31 @@ break; } /* Parse out our line and make sure it has exactly 4 columns. */ char *words[5]; int wordCount = chopLine(line, words); lineFileExpectWords(lf, 4, wordCount); /* Get chromosome. */ char *chromName = words[0]; chrom = hashFindVal(chromHash, chromName); if (chrom == NULL) { lmAllocVar(chromHash->lm, chrom); hashAddSaveName(chromHash, chromName, chrom, &chrom->name); - chrom->size = (chromSizeHash ? hashIntVal(chromSizeHash, chromName) : BIGNUM); + chrom->size = getChromSize(chromSizeHash, chromName, clipDontDie); slAddHead(&chromList, chrom); } /* Convert to item and add to chromosome list. */ lmAllocVar(lm, item); item->start = lineFileNeedNum(lf, words, 1); item->end = lineFileNeedNum(lf, words, 2); item->val = lineFileNeedDouble(lf, words, 3); /* Do sanity checking on coordinates. */ if (item->start > item->end) errAbort("bedGraph error: start (%u) after end line (%u) %d of %s.", item->start, item->end, lf->lineIx, lf->fileName); if (item->end > chrom->size) { @@ -573,31 +596,31 @@ section->end = endItem->end; section->type = bwgTypeBedGraph; section->items.bedGraphList = startItem; section->itemCount = sectionSize; slAddHead(pSectionList, section); } } /* Free up hash, no longer needed. Free's chromList as a side effect since chromList is in * hash's memory. */ hashFree(&chromHash); chromList = NULL; } void bwgMakeChromInfo(struct bwgSection *sectionList, struct hash *chromSizeHash, - int *retChromCount, struct bbiChromInfo **retChromArray, + boolean clipDontDie, int *retChromCount, struct bbiChromInfo **retChromArray, int *retMaxChromNameSize) /* Fill in chromId field in sectionList. Return array of chromosome name/ids. * The chromSizeHash is keyed by name, and has int values. */ { /* Build up list of unique chromosome names. */ struct bwgSection *section; char *chromName = ""; int chromCount = 0; int maxChromNameSize = 0; struct slRef *uniq, *uniqList = NULL; for (section = sectionList; section != NULL; section = section->next) { if (!sameString(section->chrom, chromName)) { chromName = section->chrom; @@ -607,79 +630,79 @@ if (len > maxChromNameSize) maxChromNameSize = len; } section->chromId = chromCount-1; } slReverse(&uniqList); /* Allocate and fill in results array. */ struct bbiChromInfo *chromArray; AllocArray(chromArray, chromCount); int i; for (i = 0, uniq = uniqList; i < chromCount; ++i, uniq = uniq->next) { chromArray[i].name = uniq->val; chromArray[i].id = i; - chromArray[i].size = hashIntVal(chromSizeHash, uniq->val); + chromArray[i].size = getChromSize(chromSizeHash, uniq->val, clipDontDie); } /* Clean up, set return values and go home. */ slFreeList(&uniqList); *retChromCount = chromCount; *retChromArray = chromArray; *retMaxChromNameSize = maxChromNameSize; } static int bwgStrcmp (const void * A, const void * B) { char * stringA = *((char **) A); char * stringB = *((char **) B); return strcmp(stringA, stringB); } void bwgMakeAllChromInfo(struct bwgSection *sectionList, struct hash *chromSizeHash, - int *retChromCount, struct bbiChromInfo **retChromArray, + boolean clipDontDie, int *retChromCount, struct bbiChromInfo **retChromArray, int *retMaxChromNameSize) /* Fill in chromId field in sectionList. Return array of chromosome name/ids. * The chromSizeHash is keyed by name, and has int values. */ { /* Build up list of unique chromosome names. */ int maxChromNameSize = 0; /* Get list of values */ int chromCount = chromSizeHash->elCount; char ** chromName, ** chromNames; AllocArray(chromNames, chromCount); chromName = chromNames; struct hashEl* el; struct hashCookie cookie = hashFirst(chromSizeHash); for (el = hashNext(&cookie); el; el = hashNext(&cookie)) { *chromName = el->name; if (strlen(el->name) > maxChromNameSize) maxChromNameSize = strlen(el->name); chromName++; } qsort(chromNames, chromCount, sizeof(char *), bwgStrcmp); /* Allocate and fill in results array. */ struct bbiChromInfo *chromArray; AllocArray(chromArray, chromCount); int i; for (i = 0; i < chromCount; ++i) { chromArray[i].name = chromNames[i]; chromArray[i].id = i; - chromArray[i].size = hashIntVal(chromSizeHash, chromNames[i]); + chromArray[i].size = getChromSize(chromSizeHash, chromNames[i], clipDontDie); } // Assign IDs to sections: struct bwgSection *section; char *name = ""; bits32 chromId = 0; for (section = sectionList; section != NULL; section = section->next) { if (!sameString(section->chrom, name)) { for (i = 0; i < chromCount; ++i) { if (sameString(section->chrom, chromArray[i].name)) { section->chromId = i; @@ -944,61 +967,61 @@ bits64 reduction = reductionAmounts[0] = presetReductions[0]; reduceSummaries[0] = bwgReduceSectionList(sectionList, chromInfoArray, presetReductions[0]); for (i=1; i . */ int blockSize, /* Number of items to bundle in r-tree. 1024 is good. */ int itemsPerSlot, /* Number of items in lowest level of tree. 512 is good. */ boolean clipDontDie, /* If TRUE then clip items off end of chrom rather than dying. */ boolean compress, /* If TRUE then compress data. */ char *outName) /* Convert ascii format wig file (in fixedStep, variableStep or bedGraph format) * to binary big wig format. */ { bigWigFileCreateEx( inName, chromSizes, blockSize, itemsPerSlot, clipDontDie,