src/lib/bwgCreate.c 4a73fd653634412cc9ad0fc17aab320c3fb30be2

4a73fd653634412cc9ad0fc17aab320c3fb30be2
max
  Fri Aug 24 17:42:31 2018 -0700
making the wigToBigWig -clip option more tolerant to hg38 chroms with non-Hiram names, no redmine

diff --git src/lib/bwgCreate.c src/lib/bwgCreate.c
index 01fe4a7..7f22244 100644
--- src/lib/bwgCreate.c
+++ src/lib/bwgCreate.c
@@ -349,30 +349,51 @@
     /* Fill in section and add it to list. */
     struct bwgSection *section;
     lmAllocVar(lm, section);
     section->chrom = chrom;
     section->start = packed[0].start;
     section->end = packed[sectionSize-1].start + span;
     section->type = bwgTypeVariableStep;
     section->items.variableStepPacked = packed;
     section->itemSpan = span;
     section->itemCount = sectionSize;
     slAddHead(pSectionList, section);
     }
 lmCleanup(&lmLocal);
 }
 
+static bits32 getChromSize(struct hash *chromSizeHash, char *chrom, boolean clipDontDie)
+/* return size of chrom or BIGNUM if hash is NULL. errAbort if not found, unless clipDontDie */
+{
+int chromSize = 0;
+if (chromSizeHash) {
+    chromSize = hashIntValDefault(chromSizeHash, chrom, -1);
+    if (chromSize==-1) {
+        warn("chromosome %s is not in chrom sizes file", chrom);
+
+        if (clipDontDie)
+            chromSize = BIGNUM;
+        else
+            noWarnAbort();
+    }
+}
+else
+    chromSize = BIGNUM;
+
+return chromSize;
+}
+
 static unsigned parseUnsignedVal(struct lineFile *lf, char *var, char *val)
 /* Return val as an integer, printing error message if it's not. */
 {
 char c = val[0];
 if (!isdigit(c))
     errAbort("Expecting numerical value for %s, got %s, line %d of %s", 
     	var, val, lf->lineIx, lf->fileName);
 return sqlUnsigned(val);
 }
 
 static void parseSteppedSection(struct lineFile *lf, boolean clipDontDie, 
 	struct hash *chromSizeHash, char *initialLine, 
 	struct lm *lm, int itemsPerSlot, struct bwgSection **pSectionList)
 /* Parse out a variableStep or fixedStep section and add it to list, breaking it up as need be. */
 {
@@ -408,31 +429,33 @@
 	span = parseUnsignedVal(lf, var, val);
     else if (sameString(var, "step"))
 	step = parseUnsignedVal(lf, var, val);
     else if (sameString(var, "start"))
 	{
         start = parseUnsignedVal(lf, var, val);
 	}
     else
 	errAbort("Unknown setting %s=%s line %d of %s", var, val, lf->lineIx, lf->fileName);
     }
 
 /* Check that we have all that are required and no more, and call type-specific routine to parse
  * rest of section. */
 if (chrom == NULL)
     errAbort("Missing chrom= setting line %d of %s\n", lf->lineIx, lf->fileName);
-bits32 chromSize = (chromSizeHash ? hashIntVal(chromSizeHash, chrom) : BIGNUM);
+
+bits32 chromSize = getChromSize(chromSizeHash, chrom, clipDontDie);
+
 if (start > chromSize)
     {
     warn("line %d of %s: chromosome %s has %u bases, but item starts at %u",
     	lf->lineIx, lf->fileName, chrom, chromSize, start);
     if (!clipDontDie)
         noWarnAbort();
     }
 if (type == bwgTypeFixedStep)
     {
     if (start == 0)
 	errAbort("Missing start= setting line %d of %s\n", lf->lineIx, lf->fileName);
     if (step == 0)
 	errAbort("Missing step= setting line %d of %s\n", lf->lineIx, lf->fileName);
     if (span == 0)
 	span = step;
@@ -490,31 +513,31 @@
 	break;
 	}
 
     /* Parse out our line and make sure it has exactly 4 columns. */
     char *words[5];
     int wordCount = chopLine(line, words);
     lineFileExpectWords(lf, 4, wordCount);
 
     /* Get chromosome. */
     char *chromName = words[0];
     chrom = hashFindVal(chromHash, chromName);
     if (chrom == NULL)
         {
 	lmAllocVar(chromHash->lm, chrom);
 	hashAddSaveName(chromHash, chromName, chrom, &chrom->name);
-	chrom->size = (chromSizeHash ? hashIntVal(chromSizeHash, chromName) : BIGNUM);
+	chrom->size = getChromSize(chromSizeHash, chromName, clipDontDie);
 	slAddHead(&chromList, chrom);
 	}
 
     /* Convert to item and add to chromosome list. */
     lmAllocVar(lm, item);
     item->start = lineFileNeedNum(lf, words, 1);
     item->end = lineFileNeedNum(lf, words, 2);
     item->val = lineFileNeedDouble(lf, words, 3);
 
     /* Do sanity checking on coordinates. */
     if (item->start > item->end)
         errAbort("bedGraph error: start (%u) after end line (%u) %d of %s.", 
 		item->start, item->end, lf->lineIx, lf->fileName);
     if (item->end > chrom->size)
 	{
@@ -573,31 +596,31 @@
 	section->end = endItem->end;
 	section->type = bwgTypeBedGraph;
 	section->items.bedGraphList = startItem;
 	section->itemCount = sectionSize;
 	slAddHead(pSectionList, section);
 	}
     }
 
 /* Free up hash, no longer needed. Free's chromList as a side effect since chromList is in 
  * hash's memory. */
 hashFree(&chromHash);
 chromList = NULL;
 }
 
 void bwgMakeChromInfo(struct bwgSection *sectionList, struct hash *chromSizeHash,
-	int *retChromCount, struct bbiChromInfo **retChromArray,
+	boolean clipDontDie, int *retChromCount, struct bbiChromInfo **retChromArray,
 	int *retMaxChromNameSize)
 /* Fill in chromId field in sectionList.  Return array of chromosome name/ids. 
  * The chromSizeHash is keyed by name, and has int values. */
 {
 /* Build up list of unique chromosome names. */
 struct bwgSection *section;
 char *chromName = "";
 int chromCount = 0;
 int maxChromNameSize = 0;
 struct slRef *uniq, *uniqList = NULL;
 for (section = sectionList; section != NULL; section = section->next)
     {
     if (!sameString(section->chrom, chromName))
         {
 	chromName = section->chrom;
@@ -607,79 +630,79 @@
 	if (len > maxChromNameSize)
 	    maxChromNameSize = len;
 	}
     section->chromId = chromCount-1;
     }
 slReverse(&uniqList);
 
 /* Allocate and fill in results array. */
 struct bbiChromInfo *chromArray;
 AllocArray(chromArray, chromCount);
 int i;
 for (i = 0, uniq = uniqList; i < chromCount; ++i, uniq = uniq->next)
     {
     chromArray[i].name = uniq->val;
     chromArray[i].id = i;
-    chromArray[i].size = hashIntVal(chromSizeHash, uniq->val);
+    chromArray[i].size = getChromSize(chromSizeHash, uniq->val, clipDontDie);
     }
 
 /* Clean up, set return values and go home. */
 slFreeList(&uniqList);
 *retChromCount = chromCount;
 *retChromArray = chromArray;
 *retMaxChromNameSize = maxChromNameSize;
 }
 
 static int bwgStrcmp (const void * A, const void * B) {
 	char * stringA = *((char **) A);
 	char * stringB = *((char **) B);
 	return strcmp(stringA, stringB);
 }
 
 void bwgMakeAllChromInfo(struct bwgSection *sectionList, struct hash *chromSizeHash,
-	int *retChromCount, struct bbiChromInfo **retChromArray,
+	boolean clipDontDie, int *retChromCount, struct bbiChromInfo **retChromArray,
 	int *retMaxChromNameSize)
 /* Fill in chromId field in sectionList.  Return array of chromosome name/ids. 
  * The chromSizeHash is keyed by name, and has int values. */
 {
 /* Build up list of unique chromosome names. */
 int maxChromNameSize = 0;
 
 /* Get list of values */
 int chromCount = chromSizeHash->elCount;
 char ** chromName, ** chromNames;
 AllocArray(chromNames, chromCount);
 chromName = chromNames;
 struct hashEl* el;
 struct hashCookie cookie = hashFirst(chromSizeHash);
 for (el = hashNext(&cookie); el; el = hashNext(&cookie)) {
 	*chromName = el->name;
 	if (strlen(el->name) > maxChromNameSize)
 		maxChromNameSize = strlen(el->name);
 	chromName++;
 }
 qsort(chromNames, chromCount, sizeof(char *), bwgStrcmp);
 
 /* Allocate and fill in results array. */
 struct bbiChromInfo *chromArray;
 AllocArray(chromArray, chromCount);
 int i;
 for (i = 0; i < chromCount; ++i)
     {
     chromArray[i].name = chromNames[i];
     chromArray[i].id = i;
-    chromArray[i].size = hashIntVal(chromSizeHash, chromNames[i]);
+    chromArray[i].size = getChromSize(chromSizeHash, chromNames[i], clipDontDie);
     }
 
 // Assign IDs to sections:
 struct bwgSection *section;
 char *name = "";
 bits32 chromId = 0;
 for (section = sectionList; section != NULL; section = section->next)
     {
     if (!sameString(section->chrom, name))
         {
         for (i = 0; i < chromCount; ++i)
             {
 	    if (sameString(section->chrom, chromArray[i].name)) 
 	        {
 		    section->chromId = i;
@@ -944,61 +967,61 @@
 bits64 reduction = reductionAmounts[0] = presetReductions[0];
 reduceSummaries[0] = bwgReduceSectionList(sectionList, chromInfoArray, presetReductions[0]);
 
 for (i=1; i<REDUCTION_COUNT; i++)
     {
     reduction = reductionAmounts[i] = presetReductions[i];
     reduceSummaries[i] = bbiReduceSummaryList(reduceSummaries[i-1], chromInfoArray, 
     	reduction);
     }
 
 *summaryCount = REDUCTION_COUNT;
 }
 
 void bwgCreate(struct bwgSection *sectionList, struct hash *chromSizeHash, 
 	int blockSize, int itemsPerSlot, boolean doCompress, boolean keepAllChromosomes,
-        boolean fixedSummaries, char *fileName)
+        boolean fixedSummaries, boolean clipDontDie, char *fileName)
 /* Create a bigWig file out of a sorted sectionList. */
 {
 bits64 sectionCount = slCount(sectionList);
 FILE *f = mustOpen(fileName, "wb");
 bits32 sig = bigWigSig;
 bits16 version = bbiCurrentVersion;
 bits16 summaryCount = 0;
 bits16 reserved16 = 0;
 bits32 reserved32 = 0;
 bits64 reserved64 = 0;
 bits64 dataOffset = 0, dataOffsetPos;
 bits64 indexOffset = 0, indexOffsetPos;
 bits64 chromTreeOffset = 0, chromTreeOffsetPos;
 bits64 totalSummaryOffset = 0, totalSummaryOffsetPos;
 bits32 uncompressBufSize = 0;
 bits64 uncompressBufSizePos;
 struct bbiSummary *reduceSummaries[10];
 bits32 reductionAmounts[10];
 bits64 reductionDataOffsetPos[10];
 bits64 reductionDataOffsets[10];
 bits64 reductionIndexOffsets[10];
 int i;
 
 /* Figure out chromosome ID's. */
 struct bbiChromInfo *chromInfoArray;
 int chromCount, maxChromNameSize;
 if (keepAllChromosomes)
-    bwgMakeAllChromInfo(sectionList, chromSizeHash, &chromCount, &chromInfoArray, &maxChromNameSize);
+    bwgMakeAllChromInfo(sectionList, chromSizeHash, clipDontDie, &chromCount, &chromInfoArray, &maxChromNameSize);
 else
-    bwgMakeChromInfo(sectionList, chromSizeHash, &chromCount, &chromInfoArray, &maxChromNameSize);
+    bwgMakeChromInfo(sectionList, chromSizeHash, clipDontDie, &chromCount, &chromInfoArray, &maxChromNameSize);
 
 if (fixedSummaries) 
     bwgComputeFixedSummaries(sectionList, reduceSummaries, &summaryCount, chromInfoArray, reductionAmounts);
 else 
     bwgComputeDynamicSummaries(sectionList, reduceSummaries, &summaryCount, chromInfoArray, chromCount, reductionAmounts, doCompress);
 
 /* Write fixed header. */
 writeOne(f, sig);
 writeOne(f, version);
 writeOne(f, summaryCount);
 chromTreeOffsetPos = ftell(f);
 writeOne(f, chromTreeOffset);
 dataOffsetPos = ftell(f);
 writeOne(f, dataOffset);
 indexOffsetPos = ftell(f);
@@ -1205,31 +1228,31 @@
 	char *outName)
 /* Convert ascii format wig file (in fixedStep, variableStep or bedGraph format) 
  * to binary big wig format. */
 {
 /* This code needs to agree with code in two other places currently - bigBedFileCreate,
  * and bbiFileOpen.  I'm thinking of refactoring to share at least between
  * bigBedFileCreate and bigWigFileCreate.  It'd be great so it could be structured
  * so that it could send the input in one chromosome at a time, and send in the zoom
  * stuff only after all the chromosomes are done.  This'd potentially reduce the memory
  * footprint by a factor of 2 or 4.  Still, for now it works. -JK */
 struct hash *chromSizeHash = bbiChromSizesFromFile(chromSizes);
 struct lm *lm = lmInit(0);
 struct bwgSection *sectionList = bwgParseWig(inName, clipDontDie, chromSizeHash, itemsPerSlot, lm);
 if (sectionList == NULL)
     errAbort("%s is empty of data", inName);
-bwgCreate(sectionList, chromSizeHash, blockSize, itemsPerSlot, compress, keepAllChromosomes, fixedSummaries, outName);
+bwgCreate(sectionList, chromSizeHash, blockSize, itemsPerSlot, compress, keepAllChromosomes, fixedSummaries, clipDontDie, outName);
 lmCleanup(&lm);
 }
 
 void bigWigFileCreate(
 	char *inName, 		/* Input file in ascii wiggle format. */
 	char *chromSizes, 	/* Two column tab-separated file: <chromosome> <size>. */
 	int blockSize,		/* Number of items to bundle in r-tree.  1024 is good. */
 	int itemsPerSlot,	/* Number of items in lowest level of tree.  512 is good. */
 	boolean clipDontDie,	/* If TRUE then clip items off end of chrom rather than dying. */
 	boolean compress,	/* If TRUE then compress data. */
 	char *outName)
 /* Convert ascii format wig file (in fixedStep, variableStep or bedGraph format) 
  * to binary big wig format. */
 {
 bigWigFileCreateEx( inName, chromSizes, blockSize, itemsPerSlot, clipDontDie,