916f0b7ad3bfd70ae91ecdb4c30ccfc67094a015
braney
  Sun May 24 14:06:04 2015 -0700
Make sure that chroms are sorted in a case-sensitive (ASCII-order) manner in
bedToBigBed, because otherwise the indexes don't work. #15400

diff --git src/lib/bbiWrite.c src/lib/bbiWrite.c
index e36f591..2f27699 100644
--- src/lib/bbiWrite.c
+++ src/lib/bbiWrite.c
@@ -163,65 +163,64 @@
     {
     int rowIx = eim->indexFields[i];
     int size = strlen(row[rowIx]);
     if (size > eim->maxFieldSize[i])
         eim->maxFieldSize[i] = size;
     }
 }
 
 struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, struct hash *chromSizesHash, 
 	struct bbExIndexMaker *eim, int *retMinDiff, double *retAveSize, bits64 *retBedCount)
 /* Go through bed file and collect chromosomes and statistics.  If eim parameter is non-NULL
  * collect max field sizes there too. */
 {
 int maxRowSize = (eim == NULL ? 3 : bbExIndexMakerMaxIndexField(eim) + 1);
 char *row[maxRowSize];
-struct hash *uniqHash = hashNew(0);
 struct bbiChromUsage *usage = NULL, *usageList = NULL;
 int lastStart = -1;
 bits32 id = 0;
 bits64 totalBases = 0, bedCount = 0;
 int minDiff = BIGNUM;
 
 lineFileRemoveInitialCustomTrackLines(lf);
 
 for (;;)
     {
     int rowSize = lineFileChopNext(lf, row, maxRowSize);
     if (rowSize == 0)
         break;
     lineFileExpectAtLeast(lf, maxRowSize, rowSize);
     char *chrom = row[0];
     int start = lineFileNeedNum(lf, row, 1);
     int end = lineFileNeedNum(lf, row, 2);
     if (eim != NULL)
 	bbExIndexMakerUpdateMaxFieldSize(eim, row);
     if (start > end)
         {
 	    errAbort("end (%d) before start (%d) line %d of %s",
 	    	end, start, lf->lineIx, lf->fileName);
 	}
     ++bedCount;
     totalBases += (end - start);
     if (usage == NULL || differentString(usage->name, chrom))
         {
-	if (hashLookup(uniqHash, chrom))
+	/* make sure chrom names are sorted in ASCII order */
+	if ((usage != NULL) && strcmp(usage->name, chrom) > 0)
 	    {
-	    errAbort("%s is not sorted at line %d.  Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.",
+	    errAbort("%s is not case-sensitive sorted at line %d.  Please use \"sort -k1,1 -k2,2n\" with LC_COLLATE=C,  or bedSort and try again.",
 	    	lf->fileName, lf->lineIx);
 	    }
-	hashAdd(uniqHash, chrom, NULL);
 	struct hashEl *chromHashEl = hashLookup(chromSizesHash, chrom);
 	if (chromHashEl == NULL)
 	    errAbort("%s is not found in chromosome sizes file", chrom);
 	int chromSize = ptToInt(chromHashEl->val);
 	AllocVar(usage);
 	usage->name = cloneString(chrom);
 	usage->id = id++;
 	usage->size = chromSize;
 	slAddHead(&usageList, usage);
 	lastStart = -1;
 	}
     if (end > usage->size)
         errAbort("End coordinate %d bigger than %s size of %d line %d of %s", end, usage->name, usage->size, lf->lineIx, lf->fileName);
     usage->itemCount += 1;
     if (lastStart >= 0)
@@ -232,31 +231,30 @@
 	    if (diff < 0)
 		errAbort("%s is not sorted at line %d.  Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.",
 		    lf->fileName, lf->lineIx);
 	    minDiff = diff;
 	    }
 	}
     lastStart = start;
     }
 slReverse(&usageList);
 double aveSize = 0;
 if (bedCount > 0)
     aveSize = (double)totalBases/bedCount;
 *retMinDiff = minDiff;
 *retAveSize = aveSize;
 *retBedCount = bedCount;
-freeHash(&uniqHash);
 return usageList;
 }
 
 int bbiCalcResScalesAndSizes(int aveSize, 
     int resScales[bbiMaxZoomLevels], int resSizes[bbiMaxZoomLevels])
 /* Fill in resScales with amount to zoom at each level, and zero out resSizes based
  * on average span. Returns the number of zoom levels we actually will use. */
 {
 int resTryCount = bbiMaxZoomLevels, resTry;
 int resIncrement = bbiResIncrement;
 int minZoom = 10;
 int res = aveSize;
 if (res < minZoom)
     res = minZoom;
 for (resTry = 0; resTry < resTryCount; ++resTry)