src/cbPyLib/cellbrowser/cellbrowser.py a3e5065af17b8da62cf1fe1b310877c60fef13a8

a3e5065af17b8da62cf1fe1b310877c60fef13a8
max
  Wed Jul 2 11:05:57 2025 -0700
fixing int/float nan value replacement, refs #26440

diff --git src/cbPyLib/cellbrowser/cellbrowser.py src/cbPyLib/cellbrowser/cellbrowser.py
index cfec862..877ce8f 100755
--- src/cbPyLib/cellbrowser/cellbrowser.py
+++ src/cbPyLib/cellbrowser/cellbrowser.py
@@ -74,33 +74,32 @@
 # the default html dir, used if the --htmlDir option is set but empty
 # this variable is initialized below (no forward declaration in Python)
 # just before cbBuild_parseArgs
 defOutDir = None
 
 CBHOMEURL = "https://cells.ucsc.edu/downloads/cellbrowserData/"
 CBHOMEURL_TEST = "https://cells-test.gi.ucsc.edu/downloads/cellbrowserData/"
 
 # a special value that is used for both x and y to indicate that the cell should not be shown
 # must match the same value in maxPlot.js
 HIDDENCOORD = 12345
 
 # special value representing NaN in floating point arrays
 # sorting is used everywhere in the code, and sorting is fast, so we replace all NaNs with -inf
 # This means that all arrays can be sorted with the normal, fast javascript functions
-# must match the same value in cellBrowser.js
+# This must match the same value in cellBrowser.js
 FLOATNAN = float('-inf')
-
 # special value representing NaN in integer arrays, again, we want this to be first after sorting
 # must match the same value in cellBrowser.js
 INTNAN = -2**16
 
 # how many md5 characters to keep in version identifiers. We load all files using their md5 to get around
 # internet browser caching
 MD5LEN = 10
 
 # list of tags that are required:
 # for cellbrowser.conf of a dataset
 reqTagsDataset =['coords', 'meta']
 # for cellbrowser.conf of a collection
 reqTagsColl =['shortLabel']
 
 coordLabels = {
@@ -918,33 +917,33 @@
     # my new files are smaller and have headers
     elif line1=="#geneId\tsymbol" or fieldCount==2:
         d = {}
         for row in lineFileNextRow(fname):
             if row.symbol=="":
                 continue
             geneId = row.geneId
             if geneId.startswith("EN") and "." in geneId:
                 geneId = geneId.split(".")[0]
             d[geneId]=row.symbol
     else:
         assert(False) # symbols file does not have a header like #geneId<tab>symbol
     logging.debug("Found symbols for %d genes" % len(d))
     return d
 
-def getDecilesList_np(values):
-    deciles = np.percentile( values, [0,10,20,30,40,50,60,70,80,90,100] )
-    return deciles
+#def getDecilesList_np(values):
+    #deciles = np.percentile( values, [0,10,20,30,40,50,60,70,80,90,100] )
+    #return deciles
 
 def bytesAndFmt(x):
     """ how many bytes do we need to store x values and what is the sprintf
     format string for it?
     Return tuple of Javascript type and python struct type.
     See :
         Javascript typed array names, https://developer.mozilla.org/en-US/docs/Web/JavaScript/Typed_arrays
         https://docs.python.org/2/library/struct.html
     """
 
     if x > 65535:
         return "Uint32", "<I"
     elif x > 255:
         return "Uint16", "<H"
     else:
@@ -958,161 +957,161 @@
 #    >>> l = [0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4,5,6,7,8,9,10]
 #    >>> getDecilesWithZeros(l)
 #     ([1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
 #    """
 #    nonZeros  = [x for x in numVals if x!=0.0]
 #
 #    zeroCount = len(numVals) - len(nonZeros)
 #    deciles   = getDecilesList_np(nonZeros)
 #
 #    decArr = np.searchsorted(deciles, numVals)
 #    decCounts(deciles, nonZeros)
 #
 #    decCounts.insert(0, zeroCount)
 #    return deciles, decCounts, newVals
 
-def findBins(numVals, breakVals, hasNan):
-    """
-    find the right bin index defined by breakVals for every value in numVals.
-    Special handling for the last value. The comparison uses "<=". The first
-    break is assumed to be the minimum of numVals and is therefore ignored.
-    Also returns an array with the count for every bin. hasNan triggers a special
-    mode where the first bin is reserved for NaNs (encoded as -inf)
-    >>> findBins([1,1,1,2,2,2,3,3,4,4,5,5,6,6], [1, 2,3,5,6])
-    ([0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3], [6, 2, 4, 2])
-    """
-    bArr = []
-    binCounts = [0]*(len(breakVals)-1)
-
-    # NaNs mean that the non-NaN bins are all +1
-    if hasNan:
-        breaks = breakVals[2:]
-    else:
-        breaks = breakVals[1:]
-
-    if hasNan:
-        for x in numVals:
-            # we use -inf for the NaN value everywhere, because sorting is undefined in lists that contain NaN
-            if math.isinf(x):
-                binIdx = 0
-            else:
-                binIdx = bisect.bisect_left(breaks, x)+1
-            bArr.append(binIdx)
-            binCounts[binIdx]+=1
-    else:
-        for x in numVals:
-            binIdx = bisect.bisect_left(breaks, x) # comparison operator is <=
-            bArr.append(binIdx)
-            binCounts[binIdx]+=1
-
-    return bArr, binCounts
-
-def countBinsBetweenBreaks(numVals, breakVals):
-    """ count how many numVals fall into the bins defined by breakVals.
-    Special handling for the last value. Comparison uses "<=". The first
-    break is assumed to be the minimum of numVals.
-    Also returns an array with the bin for every element in numVals
-    >>> countBinsBetweenBreaks([1,1,1,2,2,2,3,3,4,4,5,5,6,6], [1,2,3,5,6])
-    ([6, 2, 4, 2], [0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3])
-    """
-
-    binCounts = []
-    binCount = 0
-    i = 1
-    dArr = []
-    for x in numVals:
-        if x <= breakVals[i]:
-            binCount+=1
-        else:
-            binCounts.append(binCount)
-            binCount = 1
-            i += 1
-        dArr.append(i-1)
-
-    binCounts.append(binCount)
-
-    assert(len(dArr)==len(numVals))
-    assert(len(binCounts)==len(breakVals)-1)
-    return binCounts, dArr
-
-def discretizeArray(numVals, fieldMeta):
-    """
-    discretize numeric values based on quantiles.
-    """
-    maxBinCount = 10
-    counts = Counter(numVals).most_common()
-    counts.sort() # sort by value, not count
-
-    if len(counts) < maxBinCount:
-        # if we have just a few values, do not do any binning
-        binCounts = [y for x,y in counts]
-        values = [x for x,y in counts]
-
-        valToBin = {}
-        for i, x in enumerate(values):
-            valToBin[x] = i
-
-        dArr = [valToBin[x] for x in numVals]
+#def findBins(numVals, breakVals, hasNan):
+    #"""
+    #find the right bin index defined by breakVals for every value in numVals.
+    #Special handling for the last value. The comparison uses "<=". The first
+    #break is assumed to be the minimum of numVals and is therefore ignored.
+    #Also returns an array with the count for every bin. hasNan triggers a special
+    #mode where the first bin is reserved for NaNs (encoded as -inf)
+    #>>> findBins([1,1,1,2,2,2,3,3,4,4,5,5,6,6], [1, 2,3,5,6])
+    #([0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3], [6, 2, 4, 2])
+    #"""
+    #bArr = []
+    #binCounts = [0]*(len(breakVals)-1)
+#
+    ## NaNs mean that the non-NaN bins are all +1
+    #if hasNan:
+        #breaks = breakVals[2:]
+    #else:
+        #breaks = breakVals[1:]
+#
+    #if hasNan:
+        #for x in numVals:
+            ## we use -inf for the NaN value everywhere, because sorting is undefined in lists that contain NaN
+            #if math.isinf(x):
+                #binIdx = 0
+            #else:
+                #binIdx = bisect.bisect_left(breaks, x)+1
+            #bArr.append(binIdx)
+            #binCounts[binIdx]+=1
+    #else:
+        #for x in numVals:
+            #binIdx = bisect.bisect_left(breaks, x) # comparison operator is <=
+            #bArr.append(binIdx)
+            #binCounts[binIdx]+=1
+#
+    #return bArr, binCounts
+
+#def countBinsBetweenBreaks(numVals, breakVals):
+    #""" count how many numVals fall into the bins defined by breakVals.
+    #Special handling for the last value. Comparison uses "<=". The first
+    #break is assumed to be the minimum of numVals.
+    #Also returns an array with the bin for every element in numVals
+    #>>> countBinsBetweenBreaks([1,1,1,2,2,2,3,3,4,4,5,5,6,6], [1,2,3,5,6])
+    #([6, 2, 4, 2], [0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3])
+    #"""
+#
+    #binCounts = []
+    #binCount = 0
+    #i = 1
+    #dArr = []
+    #for x in numVals:
+        #if x <= breakVals[i]:
+            #binCount+=1
+        #else:
+            #binCounts.append(binCount)
+            #binCount = 1
+            #i += 1
+        #dArr.append(i-1)
+#
+    #binCounts.append(binCount)
+#
+    #assert(len(dArr)==len(numVals))
+    #assert(len(binCounts)==len(breakVals)-1)
+    #return binCounts, dArr
 
-        fieldMeta["binMethod"] = "raw"
-        fieldMeta["values"] = values
-        fieldMeta["binCounts"] = binCounts
-        return dArr, fieldMeta
+#def discretizeArray(numVals, fieldMeta):
+    #"""
+    #discretize numeric values based on quantiles.
+    #"""
+    #maxBinCount = 10
+    #counts = Counter(numVals).most_common()
+    #counts.sort() # sort by value, not count
+#
+    #if len(counts) < maxBinCount:
+        ## if we have just a few values, do not do any binning
+        #binCounts = [y for x,y in counts]
+        #values = [x for x,y in counts]
+#
+        #valToBin = {}
+        #for i, x in enumerate(values):
+            #valToBin[x] = i
+#
+        #dArr = [valToBin[x] for x in numVals]
+#
+        #fieldMeta["binMethod"] = "raw"
+        #fieldMeta["values"] = values
+        #fieldMeta["binCounts"] = binCounts
+        #return dArr, fieldMeta
 
     # ten breaks
-    breakPercs = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
-    countLen = len(counts)
-    breakIndices = [int(round(bp*countLen)) for bp in breakPercs]
-    # as with all histograms, the last break is always a special case (0-based array)
-    breakIndices.append(countLen-1)
-    breakVals = [counts[idx][0] for idx in breakIndices]
-
+    #breakPercs = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
+    #countLen = len(counts)
+    #breakIndices = [int(round(bp*countLen)) for bp in breakPercs]
+    ## as with all histograms, the last break is always a special case (0-based array)
+    #breakIndices.append(countLen-1)
+    #breakVals = [counts[idx][0] for idx in breakIndices]
+#
     # NaNs are encoded as -inf so they always are the first break
     # The first non-NaN value is at index 1
     # If we have NaNs, we need one more bin, with the first non-Nan value
-    hasNan = False
-    if math.isinf(breakVals[0]):
-        hasNan = True
-        breakVals.insert(1, counts[1][0])
-        logging.info("Field has NaN/Unknown values")
-    logging.debug("Breaks are: %s" % breakVals)
+    #hasNan = False
+    #if math.isinf(breakVals[0]):
+        #hasNan = True
+        #breakVals.insert(1, counts[1][0])
+        #logging.info("Field has NaN/Unknown values")
+    #logging.debug("Breaks are: %s" % breakVals)
 
-    dArr, binCounts = findBins(numVals, breakVals, hasNan)
-    logging.info("Number of values per decile-bin: %s" % binCounts)
+    #dArr, binCounts = findBins(numVals, breakVals, hasNan)
+    #logging.info("Number of values per decile-bin: %s" % binCounts)
 
     # we should have 11 breaks/10 bins, or 12 breaks/11bins if we have NaN elements
-    assert((not hasNan and len(breakVals)==11) or (hasNan and len(breakVals)==12))
-    assert((not hasNan and len(binCounts)==10) or (hasNan and len(binCounts)==11))
-    assert((len(binCounts)+1 == len(breakVals)))
-
-    fieldMeta["binMethod"] = "quantiles"
-    fieldMeta["binCounts"] = binCounts
-    if math.isinf(breakVals[0]): # -infinity is not valid in JSON
-        breakVals[0] = "Unknown"
-    fieldMeta["breaks"] = breakVals
-
-    return dArr, fieldMeta
-
-def discretizeNumField(numVals, fieldMeta, numType):
-    " given a list of numbers, add attributes to fieldMeta that describe the binning scheme "
-    digArr, fieldMeta = discretizeArray(numVals, fieldMeta)
+    #assert((not hasNan and len(breakVals)==11) or (hasNan and len(breakVals)==12))
+    #assert((not hasNan and len(binCounts)==10) or (hasNan and len(binCounts)==11))
+    #assert((len(binCounts)+1 == len(breakVals)))
+#
+    #fieldMeta["binMethod"] = "quantiles"
+    #fieldMeta["binCounts"] = binCounts
+    #if math.isinf(breakVals[0]): # -infinity is not valid in JSON
+        #breakVals[0] = "Unknown"
+    #fieldMeta["breaks"] = breakVals
 
-    #deciles, binCounts, newVals = getDecilesWithZeros(numVals)
+    #return dArr, fieldMeta
 
-    fieldMeta["arrType"] = "uint8"
-    fieldMeta["_fmt"] = "<B"
-    return digArr, fieldMeta
+#def discretizeNumField(numVals, fieldMeta, numType):
+    #" given a list of numbers, add attributes to fieldMeta that describe the binning scheme "
+    #digArr, fieldMeta = discretizeArray(numVals, fieldMeta)
+#
+    ##deciles, binCounts, newVals = getDecilesWithZeros(numVals)
+#
+    #fieldMeta["arrType"] = "uint8"
+    #fieldMeta["_fmt"] = "<B"
+    #return digArr, fieldMeta
 
 def typeForStrings(strings):
     """ given a list of strings, determine if they're all ints or floats or strings
     """
     floatCount = 0
     intCount = 0
     for val in strings:
         try:
             newVal = int(val)
             intCount += 1
         except:
             try:
                 newVal = float(val)
                 floatCount += 1
             except:
@@ -1804,54 +1803,54 @@
 
 def findBin(ranges, val):
     """ given an array of values, find the index i where ranges[i] < val <= ranges[i+1]
     ranges have to be sorted.
     This is a dumb brute force implementation - maybe binary search is faster, if ever revisit this again
     Someone said up to 10 binary search is not faster.
     """
     if val==0: # speedup
         return 0
     for i in range(0, len(ranges)):
         if (val < ranges[i]):
             return i
     # if doesn't fit in anywhere, return beyond last possible index
     return i+1
 
-def discretizeArr_uniform(arr, fieldMeta):
-    """ given an array of numbers, get min/max, create 10 bins between min and max then
-    translate the array to bins and return the list of bins
-    """
-    arrMin = min(arr)
-    arrMax = max(arr)
-    stepSize = (arrMax-arrMin)/10.0
-
-    dArr = [0]*len(arr)
-    binCounts = [0]*10
-    for i, x in enumerate(arr):
-        binIdx = int(round((x - arrMin)/stepSize))
-        if x == arrMax:
-            binIdx = 9
-        assert(binIdx <= 9)
-        dArr[i] = binIdx
-        binCounts[binIdx]+=1
-
-    fieldMeta["binMethod"] = "uniform"
-    fieldMeta["minVal"] = arrMin
-    fieldMeta["maxVal"] = arrMax
-    fieldMeta["stepSize"] = stepSize
-    fieldMeta["binCounts"] = binCounts
-    return dArr, fieldMeta
+#def discretizeArr_uniform(arr, fieldMeta):
+    #""" given an array of numbers, get min/max, create 10 bins between min and max then
+    #translate the array to bins and return the list of bins
+    #"""
+    #arrMin = min(arr)
+    #arrMax = max(arr)
+    #stepSize = (arrMax-arrMin)/10.0
+#
+    #dArr = [0]*len(arr)
+    #binCounts = [0]*10
+    #for i, x in enumerate(arr):
+        #binIdx = int(round((x - arrMin)/stepSize))
+        #if x == arrMax:
+            #binIdx = 9
+        #assert(binIdx <= 9)
+        #dArr[i] = binIdx
+        #binCounts[binIdx]+=1
+#
+    #fieldMeta["binMethod"] = "uniform"
+    #fieldMeta["minVal"] = arrMin
+    #fieldMeta["maxVal"] = arrMax
+    #fieldMeta["stepSize"] = stepSize
+    #fieldMeta["binCounts"] = binCounts
+    #return dArr, fieldMeta
 
 def digitize_py(arr, matType):
     """ calculate deciles ignoring 0s from arr, use these deciles to digitize the whole arr,
     return (digArr, zeroCount, bins).
     bins is an array of (min, max, count)
     There are at most 11 bins and bin0 is just for the value zero.
     For bin0, min and max are both 0.0
 
     matType can be "int" or "float".
     If it is 'int' and arr has only <= 11 values, will not calculate deciles, but rather just
     count the numbers and use them to create bins, one per number.
     #>>> digitize_py([1,1,1,1,1,2,3,4,5,6,4,5,5,5,5], "float")
     """
     if matType=="int":
         valCount = len(set(arr))
@@ -2051,57 +2050,61 @@
     else:
         logging.debug("%s is not an MTX file" % path)
         return False
 
 def exprEncode(geneDesc, exprArr, matType):
     """ convert an array of numbers of type matType (int or float) to a compressed string of
     float32s
     The format of a record is:
     - 2 bytes: length of descStr, e.g. gene identifier or else
     - len(descStr) bytes: the descriptive string descStr
     - array of n 4-byte floats (n = number of cells) or 4-byte unsigned ints
     """
     geneDesc = str(geneDesc) # make sure no unicode
     geneIdLen = struct.pack("<H", len(geneDesc))
 
+    if matType=="float":
+        nanValue = FLOATNAN
+    else:
+        nanValue = INTNAN
+
     # on cortex-dev, numpy was around 30% faster. Not a huge difference.
     if numpyLoaded:
         if matType=="float":
             exprArr = exprArr.astype("float32")
         elif matType=="int":
             exprArr = exprArr.astype("uint32")
         else:
             assert(False) # internal error
         exprStr = exprArr.tobytes()
-        #minVal = np.amin(exprArr[~np.isneginf(exprArr)])
-        exprArr[np.isnan(exprArr)] = FLOATNAN
+        exprArr[np.isnan(exprArr)] = nanValue
         minVal = np.amin(exprArr)
-        # cortex-dev-splicing/psi has NAN values in the mtx file
+        # e.g. cortex-dev-splicing/psi has NAN values in the mtx file
     else:
         if matType=="float":
             arrType = "f"
         elif matType=="int" or matType=="forceInt":
             arrType = "L"
         else:
             assert(False) # internal error
 
         # if as too-old numpy version is loaded isNumpy is false, but the type may
         # still be a numpy array if we loaded from MTX -> force to a list
         if str(type(exprArr))=="<type 'numpy.ndarray'>":
             exprArr = exprArr.tolist()[0]
 
-        exprArr = [FLOATNAN if math.isnan(x) else x for x in exprArr]
+        exprArr = [nanValue if math.isnan(x) else x for x in exprArr]
         # Python 3.9 removed tostring()
         if sys.version_info >= (3, 2):
             exprStr = array.array(arrType, exprArr).tobytes()
         else:
             exprStr = array.array(arrType, exprArr).tostring()
 
         #minVal = min([x for x in exprArr if not (math.isinf(x) and x < 0)]) # this is super slow...
         minVal = min(exprArr)
 
     if isPy3:
         geneStr = geneIdLen+bytes(geneDesc, encoding="ascii")+exprStr
     else:
         geneStr = geneIdLen+geneDesc+exprStr
 
     geneCompr = zlib.compress(geneStr)