a3e5065af17b8da62cf1fe1b310877c60fef13a8 max Wed Jul 2 11:05:57 2025 -0700 fixing int/float nan value replacement, refs #26440 diff --git src/cbPyLib/cellbrowser/cellbrowser.py src/cbPyLib/cellbrowser/cellbrowser.py index cfec862..877ce8f 100755 --- src/cbPyLib/cellbrowser/cellbrowser.py +++ src/cbPyLib/cellbrowser/cellbrowser.py @@ -74,33 +74,32 @@ # the default html dir, used if the --htmlDir option is set but empty # this variable is initialized below (no forward declaration in Python) # just before cbBuild_parseArgs defOutDir = None CBHOMEURL = "https://cells.ucsc.edu/downloads/cellbrowserData/" CBHOMEURL_TEST = "https://cells-test.gi.ucsc.edu/downloads/cellbrowserData/" # a special value that is used for both x and y to indicate that the cell should not be shown # must match the same value in maxPlot.js HIDDENCOORD = 12345 # special value representing NaN in floating point arrays # sorting is used everywhere in the code, and sorting is fast, so we replace all NaNs with -inf # This means that all arrays can be sorted with the normal, fast javascript functions -# must match the same value in cellBrowser.js +# This must match the same value in cellBrowser.js FLOATNAN = float('-inf') - # special value representing NaN in integer arrays, again, we want this to be first after sorting # must match the same value in cellBrowser.js INTNAN = -2**16 # how many md5 characters to keep in version identifiers. We load all files using their md5 to get around # internet browser caching MD5LEN = 10 # list of tags that are required: # for cellbrowser.conf of a dataset reqTagsDataset =['coords', 'meta'] # for cellbrowser.conf of a collection reqTagsColl =['shortLabel'] coordLabels = { @@ -918,33 +917,33 @@ # my new files are smaller and have headers elif line1=="#geneId\tsymbol" or fieldCount==2: d = {} for row in lineFileNextRow(fname): if row.symbol=="": continue geneId = row.geneId if geneId.startswith("EN") and "." in geneId: geneId = geneId.split(".")[0] d[geneId]=row.symbol else: assert(False) # symbols file does not have a header like #geneId<tab>symbol logging.debug("Found symbols for %d genes" % len(d)) return d -def getDecilesList_np(values): - deciles = np.percentile( values, [0,10,20,30,40,50,60,70,80,90,100] ) - return deciles +#def getDecilesList_np(values): + #deciles = np.percentile( values, [0,10,20,30,40,50,60,70,80,90,100] ) + #return deciles def bytesAndFmt(x): """ how many bytes do we need to store x values and what is the sprintf format string for it? Return tuple of Javascript type and python struct type. See : Javascript typed array names, https://developer.mozilla.org/en-US/docs/Web/JavaScript/Typed_arrays https://docs.python.org/2/library/struct.html """ if x > 65535: return "Uint32", "<I" elif x > 255: return "Uint16", "<H" else: @@ -958,161 +957,161 @@ # >>> l = [0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4,5,6,7,8,9,10] # >>> getDecilesWithZeros(l) # ([1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) # """ # nonZeros = [x for x in numVals if x!=0.0] # # zeroCount = len(numVals) - len(nonZeros) # deciles = getDecilesList_np(nonZeros) # # decArr = np.searchsorted(deciles, numVals) # decCounts(deciles, nonZeros) # # decCounts.insert(0, zeroCount) # return deciles, decCounts, newVals -def findBins(numVals, breakVals, hasNan): - """ - find the right bin index defined by breakVals for every value in numVals. - Special handling for the last value. The comparison uses "<=". The first - break is assumed to be the minimum of numVals and is therefore ignored. - Also returns an array with the count for every bin. hasNan triggers a special - mode where the first bin is reserved for NaNs (encoded as -inf) - >>> findBins([1,1,1,2,2,2,3,3,4,4,5,5,6,6], [1, 2,3,5,6]) - ([0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3], [6, 2, 4, 2]) - """ - bArr = [] - binCounts = [0]*(len(breakVals)-1) - - # NaNs mean that the non-NaN bins are all +1 - if hasNan: - breaks = breakVals[2:] - else: - breaks = breakVals[1:] - - if hasNan: - for x in numVals: - # we use -inf for the NaN value everywhere, because sorting is undefined in lists that contain NaN - if math.isinf(x): - binIdx = 0 - else: - binIdx = bisect.bisect_left(breaks, x)+1 - bArr.append(binIdx) - binCounts[binIdx]+=1 - else: - for x in numVals: - binIdx = bisect.bisect_left(breaks, x) # comparison operator is <= - bArr.append(binIdx) - binCounts[binIdx]+=1 - - return bArr, binCounts - -def countBinsBetweenBreaks(numVals, breakVals): - """ count how many numVals fall into the bins defined by breakVals. - Special handling for the last value. Comparison uses "<=". The first - break is assumed to be the minimum of numVals. - Also returns an array with the bin for every element in numVals - >>> countBinsBetweenBreaks([1,1,1,2,2,2,3,3,4,4,5,5,6,6], [1,2,3,5,6]) - ([6, 2, 4, 2], [0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3]) - """ - - binCounts = [] - binCount = 0 - i = 1 - dArr = [] - for x in numVals: - if x <= breakVals[i]: - binCount+=1 - else: - binCounts.append(binCount) - binCount = 1 - i += 1 - dArr.append(i-1) - - binCounts.append(binCount) - - assert(len(dArr)==len(numVals)) - assert(len(binCounts)==len(breakVals)-1) - return binCounts, dArr - -def discretizeArray(numVals, fieldMeta): - """ - discretize numeric values based on quantiles. - """ - maxBinCount = 10 - counts = Counter(numVals).most_common() - counts.sort() # sort by value, not count - - if len(counts) < maxBinCount: - # if we have just a few values, do not do any binning - binCounts = [y for x,y in counts] - values = [x for x,y in counts] - - valToBin = {} - for i, x in enumerate(values): - valToBin[x] = i - - dArr = [valToBin[x] for x in numVals] +#def findBins(numVals, breakVals, hasNan): + #""" + #find the right bin index defined by breakVals for every value in numVals. + #Special handling for the last value. The comparison uses "<=". The first + #break is assumed to be the minimum of numVals and is therefore ignored. + #Also returns an array with the count for every bin. hasNan triggers a special + #mode where the first bin is reserved for NaNs (encoded as -inf) + #>>> findBins([1,1,1,2,2,2,3,3,4,4,5,5,6,6], [1, 2,3,5,6]) + #([0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3], [6, 2, 4, 2]) + #""" + #bArr = [] + #binCounts = [0]*(len(breakVals)-1) +# + ## NaNs mean that the non-NaN bins are all +1 + #if hasNan: + #breaks = breakVals[2:] + #else: + #breaks = breakVals[1:] +# + #if hasNan: + #for x in numVals: + ## we use -inf for the NaN value everywhere, because sorting is undefined in lists that contain NaN + #if math.isinf(x): + #binIdx = 0 + #else: + #binIdx = bisect.bisect_left(breaks, x)+1 + #bArr.append(binIdx) + #binCounts[binIdx]+=1 + #else: + #for x in numVals: + #binIdx = bisect.bisect_left(breaks, x) # comparison operator is <= + #bArr.append(binIdx) + #binCounts[binIdx]+=1 +# + #return bArr, binCounts + +#def countBinsBetweenBreaks(numVals, breakVals): + #""" count how many numVals fall into the bins defined by breakVals. + #Special handling for the last value. Comparison uses "<=". The first + #break is assumed to be the minimum of numVals. + #Also returns an array with the bin for every element in numVals + #>>> countBinsBetweenBreaks([1,1,1,2,2,2,3,3,4,4,5,5,6,6], [1,2,3,5,6]) + #([6, 2, 4, 2], [0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3]) + #""" +# + #binCounts = [] + #binCount = 0 + #i = 1 + #dArr = [] + #for x in numVals: + #if x <= breakVals[i]: + #binCount+=1 + #else: + #binCounts.append(binCount) + #binCount = 1 + #i += 1 + #dArr.append(i-1) +# + #binCounts.append(binCount) +# + #assert(len(dArr)==len(numVals)) + #assert(len(binCounts)==len(breakVals)-1) + #return binCounts, dArr - fieldMeta["binMethod"] = "raw" - fieldMeta["values"] = values - fieldMeta["binCounts"] = binCounts - return dArr, fieldMeta +#def discretizeArray(numVals, fieldMeta): + #""" + #discretize numeric values based on quantiles. + #""" + #maxBinCount = 10 + #counts = Counter(numVals).most_common() + #counts.sort() # sort by value, not count +# + #if len(counts) < maxBinCount: + ## if we have just a few values, do not do any binning + #binCounts = [y for x,y in counts] + #values = [x for x,y in counts] +# + #valToBin = {} + #for i, x in enumerate(values): + #valToBin[x] = i +# + #dArr = [valToBin[x] for x in numVals] +# + #fieldMeta["binMethod"] = "raw" + #fieldMeta["values"] = values + #fieldMeta["binCounts"] = binCounts + #return dArr, fieldMeta # ten breaks - breakPercs = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] - countLen = len(counts) - breakIndices = [int(round(bp*countLen)) for bp in breakPercs] - # as with all histograms, the last break is always a special case (0-based array) - breakIndices.append(countLen-1) - breakVals = [counts[idx][0] for idx in breakIndices] - + #breakPercs = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] + #countLen = len(counts) + #breakIndices = [int(round(bp*countLen)) for bp in breakPercs] + ## as with all histograms, the last break is always a special case (0-based array) + #breakIndices.append(countLen-1) + #breakVals = [counts[idx][0] for idx in breakIndices] +# # NaNs are encoded as -inf so they always are the first break # The first non-NaN value is at index 1 # If we have NaNs, we need one more bin, with the first non-Nan value - hasNan = False - if math.isinf(breakVals[0]): - hasNan = True - breakVals.insert(1, counts[1][0]) - logging.info("Field has NaN/Unknown values") - logging.debug("Breaks are: %s" % breakVals) + #hasNan = False + #if math.isinf(breakVals[0]): + #hasNan = True + #breakVals.insert(1, counts[1][0]) + #logging.info("Field has NaN/Unknown values") + #logging.debug("Breaks are: %s" % breakVals) - dArr, binCounts = findBins(numVals, breakVals, hasNan) - logging.info("Number of values per decile-bin: %s" % binCounts) + #dArr, binCounts = findBins(numVals, breakVals, hasNan) + #logging.info("Number of values per decile-bin: %s" % binCounts) # we should have 11 breaks/10 bins, or 12 breaks/11bins if we have NaN elements - assert((not hasNan and len(breakVals)==11) or (hasNan and len(breakVals)==12)) - assert((not hasNan and len(binCounts)==10) or (hasNan and len(binCounts)==11)) - assert((len(binCounts)+1 == len(breakVals))) - - fieldMeta["binMethod"] = "quantiles" - fieldMeta["binCounts"] = binCounts - if math.isinf(breakVals[0]): # -infinity is not valid in JSON - breakVals[0] = "Unknown" - fieldMeta["breaks"] = breakVals - - return dArr, fieldMeta - -def discretizeNumField(numVals, fieldMeta, numType): - " given a list of numbers, add attributes to fieldMeta that describe the binning scheme " - digArr, fieldMeta = discretizeArray(numVals, fieldMeta) + #assert((not hasNan and len(breakVals)==11) or (hasNan and len(breakVals)==12)) + #assert((not hasNan and len(binCounts)==10) or (hasNan and len(binCounts)==11)) + #assert((len(binCounts)+1 == len(breakVals))) +# + #fieldMeta["binMethod"] = "quantiles" + #fieldMeta["binCounts"] = binCounts + #if math.isinf(breakVals[0]): # -infinity is not valid in JSON + #breakVals[0] = "Unknown" + #fieldMeta["breaks"] = breakVals - #deciles, binCounts, newVals = getDecilesWithZeros(numVals) + #return dArr, fieldMeta - fieldMeta["arrType"] = "uint8" - fieldMeta["_fmt"] = "<B" - return digArr, fieldMeta +#def discretizeNumField(numVals, fieldMeta, numType): + #" given a list of numbers, add attributes to fieldMeta that describe the binning scheme " + #digArr, fieldMeta = discretizeArray(numVals, fieldMeta) +# + ##deciles, binCounts, newVals = getDecilesWithZeros(numVals) +# + #fieldMeta["arrType"] = "uint8" + #fieldMeta["_fmt"] = "<B" + #return digArr, fieldMeta def typeForStrings(strings): """ given a list of strings, determine if they're all ints or floats or strings """ floatCount = 0 intCount = 0 for val in strings: try: newVal = int(val) intCount += 1 except: try: newVal = float(val) floatCount += 1 except: @@ -1804,54 +1803,54 @@ def findBin(ranges, val): """ given an array of values, find the index i where ranges[i] < val <= ranges[i+1] ranges have to be sorted. This is a dumb brute force implementation - maybe binary search is faster, if ever revisit this again Someone said up to 10 binary search is not faster. """ if val==0: # speedup return 0 for i in range(0, len(ranges)): if (val < ranges[i]): return i # if doesn't fit in anywhere, return beyond last possible index return i+1 -def discretizeArr_uniform(arr, fieldMeta): - """ given an array of numbers, get min/max, create 10 bins between min and max then - translate the array to bins and return the list of bins - """ - arrMin = min(arr) - arrMax = max(arr) - stepSize = (arrMax-arrMin)/10.0 - - dArr = [0]*len(arr) - binCounts = [0]*10 - for i, x in enumerate(arr): - binIdx = int(round((x - arrMin)/stepSize)) - if x == arrMax: - binIdx = 9 - assert(binIdx <= 9) - dArr[i] = binIdx - binCounts[binIdx]+=1 - - fieldMeta["binMethod"] = "uniform" - fieldMeta["minVal"] = arrMin - fieldMeta["maxVal"] = arrMax - fieldMeta["stepSize"] = stepSize - fieldMeta["binCounts"] = binCounts - return dArr, fieldMeta +#def discretizeArr_uniform(arr, fieldMeta): + #""" given an array of numbers, get min/max, create 10 bins between min and max then + #translate the array to bins and return the list of bins + #""" + #arrMin = min(arr) + #arrMax = max(arr) + #stepSize = (arrMax-arrMin)/10.0 +# + #dArr = [0]*len(arr) + #binCounts = [0]*10 + #for i, x in enumerate(arr): + #binIdx = int(round((x - arrMin)/stepSize)) + #if x == arrMax: + #binIdx = 9 + #assert(binIdx <= 9) + #dArr[i] = binIdx + #binCounts[binIdx]+=1 +# + #fieldMeta["binMethod"] = "uniform" + #fieldMeta["minVal"] = arrMin + #fieldMeta["maxVal"] = arrMax + #fieldMeta["stepSize"] = stepSize + #fieldMeta["binCounts"] = binCounts + #return dArr, fieldMeta def digitize_py(arr, matType): """ calculate deciles ignoring 0s from arr, use these deciles to digitize the whole arr, return (digArr, zeroCount, bins). bins is an array of (min, max, count) There are at most 11 bins and bin0 is just for the value zero. For bin0, min and max are both 0.0 matType can be "int" or "float". If it is 'int' and arr has only <= 11 values, will not calculate deciles, but rather just count the numbers and use them to create bins, one per number. #>>> digitize_py([1,1,1,1,1,2,3,4,5,6,4,5,5,5,5], "float") """ if matType=="int": valCount = len(set(arr)) @@ -2051,57 +2050,61 @@ else: logging.debug("%s is not an MTX file" % path) return False def exprEncode(geneDesc, exprArr, matType): """ convert an array of numbers of type matType (int or float) to a compressed string of float32s The format of a record is: - 2 bytes: length of descStr, e.g. gene identifier or else - len(descStr) bytes: the descriptive string descStr - array of n 4-byte floats (n = number of cells) or 4-byte unsigned ints """ geneDesc = str(geneDesc) # make sure no unicode geneIdLen = struct.pack("<H", len(geneDesc)) + if matType=="float": + nanValue = FLOATNAN + else: + nanValue = INTNAN + # on cortex-dev, numpy was around 30% faster. Not a huge difference. if numpyLoaded: if matType=="float": exprArr = exprArr.astype("float32") elif matType=="int": exprArr = exprArr.astype("uint32") else: assert(False) # internal error exprStr = exprArr.tobytes() - #minVal = np.amin(exprArr[~np.isneginf(exprArr)]) - exprArr[np.isnan(exprArr)] = FLOATNAN + exprArr[np.isnan(exprArr)] = nanValue minVal = np.amin(exprArr) - # cortex-dev-splicing/psi has NAN values in the mtx file + # e.g. cortex-dev-splicing/psi has NAN values in the mtx file else: if matType=="float": arrType = "f" elif matType=="int" or matType=="forceInt": arrType = "L" else: assert(False) # internal error # if as too-old numpy version is loaded isNumpy is false, but the type may # still be a numpy array if we loaded from MTX -> force to a list if str(type(exprArr))=="<type 'numpy.ndarray'>": exprArr = exprArr.tolist()[0] - exprArr = [FLOATNAN if math.isnan(x) else x for x in exprArr] + exprArr = [nanValue if math.isnan(x) else x for x in exprArr] # Python 3.9 removed tostring() if sys.version_info >= (3, 2): exprStr = array.array(arrType, exprArr).tobytes() else: exprStr = array.array(arrType, exprArr).tostring() #minVal = min([x for x in exprArr if not (math.isinf(x) and x < 0)]) # this is super slow... minVal = min(exprArr) if isPy3: geneStr = geneIdLen+bytes(geneDesc, encoding="ascii")+exprStr else: geneStr = geneIdLen+geneDesc+exprStr geneCompr = zlib.compress(geneStr)