a3e5065af17b8da62cf1fe1b310877c60fef13a8 max Wed Jul 2 11:05:57 2025 -0700 fixing int/float nan value replacement, refs #26440 diff --git src/cbPyLib/cellbrowser/cellbrowser.py src/cbPyLib/cellbrowser/cellbrowser.py index cfec862..877ce8f 100755 --- src/cbPyLib/cellbrowser/cellbrowser.py +++ src/cbPyLib/cellbrowser/cellbrowser.py @@ -74,33 +74,32 @@ # the default html dir, used if the --htmlDir option is set but empty # this variable is initialized below (no forward declaration in Python) # just before cbBuild_parseArgs defOutDir = None CBHOMEURL = "https://cells.ucsc.edu/downloads/cellbrowserData/" CBHOMEURL_TEST = "https://cells-test.gi.ucsc.edu/downloads/cellbrowserData/" # a special value that is used for both x and y to indicate that the cell should not be shown # must match the same value in maxPlot.js HIDDENCOORD = 12345 # special value representing NaN in floating point arrays # sorting is used everywhere in the code, and sorting is fast, so we replace all NaNs with -inf # This means that all arrays can be sorted with the normal, fast javascript functions -# must match the same value in cellBrowser.js +# This must match the same value in cellBrowser.js FLOATNAN = float('-inf') - # special value representing NaN in integer arrays, again, we want this to be first after sorting # must match the same value in cellBrowser.js INTNAN = -2**16 # how many md5 characters to keep in version identifiers. We load all files using their md5 to get around # internet browser caching MD5LEN = 10 # list of tags that are required: # for cellbrowser.conf of a dataset reqTagsDataset =['coords', 'meta'] # for cellbrowser.conf of a collection reqTagsColl =['shortLabel'] coordLabels = { @@ -918,33 +917,33 @@ # my new files are smaller and have headers elif line1=="#geneId\tsymbol" or fieldCount==2: d = {} for row in lineFileNextRow(fname): if row.symbol=="": continue geneId = row.geneId if geneId.startswith("EN") and "." in geneId: geneId = geneId.split(".")[0] d[geneId]=row.symbol else: assert(False) # symbols file does not have a header like #geneIdsymbol logging.debug("Found symbols for %d genes" % len(d)) return d -def getDecilesList_np(values): - deciles = np.percentile( values, [0,10,20,30,40,50,60,70,80,90,100] ) - return deciles +#def getDecilesList_np(values): + #deciles = np.percentile( values, [0,10,20,30,40,50,60,70,80,90,100] ) + #return deciles def bytesAndFmt(x): """ how many bytes do we need to store x values and what is the sprintf format string for it? Return tuple of Javascript type and python struct type. See : Javascript typed array names, https://developer.mozilla.org/en-US/docs/Web/JavaScript/Typed_arrays https://docs.python.org/2/library/struct.html """ if x > 65535: return "Uint32", " 255: return "Uint16", ">> l = [0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4,5,6,7,8,9,10] # >>> getDecilesWithZeros(l) # ([1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) # """ # nonZeros = [x for x in numVals if x!=0.0] # # zeroCount = len(numVals) - len(nonZeros) # deciles = getDecilesList_np(nonZeros) # # decArr = np.searchsorted(deciles, numVals) # decCounts(deciles, nonZeros) # # decCounts.insert(0, zeroCount) # return deciles, decCounts, newVals -def findBins(numVals, breakVals, hasNan): - """ - find the right bin index defined by breakVals for every value in numVals. - Special handling for the last value. The comparison uses "<=". The first - break is assumed to be the minimum of numVals and is therefore ignored. - Also returns an array with the count for every bin. hasNan triggers a special - mode where the first bin is reserved for NaNs (encoded as -inf) - >>> findBins([1,1,1,2,2,2,3,3,4,4,5,5,6,6], [1, 2,3,5,6]) - ([0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3], [6, 2, 4, 2]) - """ - bArr = [] - binCounts = [0]*(len(breakVals)-1) - - # NaNs mean that the non-NaN bins are all +1 - if hasNan: - breaks = breakVals[2:] - else: - breaks = breakVals[1:] - - if hasNan: - for x in numVals: - # we use -inf for the NaN value everywhere, because sorting is undefined in lists that contain NaN - if math.isinf(x): - binIdx = 0 - else: - binIdx = bisect.bisect_left(breaks, x)+1 - bArr.append(binIdx) - binCounts[binIdx]+=1 - else: - for x in numVals: - binIdx = bisect.bisect_left(breaks, x) # comparison operator is <= - bArr.append(binIdx) - binCounts[binIdx]+=1 - - return bArr, binCounts - -def countBinsBetweenBreaks(numVals, breakVals): - """ count how many numVals fall into the bins defined by breakVals. - Special handling for the last value. Comparison uses "<=". The first - break is assumed to be the minimum of numVals. - Also returns an array with the bin for every element in numVals - >>> countBinsBetweenBreaks([1,1,1,2,2,2,3,3,4,4,5,5,6,6], [1,2,3,5,6]) - ([6, 2, 4, 2], [0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3]) - """ - - binCounts = [] - binCount = 0 - i = 1 - dArr = [] - for x in numVals: - if x <= breakVals[i]: - binCount+=1 - else: - binCounts.append(binCount) - binCount = 1 - i += 1 - dArr.append(i-1) - - binCounts.append(binCount) - - assert(len(dArr)==len(numVals)) - assert(len(binCounts)==len(breakVals)-1) - return binCounts, dArr - -def discretizeArray(numVals, fieldMeta): - """ - discretize numeric values based on quantiles. - """ - maxBinCount = 10 - counts = Counter(numVals).most_common() - counts.sort() # sort by value, not count - - if len(counts) < maxBinCount: - # if we have just a few values, do not do any binning - binCounts = [y for x,y in counts] - values = [x for x,y in counts] - - valToBin = {} - for i, x in enumerate(values): - valToBin[x] = i - - dArr = [valToBin[x] for x in numVals] +#def findBins(numVals, breakVals, hasNan): + #""" + #find the right bin index defined by breakVals for every value in numVals. + #Special handling for the last value. The comparison uses "<=". The first + #break is assumed to be the minimum of numVals and is therefore ignored. + #Also returns an array with the count for every bin. hasNan triggers a special + #mode where the first bin is reserved for NaNs (encoded as -inf) + #>>> findBins([1,1,1,2,2,2,3,3,4,4,5,5,6,6], [1, 2,3,5,6]) + #([0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3], [6, 2, 4, 2]) + #""" + #bArr = [] + #binCounts = [0]*(len(breakVals)-1) +# + ## NaNs mean that the non-NaN bins are all +1 + #if hasNan: + #breaks = breakVals[2:] + #else: + #breaks = breakVals[1:] +# + #if hasNan: + #for x in numVals: + ## we use -inf for the NaN value everywhere, because sorting is undefined in lists that contain NaN + #if math.isinf(x): + #binIdx = 0 + #else: + #binIdx = bisect.bisect_left(breaks, x)+1 + #bArr.append(binIdx) + #binCounts[binIdx]+=1 + #else: + #for x in numVals: + #binIdx = bisect.bisect_left(breaks, x) # comparison operator is <= + #bArr.append(binIdx) + #binCounts[binIdx]+=1 +# + #return bArr, binCounts + +#def countBinsBetweenBreaks(numVals, breakVals): + #""" count how many numVals fall into the bins defined by breakVals. + #Special handling for the last value. Comparison uses "<=". The first + #break is assumed to be the minimum of numVals. + #Also returns an array with the bin for every element in numVals + #>>> countBinsBetweenBreaks([1,1,1,2,2,2,3,3,4,4,5,5,6,6], [1,2,3,5,6]) + #([6, 2, 4, 2], [0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3]) + #""" +# + #binCounts = [] + #binCount = 0 + #i = 1 + #dArr = [] + #for x in numVals: + #if x <= breakVals[i]: + #binCount+=1 + #else: + #binCounts.append(binCount) + #binCount = 1 + #i += 1 + #dArr.append(i-1) +# + #binCounts.append(binCount) +# + #assert(len(dArr)==len(numVals)) + #assert(len(binCounts)==len(breakVals)-1) + #return binCounts, dArr - fieldMeta["binMethod"] = "raw" - fieldMeta["values"] = values - fieldMeta["binCounts"] = binCounts - return dArr, fieldMeta +#def discretizeArray(numVals, fieldMeta): + #""" + #discretize numeric values based on quantiles. + #""" + #maxBinCount = 10 + #counts = Counter(numVals).most_common() + #counts.sort() # sort by value, not count +# + #if len(counts) < maxBinCount: + ## if we have just a few values, do not do any binning + #binCounts = [y for x,y in counts] + #values = [x for x,y in counts] +# + #valToBin = {} + #for i, x in enumerate(values): + #valToBin[x] = i +# + #dArr = [valToBin[x] for x in numVals] +# + #fieldMeta["binMethod"] = "raw" + #fieldMeta["values"] = values + #fieldMeta["binCounts"] = binCounts + #return dArr, fieldMeta # ten breaks - breakPercs = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] - countLen = len(counts) - breakIndices = [int(round(bp*countLen)) for bp in breakPercs] - # as with all histograms, the last break is always a special case (0-based array) - breakIndices.append(countLen-1) - breakVals = [counts[idx][0] for idx in breakIndices] - + #breakPercs = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] + #countLen = len(counts) + #breakIndices = [int(round(bp*countLen)) for bp in breakPercs] + ## as with all histograms, the last break is always a special case (0-based array) + #breakIndices.append(countLen-1) + #breakVals = [counts[idx][0] for idx in breakIndices] +# # NaNs are encoded as -inf so they always are the first break # The first non-NaN value is at index 1 # If we have NaNs, we need one more bin, with the first non-Nan value - hasNan = False - if math.isinf(breakVals[0]): - hasNan = True - breakVals.insert(1, counts[1][0]) - logging.info("Field has NaN/Unknown values") - logging.debug("Breaks are: %s" % breakVals) + #hasNan = False + #if math.isinf(breakVals[0]): + #hasNan = True + #breakVals.insert(1, counts[1][0]) + #logging.info("Field has NaN/Unknown values") + #logging.debug("Breaks are: %s" % breakVals) - dArr, binCounts = findBins(numVals, breakVals, hasNan) - logging.info("Number of values per decile-bin: %s" % binCounts) + #dArr, binCounts = findBins(numVals, breakVals, hasNan) + #logging.info("Number of values per decile-bin: %s" % binCounts) # we should have 11 breaks/10 bins, or 12 breaks/11bins if we have NaN elements - assert((not hasNan and len(breakVals)==11) or (hasNan and len(breakVals)==12)) - assert((not hasNan and len(binCounts)==10) or (hasNan and len(binCounts)==11)) - assert((len(binCounts)+1 == len(breakVals))) - - fieldMeta["binMethod"] = "quantiles" - fieldMeta["binCounts"] = binCounts - if math.isinf(breakVals[0]): # -infinity is not valid in JSON - breakVals[0] = "Unknown" - fieldMeta["breaks"] = breakVals - - return dArr, fieldMeta - -def discretizeNumField(numVals, fieldMeta, numType): - " given a list of numbers, add attributes to fieldMeta that describe the binning scheme " - digArr, fieldMeta = discretizeArray(numVals, fieldMeta) + #assert((not hasNan and len(breakVals)==11) or (hasNan and len(breakVals)==12)) + #assert((not hasNan and len(binCounts)==10) or (hasNan and len(binCounts)==11)) + #assert((len(binCounts)+1 == len(breakVals))) +# + #fieldMeta["binMethod"] = "quantiles" + #fieldMeta["binCounts"] = binCounts + #if math.isinf(breakVals[0]): # -infinity is not valid in JSON + #breakVals[0] = "Unknown" + #fieldMeta["breaks"] = breakVals - #deciles, binCounts, newVals = getDecilesWithZeros(numVals) + #return dArr, fieldMeta - fieldMeta["arrType"] = "uint8" - fieldMeta["_fmt"] = ">> digitize_py([1,1,1,1,1,2,3,4,5,6,4,5,5,5,5], "float") """ if matType=="int": valCount = len(set(arr)) @@ -2051,57 +2050,61 @@ else: logging.debug("%s is not an MTX file" % path) return False def exprEncode(geneDesc, exprArr, matType): """ convert an array of numbers of type matType (int or float) to a compressed string of float32s The format of a record is: - 2 bytes: length of descStr, e.g. gene identifier or else - len(descStr) bytes: the descriptive string descStr - array of n 4-byte floats (n = number of cells) or 4-byte unsigned ints """ geneDesc = str(geneDesc) # make sure no unicode geneIdLen = struct.pack(" force to a list if str(type(exprArr))=="": exprArr = exprArr.tolist()[0] - exprArr = [FLOATNAN if math.isnan(x) else x for x in exprArr] + exprArr = [nanValue if math.isnan(x) else x for x in exprArr] # Python 3.9 removed tostring() if sys.version_info >= (3, 2): exprStr = array.array(arrType, exprArr).tobytes() else: exprStr = array.array(arrType, exprArr).tostring() #minVal = min([x for x in exprArr if not (math.isinf(x) and x < 0)]) # this is super slow... minVal = min(exprArr) if isPy3: geneStr = geneIdLen+bytes(geneDesc, encoding="ascii")+exprStr else: geneStr = geneIdLen+geneDesc+exprStr geneCompr = zlib.compress(geneStr)