a83d99d31bc55dcad6f349b43ca2ebeac4dcb870 max Thu Oct 23 07:00:13 2025 -0700 handling the case uint32 matrix and Nans with numpy > 2, cortical-lineage-perturb-44tf/lr-clone-level, Marc diff --git src/cbPyLib/cellbrowser/cellbrowser.py src/cbPyLib/cellbrowser/cellbrowser.py index 0005751..6417f7b 100755 --- src/cbPyLib/cellbrowser/cellbrowser.py +++ src/cbPyLib/cellbrowser/cellbrowser.py @@ -2060,37 +2060,41 @@ - array of n 4-byte floats (n = number of cells) or 4-byte unsigned ints """ geneDesc = str(geneDesc) # make sure no unicode geneIdLen = struct.pack("<H", len(geneDesc)) nanValue = None if matType=="float": nanValue = FLOATNAN else: nanValue = INTNAN # on cortex-dev, numpy was around 30% faster. Not a huge difference. if numpyLoaded: if matType=="float": exprArr = exprArr.astype("float32") + exprArr[np.isnan(exprArr)] = nanValue elif matType=="int": + has_nan = np.any(np.isnan(exprArr)) + if has_nan: + assert(False) # integer matrices with Nans are not supported right now. Email the programmer. exprArr = exprArr.astype("uint32") else: assert(False) # internal error - exprStr = exprArr.tobytes() - exprArr[np.isnan(exprArr)] = nanValue + minVal = np.amin(exprArr) + exprStr = exprArr.tobytes() # e.g. cortex-dev-splicing/psi has NAN values in the mtx file else: if matType=="float": arrType = "f" elif matType=="int" or matType=="forceInt": arrType = "L" else: assert(False) # internal error # if as too-old numpy version is loaded isNumpy is false, but the type may # still be a numpy array if we loaded from MTX -> force to a list if str(type(exprArr))=="<type 'numpy.ndarray'>": exprArr = exprArr.tolist()[0] exprArr = [nanValue if math.isnan(x) else x for x in exprArr] @@ -3673,31 +3677,30 @@ if geneOrSym in geneToSym: geneId = geneOrSym sym = geneToSym[geneId] elif geneOrSym in symToGene: sym = geneOrSym geneId = symToGene[sym] else: errAbort("Gene %s in quickgenes file is neither a symbol nor a geneId" % repr(geneOrSym)) if sym in nonUniqueSyms: logging.warn("Symb %s in quick genes files is not unique. Which of these genes do you mean: %s. " "Use geneId|sym in quickgenes file to solve this." % (sym, ",".join(nonUniqueSyms[sym]))) geneStr = geneId+"|"+sym # case 5: it is an ATAC dataset and the quickgenes file has ranges elif ":" in geneOrSym and "-" in geneOrSym: - #geneStr = geneOrSym.replace(":", "|").replace("-", "|") geneStr = findOverlappingRanges(matrixGeneIds, geneOrSym) if geneStr=="": logging.error("quickGene ATAC range %s does not contain any peak in the matrix, skipping it" % geneOrSym) continue else: logging.info("Gene %s in quickgenes file is not in expr matrix and there is no geneId<->symbol mapping " "to resolve it to a geneId in the expression matrix and it is not an ATAC range" % repr(geneOrSym)) continue # if we had no geneToSym, we'll check the symbol later if it's valid info = [geneStr] if len(row)>1: @@ -6907,32 +6910,54 @@ sc.pp.regress_out(adata, ['n_counts']) #Scaling after regression maxValue = conf["regressMax"] pipeLog('Scaling data, max_value=%d' % maxValue) sc.pp.scale(adata, max_value=maxValue) if conf["doPca"]: pcCount = conf["pcCount"] if pcCount=="auto": firstPcCount = 100 else: firstPcCount = pcCount + X = adata.X.A if hasattr(adata.X, "A") else adata.X # handle sparse matrices + pipeLog('Performing initial PCA, number of PCs: %d' % firstPcCount) + pipeLog("Has NaN:"+str(np.isnan(X).any())) + pipeLog("Has +inf:"+str(np.isposinf(X).any())) + pipeLog("Has -inf:"+ str(np.isneginf(X).any())) + pipeLog("Max value:"+ str(np.nanmax(X))) + if np.isposinf(X).any(): + logging.warn("Matrix has +inf values, replacing with 2^32") + X[np.isinf(X)] = np.nan # turn inf into NaN + X = np.nan_to_num(X, nan=2**32) # replace NaN with 2E6 + adata.X = X + sc.tl.pca(adata, n_comps=firstPcCount) + # X[np.isinf(X)] = np.nan # turn inf into NaN + #X = np.nan_to_num(X, nan=0.0) # replace NaN with 0 + #adata.X = X + + # suggestion - Check after normalization: + # sc.pp.normalize_total(adata, target_sum=1e4) + # print(np.isnan(adata.X).any()) + # suggestion2 - Before scaling, clip extreme values: + # sc.pp.scale(adata, max_value=10) + #Multiply by -1 to compare with Seurat #adata.obsm['X_pca'] *= -1 #Plot of pca variance ratio to see if formula matches visual determination of pc_nb to use sc.pl.pca_variance_ratio(adata, log=True) #Computing number of PCs to be used in clustering if pcCount == "auto": pipeLog("Estimating number of useful PCs based on Shekar et al, Cell 2016") pipeLog("PC weight cutoff used is (sqrt(# of Genes/# of cells) + 1)^2") pipeLog("See http://www.cell.com/cell/fulltext/S0092-8674(16)31007-8, STAR methods") pc_cutoff = ( np.sqrt(float(len(adata.var))/len(adata.obs)) +1 ) **2 pc_nb = 0 for i in adata.uns['pca']['variance']: if i > pc_cutoff: pc_nb+=1