src/cbPyLib/cellbrowser/cellbrowser.py a83d99d31bc55dcad6f349b43ca2ebeac4dcb870

a83d99d31bc55dcad6f349b43ca2ebeac4dcb870
max
  Thu Oct 23 07:00:13 2025 -0700
handling the case uint32 matrix and Nans with numpy > 2, cortical-lineage-perturb-44tf/lr-clone-level, Marc

diff --git src/cbPyLib/cellbrowser/cellbrowser.py src/cbPyLib/cellbrowser/cellbrowser.py
index 0005751..6417f7b 100755
--- src/cbPyLib/cellbrowser/cellbrowser.py
+++ src/cbPyLib/cellbrowser/cellbrowser.py
@@ -2060,37 +2060,41 @@
     - array of n 4-byte floats (n = number of cells) or 4-byte unsigned ints
     """
     geneDesc = str(geneDesc) # make sure no unicode
     geneIdLen = struct.pack("<H", len(geneDesc))
 
     nanValue = None
     if matType=="float":
         nanValue = FLOATNAN
     else:
         nanValue = INTNAN
 
     # on cortex-dev, numpy was around 30% faster. Not a huge difference.
     if numpyLoaded:
         if matType=="float":
             exprArr = exprArr.astype("float32")
+            exprArr[np.isnan(exprArr)] = nanValue
         elif matType=="int":
+            has_nan = np.any(np.isnan(exprArr))
+            if has_nan:
+                assert(False) # integer matrices with Nans are not supported right now. Email the programmer.
             exprArr = exprArr.astype("uint32")
         else:
             assert(False) # internal error
-        exprStr = exprArr.tobytes()
-        exprArr[np.isnan(exprArr)] = nanValue
+
         minVal = np.amin(exprArr)
+        exprStr = exprArr.tobytes()
         # e.g. cortex-dev-splicing/psi has NAN values in the mtx file
     else:
         if matType=="float":
             arrType = "f"
         elif matType=="int" or matType=="forceInt":
             arrType = "L"
         else:
             assert(False) # internal error
 
         # if as too-old numpy version is loaded isNumpy is false, but the type may
         # still be a numpy array if we loaded from MTX -> force to a list
         if str(type(exprArr))=="<type 'numpy.ndarray'>":
             exprArr = exprArr.tolist()[0]
 
         exprArr = [nanValue if math.isnan(x) else x for x in exprArr]
@@ -3673,31 +3677,30 @@
             if geneOrSym in geneToSym:
                 geneId = geneOrSym
                 sym = geneToSym[geneId]
             elif geneOrSym in symToGene:
                 sym = geneOrSym
                 geneId = symToGene[sym]
             else:
                 errAbort("Gene %s in quickgenes file is neither a symbol nor a geneId" % repr(geneOrSym))
             if sym in nonUniqueSyms:
                 logging.warn("Symb %s in quick genes files is not unique. Which of these genes do you mean: %s. "
                         "Use geneId|sym in quickgenes file to solve this." % (sym, ",".join(nonUniqueSyms[sym])))
             geneStr = geneId+"|"+sym
 
         # case 5: it is an ATAC dataset and the quickgenes file has ranges
         elif ":" in geneOrSym and "-" in geneOrSym:
-            #geneStr = geneOrSym.replace(":", "|").replace("-", "|")
             geneStr = findOverlappingRanges(matrixGeneIds, geneOrSym)
             if geneStr=="":
                 logging.error("quickGene ATAC range %s does not contain any peak in the matrix, skipping it" % geneOrSym)
                 continue
 
         else:
             logging.info("Gene %s in quickgenes file is not in expr matrix and there is no geneId<->symbol mapping "
                 "to resolve it to a geneId in the expression matrix and it is not an ATAC range" % repr(geneOrSym))
             continue
 
         # if we had no geneToSym, we'll check the symbol later if it's valid
 
         info = [geneStr]
 
         if len(row)>1:
@@ -6907,32 +6910,54 @@
             sc.pp.regress_out(adata, ['n_counts'])
 
         #Scaling after regression 
         maxValue = conf["regressMax"]
         pipeLog('Scaling data, max_value=%d' % maxValue)
         sc.pp.scale(adata, max_value=maxValue)
 
     if conf["doPca"]:
         pcCount = conf["pcCount"]
 
         if pcCount=="auto":
             firstPcCount = 100
         else:
             firstPcCount = pcCount
 
+        X = adata.X.A if hasattr(adata.X, "A") else adata.X  # handle sparse matrices
+
         pipeLog('Performing initial PCA, number of PCs: %d' % firstPcCount)
+        pipeLog("Has NaN:"+str(np.isnan(X).any()))
+        pipeLog("Has +inf:"+str(np.isposinf(X).any()))
+        pipeLog("Has -inf:"+ str(np.isneginf(X).any()))
+        pipeLog("Max value:"+ str(np.nanmax(X)))
+        if np.isposinf(X).any():
+            logging.warn("Matrix has +inf values, replacing with 2^32")
+            X[np.isinf(X)] = np.nan  # turn inf into NaN
+            X = np.nan_to_num(X, nan=2**32)  # replace NaN with 2E6
+            adata.X = X
+
         sc.tl.pca(adata, n_comps=firstPcCount)
+        # X[np.isinf(X)] = np.nan  # turn inf into NaN
+        #X = np.nan_to_num(X, nan=0.0)  # replace NaN with 0
+        #adata.X = X
+
+        # suggestion - Check after normalization:
+        # sc.pp.normalize_total(adata, target_sum=1e4)
+        # print(np.isnan(adata.X).any())
+        # suggestion2 - Before scaling, clip extreme values:
+        # sc.pp.scale(adata, max_value=10)
+
         #Multiply by -1 to compare with Seurat
         #adata.obsm['X_pca'] *= -1
         #Plot of pca variance ratio to see if formula matches visual determination of pc_nb to use
         sc.pl.pca_variance_ratio(adata, log=True)
 
         #Computing number of PCs to be used in clustering
         if pcCount == "auto":
             pipeLog("Estimating number of useful PCs based on Shekar et al, Cell 2016")
             pipeLog("PC weight cutoff used is (sqrt(# of Genes/# of cells) + 1)^2")
             pipeLog("See http://www.cell.com/cell/fulltext/S0092-8674(16)31007-8, STAR methods")
             pc_cutoff = ( np.sqrt(float(len(adata.var))/len(adata.obs)) +1 ) **2
             pc_nb = 0
             for i in adata.uns['pca']['variance']:
                 if i > pc_cutoff:
                     pc_nb+=1