src/cbPyLib/cellbrowser/cellbrowser.py 7c04a101e90c131afccf4d973d2e76da384543b8

7c04a101e90c131afccf4d973d2e76da384543b8
max
  Fri May 16 11:11:00 2025 -0700
do not stop on invalid genes in quickgenes file. skip invalid genes in marker file

diff --git src/cbPyLib/cellbrowser/cellbrowser.py src/cbPyLib/cellbrowser/cellbrowser.py
index 925a311..3c77bf4 100755
--- src/cbPyLib/cellbrowser/cellbrowser.py
+++ src/cbPyLib/cellbrowser/cellbrowser.py
@@ -2981,61 +2981,71 @@
         errAbort("Your marker file has more than 1000 clusters. Are you sure that this is correct? The input format is (clusterName, geneSymName, Score), is it possible that you have inversed the order of cluster and gene?")
 
     # determine if the score field is most likely a p-value, needed for sorting
     revSort = True
     scoreHeader = headers[scoreIdx].lower()
     logging.debug("score field has name '%s'" % scoreHeader)
     if scoreHeader in ["p_val", "p-val", "p.val", "pval", "fdr"]:
         logging.debug("score field name '%s' looks like a p-Value, sorting normally" % scoreHeader)
         revSort = False
 
     for clusterName, rows in iterItems(data):
         rows.sort(key=operator.itemgetter(2), reverse=revSort)
 
     return data, newHeaders
 
-def splitMarkerTable(filename, geneToSym, outDir):
+def splitMarkerTable(filename, geneToSym, matrixGeneIds, outDir):
     """ split .tsv on first field and create many files in outDir with columns 2-end.
         Returns the names of the clusters and a dict topMarkers with clusterName -> list of five top marker genes.
     """
     topMarkerCount = 5
 
     if filename is None:
         return
 
     data, newHeaders = parseMarkerTable(filename, geneToSym)
 
     logging.debug("Splitting cluster markers into directory %s" % (outDir))
     fileCount = 0
     sanNames = set()
     topMarkers = {}
     for clusterName, rows in iterItems(data):
         logging.debug("Cluster: %s" % clusterName)
         sanName = sanitizeName(clusterName)
         assert(sanName not in sanNames) # after removing special chars, cluster names must still be unique. this is most likely due to typos in your meta annotation table. If it is not, we need to change the code here.
         sanNames.add(sanName)
 
         outFname = join(outDir, sanName+".tsv")
         logging.debug("Writing %s" % outFname)
         ofh = open(outFname, "w")
         ofh.write("\t".join(newHeaders))
         ofh.write("\n")
+        missGeneIds = set()
         for row in rows:
             row[2] = "%0.5E" % row[2] # limit score to 5 digits
+            geneId = row[1]
+            if geneId not in matrixGeneIds:
+                missGeneIds.add(geneId)
+                continue
+
             ofh.write("\t".join(row))
             ofh.write("\n")
 
+        if len(missGeneIds)!=0:
+            logging.error("Marker table contains these genes, they were skipped, they are not in the matrix: %s" % (",".join(missGeneIds)))
+            #logging.error("Use --force to accept this.")
+
         topSyms = [row[1] for row in rows[:topMarkerCount]]
         topMarkers[clusterName] = topSyms
 
         ofh.close()
 
         runGzip(outFname)
 
         fileCount += 1
     logging.info("Wrote %d .tsv.gz files into directory %s" % (fileCount, outDir))
     return data.keys(), topMarkers
 
 def syncFiles(inFnames, outDir):
     " compare filesizes and copy to outDir, if needed "
     logging.info("Syncing %s to %s" % (inFnames, outDir))
     for inFname in inFnames:
@@ -3560,38 +3570,42 @@
         if len(line)==0:
             continue
 
         hasDesc = False
         hasPmid = False
         if line.startswith("symbol"):
             continue
         row = line.split(sep)
         geneOrSym = row[0]
 
         # case 1: user provides both geneId and symbol. Rare.
         # Necessary when symbol <-> geneId is not unique
         if "|" in geneOrSym:
             geneId, sym = geneOrSym.split("|")
             if geneId not in matrixGeneIds:
-                errAbort("geneId %s in quickgenes file is not in expression matrix" % repr(geneId))
+                logging.info("case 1: geneId %s in quickgenes file is not in expression matrix" % repr(geneId))
+                continue
             geneStr = geneOrSym
 
         # case 2: matrix has only symbols and user provides symbol. This is our legacy format for old datasets.
         # store only the symbol. We could look up the geneId but that's data inference, 
         # which we try not to do. The lookup could be wrong.
-        elif matrixSyms is not None and geneOrSym in matrixSyms and len(matrixGeneIds)==0:
+        elif matrixSyms is not None and geneOrSym in matrixSyms:
             geneStr = geneOrSym
+            if geneStr not in matrixGeneIds:
+                logging.info("case 2: geneId %s in quickgenes file is not in expression matrix" % repr(geneStr))
+                continue
 
         # case 3: matrix has geneIds and user provides a geneId. add the symbol from our mapping
         # that's data inference, but that should be OK
         elif geneOrSym in matrixGeneIds:
             geneId = geneOrSym
             if not geneToSym:
                 logging.info("quick gene %s has a geneId but we have no geneId/symbol table. You can use "
                         "the format geneId|symbol in the quick genes file to manually assign a label" % repr(geneId))
                 geneStr = geneId
             else:
                 if geneId in geneToSym:
                     geneStr = geneId+"|"+geneToSym[geneId]
                 else:
                     geneStr = geneId
 
@@ -3609,32 +3623,33 @@
                 errAbort("Gene %s in quickgenes file is neither a symbol nor a geneId" % repr(geneOrSym))
             if sym in nonUniqueSyms:
                 logging.warn("Symb %s in quick genes files is not unique. Which of these genes do you mean: %s. "
                         "Use geneId|sym in quickgenes file to solve this." % (sym, ",".join(nonUniqueSyms[sym])))
             geneStr = geneId+"|"+sym
 
         # case 5: it is an ATAC dataset and the quickgenes file has ranges
         elif ":" in geneOrSym and "-" in geneOrSym:
             #geneStr = geneOrSym.replace(":", "|").replace("-", "|")
             geneStr = findOverlappingRanges(matrixGeneIds, geneOrSym)
             if geneStr=="":
                 logging.error("quickGene ATAC range %s does not contain any peak in the matrix, skipping it" % geneOrSym)
                 continue
 
         else:
-            errAbort("Gene '%s' in quickgenes file is not in expr matrix and there is no geneId<->symbol mapping "
+            logging.info("Gene %s in quickgenes file is not in expr matrix and there is no geneId<->symbol mapping "
                 "to resolve it to a geneId in the expression matrix and it is not an ATAC range" % repr(geneOrSym))
+            continue
 
         # if we had no geneToSym, we'll check the symbol later if it's valid
 
         info = [geneStr]
 
         if len(row)>1:
             info.append(row[1])
         if len(row)>2:
             info.append(row[2])
         geneInfo.append(info)
 
     return geneInfo
 
 def readSampleNames(fname):
     " read only the first column of fname, strip the headers "
@@ -3958,55 +3973,55 @@
     notInLabels = markerClusters - labelClusters
     notInMarkers = labelClusters - markerClusters
     if len(notInMarkers)!=0:
         msg = ("%s: the following cluster names are in the meta file but not in the marker file: %s. "+
         "Please fix one of the files, clicks onto a label will otherwise not work.") % (markerFname, notInMarkers)
         if doAbort:
             errAbort(msg)
         else:
             logging.warn(msg)
 
     if len(notInLabels)!=0:
         logging.warn(("%s: the following cluster names are in the marker file but not in the meta file: %s. "+
                 "Users may not notice the problem, but it may indicate an erroneous meta data file.") % \
                 (markerFname, notInLabels))
 
-def convertMarkers(inConf, outConf, geneToSym, clusterLabels, outDir):
+def convertMarkers(inConf, outConf, geneToSym, clusterLabels, matrixGeneIds, outDir):
     """ split the marker tables into one file per cluster and add filenames as 'markers' in outConf
     also add the 'topMarkers' to outConf, the top five markers for every cluster.
     """
     markerFnames = []
     if "markers" in inConf:
         markerFnames = makeAbsDict(inConf, "markers")
 
     newMarkers = []
     #doAbort = True # only the first marker file leads to abort, we're more tolerant for the others
     doAbort = False # temp hack # because of single cell cluster filtering in cbScanpy
     topMarkersDone = False
     for markerIdx, markerInfo in enumerate(markerFnames):
         
         if type(markerInfo)!=type(dict()):
             errAbort("The 'markers' setting in cellbrowser.conf is not a dictionary but must be a dictionary with keys 'file' and 'label'.")
 
         markerFname = markerInfo["file"]
         markerLabel = markerInfo["shortLabel"]
 
         clusterName = "markers_%d" % markerIdx # use sha1 of input file ?
         markerDir = join(outDir, "markers", clusterName)
         makeDir(markerDir)
 
-        clusterNames, topMarkers = splitMarkerTable(markerFname, geneToSym, markerDir)
+        clusterNames, topMarkers = splitMarkerTable(markerFname, geneToSym, matrixGeneIds, markerDir)
         # only use the top markers of the first marker file
         if not topMarkersDone:
             outConf["topMarkers"] = topMarkers
             topMarkersDone = True
 
         checkClusterNames(markerFname, clusterNames, clusterLabels, doAbort)
         doAbort = False
 
         newDict = {"name" : sanitizeName(clusterName), "shortLabel" : markerLabel}
         if "selectOnClick" in markerInfo:
             newDict["selectOnClick"] = markerInfo["selectOnClick"]
         newMarkers.append( newDict )
 
     outConf["markers"] = newMarkers
 
@@ -4048,62 +4063,53 @@
         if "|" in g:
             parts = g.split("|")
             geneId = parts[0]
             sym = parts[1]
             syms.append( sym )
             geneIds.append(geneId)
             hasBoth = True
             geneToSym[geneId] = sym
         else:
             symsOrGeneIds.append( g )
 
     if hasBoth:
         logging.debug("Matrix has both geneIds and symbols")
         geneIds.extend(symsOrGeneIds)
     else:
-        #logging.debug("Matrix does not have both geneIds and symbols, it contains either geneIds only or symbols only")
-        #if areProbablyGeneIds(syms):
-            #logging("80% look like geneIds: Using only the identifiers from the matrix")
-            #geneIds = symsOrGeneIds
-            #syms = geneToSym.values()
-        #else:
-            #logging("matrix identifiers do not look like geneIds: assume they are all symbols")
         logging.debug("Matrix does not have both geneIds and symbols, assuming it contains only geneIds")
         geneIds = symsOrGeneIds
         syms = []
 
     if len(geneToSym)==0:
         logging.info("There are no gene/symbol pairs in the matrix")
         geneToSym = None
 
     symSet = set(syms)
 
     #symCounts = Counter(syms).most_common()
     #for sym, count in symCounts:
         #if count > 1:
             #logging.warn("Symbol %d is not unique" % count)
 
     return symSet, set(geneIds), geneToSym
 
-def readQuickGenes(inConf, geneToSym, outDir, outConf):
+def readQuickGenes(inConf, geneToSym, matrixSyms, matrixGeneIds, geneToSymFromMatrix, outDir, outConf):
     " read quick genes file and make sure that the genes in it are in the matrix "
     quickGeneFname = inConf.get("quickGenesFile")
     if not quickGeneFname:
         return
 
-    matrixSyms, matrixGeneIds, geneToSymFromMatrix = readValidGenes(outDir, inConf)
-
     # prefer the symbols from the matrix over our own symbol tables
     if geneToSymFromMatrix is not None:
         logging.info("Matrix has gene symbols, these are used for quick gene file parsing")
         geneToSym = geneToSymFromMatrix
 
     fname = getAbsPath(inConf, "quickGenesFile")
     quickGenes = parseGeneInfo(geneToSym, fname, matrixSyms, matrixGeneIds)
 
     outConf["quickGenes"] = quickGenes
     logging.info("Read %d quick genes from %s" % (len(quickGenes), fname))
 
 def getFileVersion(fname):
     data = {}
     data["fname"] = fname
     addMd5(data, fname, shortMd5=False)
@@ -4656,33 +4662,35 @@
         convertExprMatrix(inConf, outMatrixFname, outConf, sampleNames, geneToSym, datasetDir, needFilterMatrix)
         # in case we crash after this, keep the current state of the config, as matrix ops are so slow
         writeConfig(inConf, outConf, datasetDir)
     else:
         logging.info("Matrix and meta sample names have not changed, not indexing matrix again")
 
     convertTraces(inConf, sampleNames, datasetDir, outConf)
 
     coordFiles, clusterLabels = convertCoords(inDir, inConf, outConf, sampleNames, outMetaFname, datasetDir)
 
     foundConf = writeDatasetDesc(inConf["inDir"], outConf, datasetDir, coordFiles, outMatrixFname)
 
     if geneToSym==-1:
         geneToSym = readGeneSymbols(inConf.get("geneIdType"), inMatrixFname)
 
-    convertMarkers(inConf, outConf, geneToSym, clusterLabels, datasetDir)
+    matrixSyms, matrixGeneIds, geneToSymFromMatrix = readValidGenes(datasetDir, inConf)
+
+    convertMarkers(inConf, outConf, geneToSym, clusterLabels, matrixGeneIds, datasetDir)
 
-    readQuickGenes(inConf, geneToSym, datasetDir, outConf)
+    readQuickGenes(inConf, geneToSym, matrixSyms, matrixGeneIds, geneToSymFromMatrix, datasetDir, outConf)
 
     copyBackgroundImages(inDir, inConf, outConf, datasetDir)
 
     # a few settings are passed through to the Javascript as they are
     for tag in ["shortLabel", "radius", "alpha", "priority", "tags", "sampleDesc", "geneLabel",
         "clusterField", "defColorField", "xenaPhenoId", "xenaId", "hubUrl", "showLabels", "ucscDb",
         "unit", "violinField", "visibility", "coordLabel", "lineWidth", "hideDataset", "hideDownload",
         "metaBarWidth", "supplFiles", "defQuantPal", "defCatPal", "clusterPngDir", "wrangler", "shepherd",
         "binStrategy", "split",
         "lineAlpha", "lineWidth", "lineColor",
         # the following are there only for old datasets, they are now nested under "facets"
         # they are just here for backwards-compatibility and will eventually get removed
         "body_parts", "organisms", "diseases", "projects", "life_stages", "domains", "sources", "assays", 
         # facets are taking their place now
         "facets", "multiModal"]: