b7f624b1913a3b14792339272310d378db114886
mspeir
  Fri Jul 11 09:01:49 2025 -0700
Make cbMarkerAnnotate work on Seurat marker files and output annotated Seurat marker files that work with cbBuild

diff --git src/cbPyLib/cellbrowser/geneinfo.py src/cbPyLib/cellbrowser/geneinfo.py
index 130b61e..a4b0ab9 100644
--- src/cbPyLib/cellbrowser/geneinfo.py
+++ src/cbPyLib/cellbrowser/geneinfo.py
@@ -220,32 +220,44 @@
     for line in openStaticFile(inFname):
         row = line.rstrip("\n").split("\t")
         ret[row[0]] = row[1]
     return ret
 
 def tabGeneAnnotate(inFname, symToEntrez, symToSfari, entrezToClass, entrezToOmim, entrezToCosmic, entrezToHpo, entrezToLmd, entrezToEuroexpress, humanToMouseEntrezList, mouseEntrezToBrainspanMouseDev):
     " "
     headers = None
     geneToSym = -1
     for row in lineFileNextRow(inFname):
         if headers is None:
             headers = list(row._fields)
             headers.append("_hprdClass")
             headers.append("_expr")
             headers.append("_geneLists")
+
+            # lineFileNextRow makes some changes to seurat headers that we need to undo
+            if headers[0] == "rowName":
+                headers[0] = ''
+            if headers[3] == "pct_1":
+                headers[3] = "pct.1"
+            if headers[4] == "pct_2":
+                headers[4] = "pct.2"
             yield headers
         sym = row[1]
+        isSeurat = False
+        if sym.isnumeric(): # if column 2 is only a number, it's probably a seurat file
+            sym = row[0] # gene symbol is in column 1
+            isSeurat = True
         if "|" in sym: # marker gene lists can carry geneId|symbol, strip the symbol in this case and re-convert below
             sym = sym.split("|")[0]
         if "." in sym: # remove Ensembl version identifier
             sym = sym.split(".")[0]
 
         # convert gene IDs to symbols
         if geneToSym is -1:
             geneToSym = readGeneSymbols(None, [sym])
         if geneToSym is not None:
             geneId = geneToSym.get(sym)
             if geneId is None:
                 logging.debug("Cannot find NCBI Gene ID for symbol: %s" % sym)
                 geneId = sym
             sym = geneId
 
@@ -290,30 +302,33 @@
         if entrezId is not None:
             if entrezId in entrezToLmd:
                 exprParts.append("BrainSpLMD|"+entrezId)
 
             if entrezId in entrezToEuroexpress:
                 eurExpId, annotStr = entrezToEuroexpress[entrezId]
                 annotStr = annotStr.replace(";", ",")
                 exprParts.append("Eurexp|"+eurExpId+"|"+annotStr)
 
             mouseEntrezList = humanToMouseEntrezList[entrezId]
             for mouseEntrez in mouseEntrezList:
                 if mouseEntrez in mouseEntrezToBrainspanMouseDev:
                     exprParts.append("BrainSpMouseDev|"+mouseEntrezToBrainspanMouseDev[mouseEntrez])
 
         row = list(row)
+        if isSeurat:
+            row[0] = sym # for seurat files, column 1 is the gene
+        else:
             row[1] = sym # in case the original ID was a geneID, not a symbol
 
         row.append(hprdClass)
         row.append(";".join(exprParts))
         row.append(";".join(geneLists))
 
         yield row
 
 def cbMarkerAnnotateCli():
     args, options = parseArgs()
 
     entrezToBrainspanMouseDev = parseSimpleMap(options.brainspanMouseDev)
     symToEntrez, hgncIdToEntrez = parseHgnc(options.hgnc)
     mouseEntrezToHumanEntrez, humanToMouseEntrezList = parseMgiOrtho(hgncIdToEntrez, options.mgiOrtho)