b7f624b1913a3b14792339272310d378db114886 mspeir Fri Jul 11 09:01:49 2025 -0700 making changes so that cbMarkerAnnotate works on seurat marker files and outputs annotated seurat marker files that work with cbBuild diff --git src/cbPyLib/cellbrowser/geneinfo.py src/cbPyLib/cellbrowser/geneinfo.py index 130b61e..a4b0ab9 100644 --- src/cbPyLib/cellbrowser/geneinfo.py +++ src/cbPyLib/cellbrowser/geneinfo.py @@ -220,32 +220,44 @@ for line in openStaticFile(inFname): row = line.rstrip("\n").split("\t") ret[row[0]] = row[1] return ret def tabGeneAnnotate(inFname, symToEntrez, symToSfari, entrezToClass, entrezToOmim, entrezToCosmic, entrezToHpo, entrezToLmd, entrezToEuroexpress, humanToMouseEntrezList, mouseEntrezToBrainspanMouseDev): " " headers = None geneToSym = -1 for row in lineFileNextRow(inFname): if headers is None: headers = list(row._fields) headers.append("_hprdClass") headers.append("_expr") headers.append("_geneLists") + + # lineFileNextRow makes some changes to seurat headers that we need to undo + if headers[0] == "rowName": + headers[0] = '' + if headers[3] == "pct_1": + headers[3] = "pct.1" + if headers[4] == "pct_2": + headers[4] = "pct.2" yield headers sym = row[1] + isSeurat = False + if sym.isnumeric(): # if column 2 is only a number, it's probably a seurat file + sym = row[0] # gene symbol is in column 1 + isSeurat = True if "|" in sym: # marker gene lists can carry geneId|symbol, strip the symbol in this case and re-convert below sym = sym.split("|")[0] if "." in sym: # remove Ensembl version identifier sym = sym.split(".")[0] # convert gene IDs to symbols if geneToSym is -1: geneToSym = readGeneSymbols(None, [sym]) if geneToSym is not None: geneId = geneToSym.get(sym) if geneId is None: logging.debug("Cannot find NCBI Gene ID for symbol: %s" % sym) geneId = sym sym = geneId @@ -290,30 +302,33 @@ if entrezId is not None: if entrezId in entrezToLmd: exprParts.append("BrainSpLMD|"+entrezId) if entrezId in entrezToEuroexpress: eurExpId, annotStr = entrezToEuroexpress[entrezId] annotStr = annotStr.replace(";", ",") exprParts.append("Eurexp|"+eurExpId+"|"+annotStr) mouseEntrezList = humanToMouseEntrezList[entrezId] for mouseEntrez in mouseEntrezList: if mouseEntrez in mouseEntrezToBrainspanMouseDev: exprParts.append("BrainSpMouseDev|"+mouseEntrezToBrainspanMouseDev[mouseEntrez]) row = list(row) + if isSeurat: + row[0] = sym # for seurat files, column 1 is the gene + else: row[1] = sym # in case the original ID was a geneID, not a symbol row.append(hprdClass) row.append(";".join(exprParts)) row.append(";".join(geneLists)) yield row def cbMarkerAnnotateCli(): args, options = parseArgs() entrezToBrainspanMouseDev = parseSimpleMap(options.brainspanMouseDev) symToEntrez, hgncIdToEntrez = parseHgnc(options.hgnc) mouseEntrezToHumanEntrez, humanToMouseEntrezList = parseMgiOrtho(hgncIdToEntrez, options.mgiOrtho)