src/cbPyLib/cellbrowser/geneinfo.py b7f624b1913a3b14792339272310d378db114886

b7f624b1913a3b14792339272310d378db114886
mspeir
  Fri Jul 11 09:01:49 2025 -0700
making changes so that cbMarkerAnnotate works on seurat marker files and outputs annotated seurat marker files that work with cbBuild

diff --git src/cbPyLib/cellbrowser/geneinfo.py src/cbPyLib/cellbrowser/geneinfo.py
index 130b61e..a4b0ab9 100644
--- src/cbPyLib/cellbrowser/geneinfo.py
+++ src/cbPyLib/cellbrowser/geneinfo.py
@@ -1,342 +1,357 @@
 # annotate a list of gene IDs with links to various external databases
 
 import logging, sys, optparse, re, unicodedata, string, csv
 from collections import defaultdict, namedtuple
 from os.path import join, basename, dirname, isfile
 
 from .cellbrowser import openStaticFile, staticFileNextRow, openFile, splitOnce, iterItems, lineFileNextRow, setDebug
 from .cellbrowser import readGeneSymbols
 
 dataDir = "geneAnnot"
 
 # no spaces/special chars in filenames - otherwise the URL will be rejected as invalid by urllib2
 HPRD = join(dataDir, "HPRD_molecular_class_081914.txt")
 HGNC = join(dataDir, "hgnc_complete_set_05Dec17.txt")
 SFARI = join(dataDir, "SFARI-Gene_genes_export06-12-2017.csv")
 OMIM = join(dataDir, "mim2gene.txt")
 COSMIC = join(dataDir, "Census_allWed_Dec__6_18_35_54_2017.tsv")
 HPO = join(dataDir, "hpo_frequent_7Dec17.txt")
 BRAINSPANLMD = join(dataDir, "brainspan_genes.csv")
 BRAINSPANMOUSEDEV = join(dataDir, "brainspanMouse_9Dec17.txt")
 MGIORTHO = join(dataDir, "mgi_HGNC_homologene_8Dec17.txt")
 EUREXPRESS = join(dataDir, "eurexpress_7Dec17.txt")
 DDD = join(dataDir, "DDG2P_18_10_2018.csv.gz")
 
 
 # ==== functions =====
     
 def parseArgs():
     " setup logging, parse command line arguments and options. -h shows auto-generated help page "
     parser = optparse.OptionParser("""usage: %prog [options] inFname outFname - annotate a tab-sep gene list file with information from other databases
             
     A minimal input file has a header line with at one field called "gene" (=symbol or ENS/Entrez geneID) and
     one field called "cluster".
     
     In the cellbrowser, the cluster name should match the cluster name in the meta data file.""")
 
     parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
     parser.add_option("", "--hprd", dest="hprd", action="store", help="location of HPRD file, default %default", default=HPRD)
     parser.add_option("", "--hgnc", dest="hgnc", action="store", help="location of HGNC file, default %default", default=HGNC)
     parser.add_option("", "--sfari", dest="sfari", action="store", help="location of SFARI file, default %default", default=SFARI)
     parser.add_option("", "--omim", dest="omim", action="store", help="location of OMIM file, default %default", default=OMIM)
     parser.add_option("", "--cosmic", dest="cosmic", action="store", help="location of COSMIC Census file, default %default", default=COSMIC)
     parser.add_option("", "--hpo", dest="hpo", action="store", help="location of HPO gene/disease/phenotype file, default %default", default=HPO)
     parser.add_option("", "--lmd", dest="lmd", action="store", help="location of BrainSpan LMD file, default %default", default=BRAINSPANLMD)
     parser.add_option("", "--mgiOrtho", dest="mgiOrtho", action="store", help="location of MGI Homologene file, default %default", default=MGIORTHO)
     parser.add_option("", "--eurexpress", dest="eurexpress", action="store", help="location of Eurexpress file, default %default", default=EUREXPRESS)
     parser.add_option("", "--brainspanMouseDev", dest="brainspanMouseDev", action="store", help="location of brainspan Mouse Development ISH file, default %default", default=BRAINSPANMOUSEDEV)
     #parser.add_option("-f", "--file", dest="file", action="store", help="run on file") 
     #parser.add_option("", "--test", dest="test", action="store_true", help="do something") 
     (options, args) = parser.parse_args()
 
     if args==[]:
         parser.print_help()
         exit(1)
 
     setDebug(options.debug)
     return args, options
 
 # ----------- main --------------
 def parseBrainspanLmd(inFname):
     " return entrez -> brainspanGeneId with all entrez IDs that are in the brainspan LMD set "
     with openStaticFile(inFname, 'r') as csvfile:
         ret = {}
         cr = csv.reader(csvfile)
         headers = None
         for row in cr:
             if headers == None:
                 assert(row==['row_num','gene_id','ensembl_gene_id','gene_symbol','entrez_id'])
                 headers = row
                 continue
             ret[row[4]] = row[1]
     return ret
 
 
 def parseSfari(inFname):
     " return symbol -> class "
     classDesc = {
             "" : "Autism, No category",
             "S" : "Autism, S - Syndromic",
             "1" : "Autism, 1 - High confidence",
             "2" : "Autism, 2 - Strong candidate",
             "3" : "Autism, 3 - Suggestive evidence",
             "4" : "Autism, 4 - Minimal evidence",
             "5" : "Autism, 5 - Hypothesized but untested",
             "6" : "Autism, 6 - Evidence does not support role",
     }
     # ['status', 'gene-symbol', 'gene-name', 'chromosome', 'genetic-category', 'gene-score', 'syndromic', 'number-of-reports']
     headers = None
     ret = {}
     with openStaticFile(inFname, 'r') as csvfile:
         spamreader = csv.reader(csvfile)
         for row in spamreader:
             if headers == None:
                 headers = row
                 assert(headers[5]=="gene-score")
                 continue
             score = row[5]
             ret[row[1]] = classDesc[score]
     return ret
 
 def parseHprd(inFname):
     " return entrez gene ID -> mol class from HPRD "
     ret = {}
     for row in staticFileNextRow(inFname):
         ret[row.Entrez_gene_ID] = row.Molecular_class
     return ret
 
 def parseHgnc(inFname):
     " return dict with symbol -> entrez gene ID and hgncID to entrezId"
     ret = {}
     hgncIdToEntrez = {}
     for row in staticFileNextRow(inFname):
         entrezIds = row.entrez_id.strip('"').split("|")
         for entrezId in entrezIds:
             ret[row.symbol] = entrezId
             hgncIdToEntrez[row.hgnc_id] = entrezId # there is a 1:1 coorespondance between HGNC and entrez gene Id, except two miRNAs in 2017
     return ret, hgncIdToEntrez
 
 def parseOmim(inFname):
     " return entrezId -> OMIM ID, link is https://omim.org/entry/601542 "
     # 100660  gene    218     ALDH3A1 ENSG00000108602
     ret = {}
     for line in openStaticFile(inFname):
         if line.startswith("#"):
             continue
         row = line.rstrip("\n").split("\t")
         if row[2]=="":
             continue
         ret[row[2]] = row[0]
     return ret
 
 def parseCosmic(inFname):
     " return entrezId -> cancer types "
     ret = {}
     for row in staticFileNextRow(inFname):
         entrez = row.Entrez_GeneId
         cancerSomDesc = row.Tumour_Types_Somatic_
         cancerGermDesc = row.Tumour_Types_Germline_
         otherSyn = row.Other_Syndrome
         dis = []
         dis.extend(cancerSomDesc.strip('"').split(","))
         dis.extend(cancerGermDesc.strip('"').split(","))
         dis.extend(otherSyn.strip('"').split(","))
         disStr = ", ".join([s.strip() for s in dis if s!=""])
         disStr = disStr.replace(";", ",") # ; is reserved
         ret[entrez] = disStr
     return ret
 
 def parseHpo(inFname):
     " return entrezId -> human phenotypes "
     geneToNames = defaultdict(set)
     # ORPHA:349	FUCA1	2517	HP:0000943	Dysostosis multiplex
     for line in openStaticFile(inFname):
         if line.startswith("#"):
             continue
         row = line.rstrip("\n").split("\t")
         entrez = row[2]
         hpoName = row[4]
         geneToNames[entrez].add(hpoName)
 
     ret = {}
     for entrez, names in iterItems(geneToNames):
         names = list(names)
         names.sort()
         ret[entrez] = ", ".join(names)
     return ret
 
 def parseMgiOrtho(hgncIdToEntrez, inFname):
     " return dict with mouse entrezId -> human entrez "
     ret = {}
     # MGI Accession ID        Marker Symbol   Marker Name     Feature Type    EntrezGene ID   NCBI Gene chromosome    NCBI Gene start NCBI Gene end   NCBI Gene strand       Ensembl Gene ID Ensembl Gene chromosome Ensembl Gene start      Ensembl Gene end        Ensembl Gene strand     VEGA Gene ID    VEGA Gene chromosome  VEGA Gene start  VEGA Gene end   VEGA Gene strand        CCDS IDs        HGNC ID HomoloGene ID
     humanToMouse = defaultdict(list)
     for row in staticFileNextRow(inFname):
         hgncIds = row.HGNC_ID
         if hgncIds=="null" or hgncIds=="":
             continue
         for hgncId in hgncIds.split("|"):
             humanEntrez = hgncIdToEntrez[hgncId]
             # saw only 5 duplicated entrezIDs on the mouse side
             mouseEntrez = row.EntrezGene_ID
             ret[mouseEntrez] = humanEntrez
             humanToMouse[humanEntrez].append(mouseEntrez)
     return ret, humanToMouse
 
 def parseEurexpress(mouseEntrezToHumanEntrez, inFname):
     " return dict with human entrez -> (eurexpressId, annotationStr) "
     # Template ID     Gene Symbol     Assay ID        EMAP Term       Entrez ID       Theiler Stage
     entrezToTerms = defaultdict(set)
     entrezToEuroexpress = dict()
     skippedMouseIds = set()
     for row in staticFileNextRow(inFname):
         mouseEntrez = row.Entrez_ID
         humanEntrez = mouseEntrezToHumanEntrez.get(mouseEntrez)
         if humanEntrez==None:
             skippedMouseIds.add(mouseEntrez)
             continue
         if row.EMAP_Term!="":
             entrezToTerms[humanEntrez].add(row.EMAP_Term)
         entrezToEuroexpress[humanEntrez] = row.Assay_ID
 
     logging.info("Eurexpress mouse entrez IDs: %d mappable, %d not-mappable to human " % (len(entrezToEuroexpress),len(skippedMouseIds)))
     logging.debug("Eurexpress mouse: mouse entrez IDs not mappable to human: %s" % ",".join(skippedMouseIds))
     ret = {}
     for entrezId, terms in iterItems(entrezToTerms):
         eurexpId = entrezToEuroexpress[entrezId]
         ret[entrezId] = (eurexpId, ", ".join(sorted(list(terms))))
 
     return ret
 
 def parseDDD(fname):
     " parse DDD phenotype file "
     ret = {}
     #for row in staticFileNextRow(inFname):
         #print row
     return ret
     
 def parseSimpleMap(inFname):
     " parse simple tab-sep key-value file and return as dict "
     ret = {}
     for line in openStaticFile(inFname):
         row = line.rstrip("\n").split("\t")
         ret[row[0]] = row[1]
     return ret
 
 def tabGeneAnnotate(inFname, symToEntrez, symToSfari, entrezToClass, entrezToOmim, entrezToCosmic, entrezToHpo, entrezToLmd, entrezToEuroexpress, humanToMouseEntrezList, mouseEntrezToBrainspanMouseDev):
     " "
     headers = None
     geneToSym = -1
     for row in lineFileNextRow(inFname):
         if headers is None:
             headers = list(row._fields)
             headers.append("_hprdClass")
             headers.append("_expr")
             headers.append("_geneLists")
+
+            # lineFileNextRow makes some changes to seurat headers that we need to undo
+            if headers[0] == "rowName":
+                headers[0] = ''
+            if headers[3] == "pct_1":
+                headers[3] = "pct.1"
+            if headers[4] == "pct_2":
+                headers[4] = "pct.2"
             yield headers
         sym = row[1]
+        isSeurat = False
+        if sym.isnumeric(): # if column 2 is only a number, it's probably a seurat file
+            sym = row[0] # gene symbol is in column 1
+            isSeurat = True
         if "|" in sym: # marker gene lists can carry geneId|symbol, strip the symbol in this case and re-convert below
             sym = sym.split("|")[0]
         if "." in sym: # remove Ensembl version identifier
             sym = sym.split(".")[0]
 
         # convert gene IDs to symbols
         if geneToSym is -1:
             geneToSym = readGeneSymbols(None, [sym])
         if geneToSym is not None:
             geneId = geneToSym.get(sym)
             if geneId is None:
                 logging.debug("Cannot find NCBI Gene ID for symbol: %s" % sym)
                 geneId = sym
             sym = geneId
 
         hprdClass = ""
         entrezId = symToEntrez.get(sym)
 
         if entrezId == None:
             logging.debug("Cannot find entrezId for symbol %s" % sym)
 
         # now summarize the presence/absence of this gene in various specialized gene lists:
         # OMIM, COSMIC, SFARI
         hprdClass = entrezToClass.get(entrezId, "")
         geneLists = []
         if sym!="":
             # SFARI
             sfariInfo = symToSfari.get(sym)
             if sfariInfo is not None:
                 sfariInfo = "SFARI||"+sfariInfo
                 geneLists.append(sfariInfo)
 
         if entrezId is not None:
             # OMIM
             omimId = entrezToOmim.get(entrezId)
             if omimId is not None:
                 omimInfo = "OMIM|"+omimId
                 geneLists.append(omimInfo)
 
             # COSMIC
             cosmicDesc = entrezToCosmic.get(entrezId)
             if cosmicDesc is not None:
                 cosmicDesc = "COSMIC||"+cosmicDesc
                 geneLists.append(cosmicDesc)
 
             # HPO
             hpoDesc = entrezToHpo.get(entrezId)
             if hpoDesc is not None:
                 hpoDesc = "HPO|"+entrezId+"|"+hpoDesc
                 geneLists.append(hpoDesc)
 
         # links to gene expression databases
         exprParts = []
         if entrezId is not None:
             if entrezId in entrezToLmd:
                 exprParts.append("BrainSpLMD|"+entrezId)
 
             if entrezId in entrezToEuroexpress:
                 eurExpId, annotStr = entrezToEuroexpress[entrezId]
                 annotStr = annotStr.replace(";", ",")
                 exprParts.append("Eurexp|"+eurExpId+"|"+annotStr)
 
             mouseEntrezList = humanToMouseEntrezList[entrezId]
             for mouseEntrez in mouseEntrezList:
                 if mouseEntrez in mouseEntrezToBrainspanMouseDev:
                     exprParts.append("BrainSpMouseDev|"+mouseEntrezToBrainspanMouseDev[mouseEntrez])
 
         row = list(row)
+        if isSeurat:
+            row[0] = sym # for seurat files, column 1 is the gene
+        else:
             row[1] = sym # in case the original ID was a geneID, not a symbol
 
         row.append(hprdClass)
         row.append(";".join(exprParts))
         row.append(";".join(geneLists))
 
         yield row
 
 def cbMarkerAnnotateCli():
     args, options = parseArgs()
 
     entrezToBrainspanMouseDev = parseSimpleMap(options.brainspanMouseDev)
     symToEntrez, hgncIdToEntrez = parseHgnc(options.hgnc)
     mouseEntrezToHumanEntrez, humanToMouseEntrezList = parseMgiOrtho(hgncIdToEntrez, options.mgiOrtho)
 
     entrezToEuroexpress = parseEurexpress(mouseEntrezToHumanEntrez, options.eurexpress)
     entrezToLmd = parseBrainspanLmd(options.lmd)
     entrezToHpo = parseHpo(options.hpo)
     entrezToCosmic = parseCosmic(options.cosmic)
     entrezToOmim = parseOmim(options.omim)
     symToSfari = parseSfari(options.sfari)
     entrezToClass = parseHprd(options.hprd)
     #symToDdd = parseDdd(DDD)
 
     filename = args[0]
     outFname = args[1]
 
     rowCount = 0
     ofh = open(outFname, "w")
     for row in tabGeneAnnotate(filename, symToEntrez, symToSfari, entrezToClass, entrezToOmim, entrezToCosmic, entrezToHpo, entrezToLmd, entrezToEuroexpress, humanToMouseEntrezList, entrezToBrainspanMouseDev):
         ofh.write("\t".join(row))
         ofh.write("\n")
         rowCount +=1
     ofh.close()
 
     logging.info("Annotated %d marker gene rows, output written to %s" % (rowCount, outFname))