src/hg/makeDb/doc/asmHubs/commonNames.py b26ad6e7a08036ee13cd05598e81dbf5111be67d

b26ad6e7a08036ee13cd05598e81dbf5111be67d
hiram
  Tue Jul 22 13:22:29 2025 -0700
minimum allowed tracks on bacteria lowered and more categories allow scientific name to come through as the "common name" in the browser for odd categories of archaea and bacteria refs #29545

diff --git src/hg/makeDb/doc/asmHubs/commonNames.py src/hg/makeDb/doc/asmHubs/commonNames.py
index 4780209b998..fbdba40bc89 100755
--- src/hg/makeDb/doc/asmHubs/commonNames.py
+++ src/hg/makeDb/doc/asmHubs/commonNames.py
@@ -1,284 +1,284 @@
 #!/cluster/software/bin/python3
 
 import os
 import sys
 import re
 import site
 
 ####################################################################
 ### this is kinda like an environment setting, it gets everything
 ### into a UTF-8 reading mode
 ####################################################################
 def setUtf8Encoding():
     """
     Make sure sys.stdout and sys.stderr encode their output as UTF-8.

     Each of the two streams whose reported encoding is not exactly
     'utf-8' is replaced by a new line-buffered text stream opened on the
     same file descriptor.  A stream that already reports 'utf-8' is left
     as it is.  Returns nothing.
     """
     for streamName in ('stdout', 'stderr'):
         current = getattr(sys, streamName)
         if current.encoding == 'utf-8':
             continue
         # buffering=1 selects line buffering for the replacement stream
         replacement = open(current.fileno(), mode='w', encoding='utf-8', buffering=1)
         setattr(sys, streamName, replacement)
 
 
 ### given a path name to an asmRpt, read the file and get out the names
 ### we need
 # Group names (the text inside the trailing parentheses of the
 # "Organism name:" line) for which the scientific name is used unchanged
 # as the browser "common name".
 _SCI_NAME_GROUPS = re.compile(r'agent of|aquificales|archaea|bacteria|chlamydias|crenarchaeotes|euryarchaeotes|kinetoplastids|firmicutes|mycoplasmas|planctomycetes|proteobacteria|high G|enterobacteria|s__|spirochetes|verrucomicrobia|thermotogales')

 # Group names that are turned into "<label> G.species" style names.
 _ABBREVIATED_GROUPS = re.compile(r'apicomplexans|ants|bacteria|bryozoans|bugs|ciliates|crustaceans|diatoms|hydrozoans|dinoflagellates|gastropods|hemichordates|nematodes|flatworm|pelagophytes|scorpions|ascomycete|basidiomycete|budding|microsporidian|smut|fungi|eukaryotes|flies|beetles|mosquitos|bees|moths|sponges|^mites|ticks|^comb|jellies|jellyfishes|chitons|bivalves|bony fishes|birds|eudicots|snakes|bats|tunicates|tsetse fly|cellular slime molds|stony corals')

 # (substring of the first word of the group name, label to use instead).
 # Order matters: the first substring found wins.
 _ORDER_LABELS = (
     ('budding', "budding yeast"),
     ('smut', "smut fungi"),
     ('bony', "bony fish"),
     ('ascomycete', "ascomycetes"),
     ('eudicots', "eudicot"),
     ('birds', "bird"),
     ('snakes', "snake"),
     ('crustaceans', "crustacean"),
     ('mosquitos', "mosquito"),
     ('mites', "mite/tick"),
     ('comb', "comb jelly"),
     ('jellyfishes', "jellyfish"),
     ('chitons', "chiton"),
     ('bivalves', "bivalve"),
     ('beetles', "beetle"),
     ('bees', "bee"),
     ('bats', "bat"),
     ('moths', "moth"),
     ('sponges', "sponge"),
     ('flatworms', "flatworm"),
     ('nematodes', "nematode"),
     ('basidiomycete', "basidiomycetes"),
 )

 # (regex searched in the whole group name, prefix for the abbreviated
 # name).  Order matters: the first pattern found wins; when none is found
 # the label derived from _ORDER_LABELS is used as the prefix.
 _GROUP_PREFIXES = (
     (r'eukaryotes', ""),
     (r'apicomplexans', "apicomplexans "),
     (r'ants', "ant "),
     (r'dinoflagellates', "dinoflagellates "),
     # raw string so \b is a word boundary, not a backspace character;
     # any name containing "bacteria" is already claimed by
     # _SCI_NAME_GROUPS, so this entry is not reached at present
     (r'\bbacteria\b', "bacteria "),
     (r'hemichordates', "hemichordates "),
     (r'bryozoans', "bryozoans "),
     (r'ciliates', "ciliates "),
     (r'diatoms', "diatoms "),
     (r'hydrozoans', "hydrozoans "),
     (r'gastropods', "gastropods "),
     (r'pelagophytes', "pelagophytes "),
     (r'scorpions', "scorpions "),
     (r'flies', "fly "),
     (r'tsetse', "tsetse fly "),
     (r'cellular slime mold', "cellular slime mold "),
     (r'stony corals', "stony coral "),
     (r'tunicates', "tunicate "),
 )

 ### turn one "# Organism name:" header line into (sciName, orgName)
 def _namesFromOrganismLine(line):
     """
     Return (sciName, orgName) for one "# Organism name:" report line.

     sciName is the text after the label with a trailing parenthesized
     part removed.  orgName starts as the text after the last "(" (or the
     text after the label when there is no parenthesis) and is then
     rewritten according to which group pattern it matches.
     """
     orgName = re.sub(r'.*\(', '', line)
     orgName = re.sub(r'[()\[\]+*]', '', orgName)
     orgName = re.sub(r'\?', '', orgName)
     # only has an effect when the line carried no "(" at all
     orgName = re.sub(r'.*ism name:\s+', '', orgName)

     sciName = re.sub(r'.*ganism name:\s+', '', line, flags=re.IGNORECASE)
     sciName = re.sub(r'\s+\(.*\)$', '', sciName)
     sciName = re.sub(r'[()\[\]+*]', '', sciName)
     sciName = re.sub(r'\?', ' ', sciName)

     if _SCI_NAME_GROUPS.search(orgName):
         return sciName, sciName

     if _ABBREVIATED_GROUPS.search(orgName):
         order = orgName.split()[0]
         for fragment, label in _ORDER_LABELS:
             if fragment in order:
                 order = label
                 break
         words = sciName.split()
         # genus initial, a period, then the remaining words: "G.species"
         abbreviated = words[0][0].upper() + "." + " ".join(words[1:])
         for groupPattern, prefix in _GROUP_PREFIXES:
             if re.search(groupPattern, orgName):
                 return sciName, prefix + abbreviated
         return sciName, order + " " + abbreviated

     if re.search("viruses", orgName):
         # viruses keep the full organism text, minus label and "(...)"
         orgName = re.sub(r'.*ism name:\s+', '', line)
         orgName = re.sub(r'\s+\(.*\)$', '', orgName)
     return sciName, orgName

 def extractNames(asmRpt, hapX):
     """
     Read the leading '#' header lines of an assembly report file and
     return the naming fields found there.

     asmRpt - path name of the assembly report file
     hapX   - string used as asmType when the "Assembly type:" line is
              neither 'alternate-pseudohaplotype' nor 'principal pseudo'

     Returns the tuple
       (asmType, sciName, orgName, yearDate, isolate, cultivar, extraStrings)
     asmType, sciName, orgName and yearDate keep their own name as a
     placeholder value when the corresponding header line is absent;
     isolate and cultivar default to the empty string.  extraStrings is
     the combination "cultivar isolate<asmType> yearDate" built from
     whichever of those fields are present.

     Reading stops at the first line that does not start with '#'.
     When the file does not exist an error is printed to stderr and the
     program exits with status 1.
     """
     asmType = "asmType"
     sciName = "sciName"
     yearDate = "yearDate"
     orgName = "orgName"
     isolate = ""
     cultivar = ""
     try:
         # the context manager closes the report even if parsing raises
         with open(asmRpt, 'r', encoding='utf-8') as readFile:
             for line in readFile:
                 line = line.rstrip()
                 if not line.startswith('#'):
                     break
                 if "Assembly type:" in line:
                     asmType = re.sub(r'.*bly type:\s+', '', line, flags=re.IGNORECASE)
                     if re.search('alternate-pseudohaplotype', asmType):
                         asmType = " alternate hap"
                     elif re.search('principal pseudo', asmType):
                         asmType = " primary hap"
                     else:
                         # NOTE(review): unlike the two literals above, hapX
                         # has no leading space, so it is appended directly
                         # to isolate/cultivar below - confirm intended
                         asmType = hapX
                 elif "Organism name:" in line:
                     sciName, orgName = _namesFromOrganismLine(line)
                 elif "Date:" in line:
                     # last word of the line, cut at the first '-'
                     yearDate = re.sub(r'-.*', '', line.split()[-1])
                 elif "Isolate:" in line:
                     isolate = re.sub(r'.*solate:\s+', '', line)
                 elif "Infraspecific name:" in line:
                     cultivar = re.sub(r'.*cultivar=|.*ecotype=|.*strain=|.*breed=', '', line)
     except FileNotFoundError:
         print(f"Error: File '{asmRpt}' not found.", file=sys.stderr)
         sys.exit(1)

     if isolate and cultivar:
         extraStrings = f"{cultivar} {isolate}{asmType} {yearDate}"
     elif isolate:
         extraStrings = f"{isolate}{asmType} {yearDate}"
     elif cultivar:
         extraStrings = f"{cultivar}{asmType} {yearDate}"
     else:
         # asmType may begin with a space, drop it when it leads the string
         extraStrings = re.sub(r'^ +', '', f"{asmType} {yearDate}")
     return asmType, sciName, orgName, yearDate, isolate, cultivar, extraStrings
 
 ### inFh is a file handle to a list of assembly identifiers,
 ###  might be a multi column file, ignore anything after the first column
 def processList(inFh):
     """
     Print one "asmId<TAB>common name" line for each assembly identifier
     read from inFh.

     inFh - iterable of text lines; only the first whitespace-separated
            column of each line is used.

     Lines starting with '#' are echoed to stderr and otherwise ignored.
     Blank lines are skipped.  An identifier that does not look like
     PREFIX_digits[_name] with at least seven characters in the second
     part is reported on stderr and skipped, as is an identifier whose
     assembly report file does not exist.  For all others the names come
     from extractNames() and the result is printed to stdout.
     """
     ncbiSrc = "/hive/data/outside/ncbi/genomes"
     for inputLine in inFh:
         inputLine = inputLine.rstrip()    # eliminate new line at end of string
         if inputLine.startswith('#'):     # ignore comment lines
             print(f"{inputLine}", file=sys.stderr)
             continue
         columns = inputLine.split()
         if not columns:                   # nothing on this line
             continue
         asmId = columns[0]                # only the first column
         p = asmId.split('_', 2)           # at most three parts
         # the numeric part is cut into groups of three characters, the
         # first three groups become directory levels
         by3 = []
         if len(p) > 1:
             by3 = [p[1][i:i+3] for i in range(0, len(p[1]), 3)]
         if len(by3) < 3:
             print(f"{asmId}\tmalformed assembly identifier", file=sys.stderr)
             continue
         gcX = p[0]
         d0, d1, d2 = by3[:3]
         asmName = p[2] if len(p) == 3 else "na"
         srcDir = f"{ncbiSrc}/{gcX}/{d0}/{d1}/{d2}/{asmId}"
         asmRpt = f"{srcDir}/{asmId}_assembly_report.txt"
         if not os.path.isfile(asmRpt):
             print(f"{asmId}\tmissing '{asmRpt}'", file=sys.stderr)
             continue
         asmNameAbbrev = asmName
         hapX = ""
         abbrevPattern = r'hap1|hap2|alternate_haplotype|primary_haplotype'
         if re.search(abbrevPattern, asmNameAbbrev):
             # remove the haplotype suffix from the assembly name; what
             # is left of the full name after removing "<abbrev>." is
             # passed along as the haplotype label
             for suffix in ('.hap1', '.hap2', '_alternate_haplotype', '_primary_haplotype'):
                 asmNameAbbrev = asmNameAbbrev.replace(suffix, '')
             hapX = asmName.replace(asmNameAbbrev + '.', '')
         _, _, orgName, _, _, _, extraStrings = extractNames(asmRpt, hapX)
         outStr = f"{asmId}\t{orgName}"
         if len(extraStrings):
             extraStrings = re.sub(r'[()\[\]+*]', '', extraStrings)
             extraStrings = re.sub(r'\?', ' ', extraStrings)
             extraStrings = re.sub(re.escape(asmNameAbbrev) + ' ', '', extraStrings)
             extraStrings = re.sub(r'\s+', ' ', extraStrings)
             # words already present in extraStrings are not repeated in
             # the organism name
             words = extraStrings.split()
             orgName = " ".join([orgWord for orgWord in orgName.split() if orgWord not in words])
             orgName = re.sub(r'=|\s+$|^\s+', '', orgName)
             orgName = re.sub(r'\s+', ' ', orgName).strip()
             if len(orgName):
                 outStr = f"{asmId}\t{orgName} ({extraStrings})"
             else:
                 outStr = f"{asmId}\t{extraStrings}"
         print(outStr)
 
 #      print(f"asmType: '{asmType}', {sciName}, {orgName}, {yearDate}, {isolate}, {cultivar} {extraStrings}")
 
 def main():
     """
     Command line entry point.

     Expects exactly one argument: the name of a file holding assembly
     identifiers, or the word 'stdin' to read them from standard input.
     With any other argument count the usage text is printed and the
     program exits with status 1; the same exit status is used, after a
     message on stderr, when the named file does not exist.
     """
     site.ENABLE_USER_SITE = False
     arguments = sys.argv[1:]
     if len(arguments) != 1:
         usageLines = (
             "Usage: ./commonNames.py <filename|stdin>",
             "e.g.: ./commonNames.py some.asmId.list",
             "    where some.asmId.list is a simple list of NCBI assembly ids",
             "    will look up the common names for each ID from the assembly_report files",
         )
         for usageLine in usageLines:
             print(usageLine)
         sys.exit(1)

     # Ensure stdout and stderr use UTF-8 encoding
     setUtf8Encoding()

     listFile = arguments[0]
     if listFile != 'stdin':
         try:
             fileIn = open(listFile, 'r')
         except FileNotFoundError:
             print(f"Error: File '{listFile}' not found.", file=sys.stderr)
             sys.exit(1)
     else:
         fileIn = sys.stdin

     processList(fileIn)

 if __name__ == "__main__":
     main()