10628e93d4e547ae2a2d8562f87d164e86e6ce9d
hiram
  Tue Jul 23 12:48:08 2024 -0700
rewrite of commonNames.pl to fix cluster home access and speed up processing refs #33720

diff --git src/hg/makeDb/doc/asmHubs/commonNames.py src/hg/makeDb/doc/asmHubs/commonNames.py
new file mode 100755
index 0000000..d7d47c6
--- /dev/null
+++ src/hg/makeDb/doc/asmHubs/commonNames.py
@@ -0,0 +1,252 @@
+#!/cluster/software/bin/python3
+
+import os
+import sys
+import re
+import site
+
+####################################################################
+### this is kinda like an environment setting, it gets everything
+### into a UTF-8 reading mode
+####################################################################
+def setUtf8Encoding():
+    """
+    Set UTF-8 encoding for stdin, stdout, and stderr in Python.
+    """
+    if sys.stdout.encoding != 'utf-8':
+        sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf-8', buffering=1)
+    if sys.stderr.encoding != 'utf-8':
+        sys.stderr = open(sys.stderr.fileno(), mode='w', encoding='utf-8', buffering=1)
+
+
+### given a path name to an asmRpt, read the file and get out the names
+### we need
+def extractNames(asmRpt, hapX):
+    asmType = "asmType"
+    sciName = "sciName"
+    yearDate = "yearDate"
+    isolate = ""
+    cultivar = ""
+    orgName = "orgName"
+    order = ""
+    extraStrings = ""
+    try:
+      readFile = open(asmRpt, 'r', encoding='utf-8')
+      for line in readFile:
+        line = line.rstrip()
+        if line.startswith('#'):
+          if "Assembly type:" in line:
+            pat = re.compile(r'.*bly type:\s+', re.IGNORECASE)
+            asmType = pat.sub('', line)
+            if re.search('alternate-pseudohaplotype', asmType):
+              asmType = " alternate hap"
+            elif re.search('principal pseudo', asmType):
+              asmType = " primary hap"
+            else:
+              asmType = hapX
+          elif "Organism name:" in line:
+            saveOrgName = line
+            pat = re.compile(r'.*\(')
+            orgName = pat.sub('', line)
+            pat = re.compile(r'[()\[\]+*]')
+            orgName = pat.sub('', orgName)
+            pat = re.compile(r'\?')
+            orgName = pat.sub('', orgName)
+            pat = re.compile(r'.*ism name:\s+')
+            orgName = pat.sub('', orgName)
+           
+
+            pat = re.compile(r'.*ganism name:\s+', re.IGNORECASE)
+            sciName = pat.sub('', line)
+            pat = re.compile(r'\s+\(.*\)$')
+            sciName = pat.sub('', sciName)
+            pat = re.compile(r'[()\[\]+*]')
+            sciName = pat.sub('', sciName)
+            pat = re.compile(r'\?')
+            sciName = pat.sub(' ', sciName)
+
+            pat = r'kinetoplastids|firmicutes|proteobacteria|high G|enterobacteria|agent of'
+            if re.search(pat, orgName):
+              orgName = sciName
+            else:
+              pat = r'bugs|crustaceans|nematodes|flatworm|ascomycete|basidiomycete|budding|microsporidian|smut|fungi|eukaryotes|flies|beetles|mosquitos|bees|moths|sponges|^mites|ticks|^comb|jellies|jellyfishes|chitons|bivalves|bony fishes|birds|eudicots|snakes|bats'
+              if re.search(pat, orgName):
+                order = orgName.split()[0]
+                if re.search('budding', order):
+                  order = "budding yeast"
+                elif re.search('smut', order):
+                  order = "smut fungi"
+                elif re.search('bony', order):
+                  order = "bony fish"
+                elif re.search('ascomycete', order):
+                  order = "ascomycetes"
+                elif re.search('eudicots', order):
+                  order = "eudicot"
+                elif re.search('birds', order):
+                  order = "bird"
+                elif re.search('snakes', order):
+                  order = "snake"
+                elif re.search('crustaceans', order):
+                  order = "crustacean"
+                elif re.search('mosquitos', order):
+                  order = "mosquito"
+                elif re.search('mites', order):
+                  order = "mite/tick"
+                elif re.search('comb', order):
+                  order = "comb jelly"
+                elif re.search('jellyfishes', order):
+                  order = "jellyfish"
+                elif re.search('chitons', order):
+                  order = "chiton"
+                elif re.search('bivalves', order):
+                  order = "bivalve"
+                elif re.search('beetles', order):
+                  order = "beetle"
+                elif re.search('bees', order):
+                  order = "bee"
+                elif re.search('bats', order):
+                  order = "bat"
+                elif re.search('moths', order):
+                  order = "moth"
+                elif re.search('sponges', order):
+                  order = "sponge"
+                elif re.search('flatworms', order):
+                  order = "flatworm"
+                elif re.search('nematodes', order):
+                  order = "nematode"
+                elif re.search('basidiomycete', order):
+                  order = "basidiomycetes"
+                words = sciName.split()
+                restWords = " ".join(words[1:])
+                if re.search("eukaryotes", orgName):
+                  orgName = words[0][0].upper() + "." + restWords
+                elif re.search("flies", orgName):
+                  orgName = "fly " + words[0][0].upper() + "." + restWords
+                else:
+                  orgName = order + " " + words[0][0].upper() + "." + restWords
+              elif re.search("viruses", orgName):
+                orgName = saveOrgName
+                pat = re.compile(r'.*ism name:\s+')
+                orgName = pat.sub('', orgName)
+                pat = re.compile(r'\s+\(.*\)$')
+                orgName = pat.sub('', orgName)
+
+          elif "Date:" in line:
+            words = line.split()
+            pat = re.compile(r'-.*')
+            yearDate = pat.sub('', words[-1])
+          elif "Isolate:" in line:
+            pat = re.compile(r'.*solate:\s+')
+            isolate = pat.sub('', line)
+          elif "Infraspecific name:" in line:
+            pat = re.compile(r'.*cultivar=|.*ecotype=|.*strain=|.*breed=')
+            cultivar = pat.sub('', line)
+        else:
+          break
+      readFile.close()
+      if len(isolate) and len(cultivar):
+        extraStrings = f"{cultivar} {isolate}{asmType} {yearDate}"
+      elif len(isolate):
+        extraStrings = f"{isolate}{asmType} {yearDate}"
+      elif len(cultivar):
+        extraStrings = f"{cultivar}{asmType} {yearDate}"
+      if len(extraStrings) < 1:
+        pat = re.compile(r'^ +')
+        extraStrings = pat.sub('', f"{asmType} {yearDate}")
+      
+      return asmType, sciName, orgName, yearDate, isolate, cultivar, extraStrings
+    except FileNotFoundError:
+      print(f"Error: File '{asmRpt}' not found.", file=sys.stderr)
+      sys.exit(1)
+
+### inFh is a file handle to a list of assembly identifiers,
+###  might be a multi column file, ignore anything after the first column
+def processList(inFh):
+    ncbiSrc = "/hive/data/outside/ncbi/genomes"
+    for asmId in inFh:
+      asmId = asmId.rstrip()	# eliminate new line at end of string
+      if asmId.startswith('#'):	# ignore comment lines
+        print(f"{asmId}", file=sys.stderr)
+        continue
+      asmId = asmId.split(' ',1)[0]	# only the first column
+      p = asmId.split('_', 2)	# split on _ maximum of three results in list
+      accession = p[0] + '_' + p[1]
+      nPart = p[1]
+      by3 = [nPart[i:i+3] for i in range(0, len(nPart), 3)]
+      gcX = p[0]
+      d0 = by3[0]
+      d1 = by3[1]
+      d2 = by3[2]
+      asmName = "na"
+      if (len(p) == 3):
+        asmName = p[2]
+#      print(f"{accession}\t{asmName}\t{asmId}")
+      srcDir = f"{ncbiSrc}/{gcX}/{d0}/{d1}/{d2}/{asmId}"
+      asmRpt = f"{srcDir}/{asmId}_assembly_report.txt"
+#      print(f"{asmRpt}", file=sys.stderr)
+      if not os.path.isfile(asmRpt):
+        print(f"{asmId}\tmissing '{asmRpt}'", file=sys.stderr)
+        continue
+      asmNameAbbrev = asmName
+      hapX = ""
+      abbrevPattern = r'hap1|hap2|alternate_haplotype|primary_haplotype'
+      if re.search(abbrevPattern, asmNameAbbrev):
+        hapX = asmNameAbbrev
+        ab = asmNameAbbrev.replace('.hap1','')
+        asmNameAbbrev = ab.replace('.hap2','')
+        ab = asmNameAbbrev.replace('_alternate_haplotype','')
+        asmNameAbbrev = ab.replace('_primary_haplotype','')
+        hx = asmNameAbbrev + '.'
+        ab = hapX.replace(hx,'')
+        hapX = ab
+      asmType, sciName, orgName, yearDate, isolate, cultivar, extraStrings = extractNames(asmRpt, hapX)
+      outStr = f"{asmId}\t{orgName}"
+      if len(extraStrings):
+        pat = re.compile(r'[()\[\]+*]')
+        extraStrings = pat.sub('', extraStrings)
+        pat = re.compile(r'\?')
+        extraStrings = pat.sub(' ', extraStrings)
+        extraStrings = re.sub(re.escape(asmNameAbbrev) + ' ', '', extraStrings)
+        extraStrings = re.sub(r'\s+', ' ', extraStrings)
+        words = extraStrings.split()
+        orgList = orgName.split()
+        orgName = " ".join([orgWord for orgWord in orgList if orgWord not in words])
+        orgName = re.sub(r'=|\s+$|^\s+', '', orgName)
+        orgName = re.sub(r'\s+', ' ', orgName).strip()
+        if len(orgName):
+          outStr = f"{asmId}\t{orgName} ({extraStrings})"
+        else:
+          outStr = f"{asmId}\t{extraStrings}"
+
+      print(outStr)
+
+#      print(f"asmType: '{asmType}', {sciName}, {orgName}, {yearDate}, {isolate}, {cultivar} {extraStrings}")
+
+def main():
+    site.ENABLE_USER_SITE = False
+    if len(sys.argv) != 2:
+#        print(f"sys.path: {sys.path}")
+        print("Usage: ./commonNames.py <filename|stdin>")
+        print("e.g.: ./commonNames.py some.asmId.list")
+        print("    where some.asmId.list is a simple list of NCBI assembly ids")
+        print("    will look up the common names for each ID from the assembly_report files")
+        sys.exit(1)
+
+    # Ensure stdout and stderr use UTF-8 encoding
+    setUtf8Encoding()
+
+    listFile = sys.argv[1]
+
+    if listFile == 'stdin':
+        fileIn = sys.stdin
+    else:
+        try:
+          fileIn = open(listFile, 'r')
+        except FileNotFoundError:
+          print(f"Error: File '{listFile}' not found.", file=sys.stderr)
+          sys.exit(1)
+
+    processList(fileIn)
+
+if __name__ == "__main__":
+    main()