src/hg/makeDb/doc/asmHubs/tsvToJson.py 5c3fb7c203029aae27a69a7104b690b30da7f558

5c3fb7c203029aae27a69a7104b690b30da7f558
hiram
  Mon Sep 23 13:05:57 2024 -0700
fixup to allow UCSC database names and reading dbGcaGcf.tsf to get GCA GCF accessions on those UCSC names refs #34337

diff --git src/hg/makeDb/doc/asmHubs/tsvToJson.py src/hg/makeDb/doc/asmHubs/tsvToJson.py
index 9548b22..3ab387a 100755
--- src/hg/makeDb/doc/asmHubs/tsvToJson.py
+++ src/hg/makeDb/doc/asmHubs/tsvToJson.py
@@ -1,22 +1,23 @@
 #!/cluster/software/bin/python3
 
 import os
 import sys
 import re
 import site
 import json
+import subprocess
 
 ####################################################################
 ### this is kinda like an environment setting, it gets everything
 ### into a UTF-8 reading mode
 ####################################################################
 def setUtf8Encoding():
     """
     Set UTF-8 encoding for stdin, stdout, and stderr in Python.
     """
     if sys.stdout.encoding != 'utf-8':
         sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf-8', buffering=1)
     if sys.stderr.encoding != 'utf-8':
         sys.stderr = open(sys.stderr.fileno(), mode='w', encoding='utf-8', buffering=1)
 
 
@@ -171,59 +172,90 @@
       elif len(isolate):
         extraStrings = f"{isolate}{asmType} {yearDate}"
       elif len(cultivar):
         extraStrings = f"{cultivar}{asmType} {yearDate}"
       if len(extraStrings) < 1:
         pat = re.compile(r'^ +')
         extraStrings = pat.sub('', f"{asmType} {yearDate}")
       
       return asmType, sciName, orgName, yearDate, isolate, cultivar, extraStrings, genBankAcc, refSeqAcc, identical, taxId
     except FileNotFoundError:
       print(f"Error: File '{asmRpt}' not found.", file=sys.stderr)
       sys.exit(1)
 
 ### inFh is a file handle to a list of assembly identifiers,
 ###  might be a multi column file, ignore anything after the first column
-def processList(inFh):
+def processList(inFh, dbGcaGcfDict):
     outStr = f"taxId\tasmId\tgenBankAcc\trefSeqAcc\tidentical\tsciName\tcomName"
     print(outStr, file=sys.stderr)
     schema = [
         {"name": "taxId", "type": "integer"},
         {"name": "asmId", "type": "string"},
         {"name": "genBank", "type": "string"},
         {"name": "refSeq", "type": "string"},
         {"name": "identical", "type": "boolean"},
         {"name": "sciName", "type": "string"},
         {"name": "comName", "type": "string"},
         {"name": "ucscBrowser", "type": "string"},
     ]
     jsonSchema = {
       "columns": [[obj["name"], obj["type"]] for obj in schema]
     }
 ##    jsonOut = json.dumps(jsonSchema)
 ##    print(jsonOut)
     ### accumulating array rows
     dataOut = []
     ncbiSrc = "/hive/data/outside/ncbi/genomes"
     for lineIn in inFh:
       #  rstrip() # eliminates new line at end of string
       splitByTab = lineIn.rstrip().split("\t")
       asmId = splitByTab[0]
       comName = splitByTab[1]
       if asmId.startswith('#'):	# ignore comment lines
         print(f"{asmId}", file=sys.stderr)
         continue
+# felCat8 *** NOT GC..
+# 0 ['felCat8'
+# 1 'Nov. 2014 (ICGSC Felis_catus_8.0/felCat8)'
+# 2 '/gbdb/felCat8'
+# 3 'Cat'
+# 4 'chrA2:53484451-53597660'
+# 5 '1'
+# 6 '3240'
+# 7 'Cat'
+# 8 'Felis catus'
+# 9 '/gbdb/felCat8/html/description.html'
+# 10 '0'
+# 11 '0'
+# 12 'International Cat Genome Sequencing Consortium'
+# 13 '9685']
+      if not asmId.startswith('GC'):
+        sql = f"select * from dbDb where name='{asmId}'"
+        hgsqlReturn = subprocess.run(['hgsql', 'hgcentraltest', '-N', '-e', sql], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
+        dbDb = hgsqlReturn.stdout.strip().split('\t')
+        taxId = dbDb[13]
+        genBankAcc = "n/a"
+        refSeqAcc = "n/a"
+        identical = False
+        if asmId in dbGcaGcfDict:
+            genBankAcc = dbGcaGcfDict[asmId][0]
+            refSeqAcc = dbGcaGcfDict[asmId][1]
+            identical = dbGcaGcfDict[asmId][2]
+        sciName = dbDb[8]
+        comName = f"{dbDb[7]} - {dbDb[1]}"
+        ucscBrowser = f"https://genome.ucsc.edu/cgi-bin/hgTracks?db={asmId}"
+      else:
         asmId = asmId.split(' ',1)[0]	# only the first column
         p = asmId.split('_', 2)	# split on _ maximum of three results in list
         accession = p[0] + '_' + p[1]
         nPart = p[1]
         by3 = [nPart[i:i+3] for i in range(0, len(nPart), 3)]
         gcX = p[0]
         d0 = by3[0]
         d1 = by3[1]
         d2 = by3[2]
         asmName = "na"
         if (len(p) == 3):
           asmName = p[2]
 #      print(f"{accession}\t{asmName}\t{asmId}")
         srcDir = f"{ncbiSrc}/{gcX}/{d0}/{d1}/{d2}/{asmId}"
         asmRpt = f"{srcDir}/{asmId}_assembly_report.txt"
@@ -244,72 +276,96 @@
           ab = hapX.replace(hx,'')
           hapX = ab
         asmType, sciName, orgName, yearDate, isolate, cultivar, extraStrings, genBankAcc, refSeqAcc, identical, taxId = extractNames(asmRpt, hapX)
         if len(extraStrings):
           pat = re.compile(r'[()\[\]+*]')
           extraStrings = pat.sub('', extraStrings)
           pat = re.compile(r'\?')
           extraStrings = pat.sub(' ', extraStrings)
           extraStrings = re.sub(re.escape(asmNameAbbrev) + ' ', '', extraStrings)
           extraStrings = re.sub(r'\s+', ' ', extraStrings)
           words = extraStrings.split()
           orgList = orgName.split()
           orgName = " ".join([orgWord for orgWord in orgList if orgWord not in words])
           orgName = re.sub(r'=|\s+$|^\s+', '', orgName)
           orgName = re.sub(r'\s+', ' ', orgName).strip()
+          ucscBrowser = f"https://genome.ucsc.edu/h/{accession}"
+
       outStr = f"{taxId}\t{asmId}\t{genBankAcc}\t{refSeqAcc}\t{identical}\t{sciName}\t{comName}"
 
-        ucscBrowser = f"https://genome.ucsc.edu/h/{accession}"
       rowData = {
           "taxId": int(taxId),
           "asmId": asmId,
           "genBank": genBankAcc,
           "refSeq": refSeqAcc,
           "identical": identical,
           "sciName": sciName,
           "comName": comName,
           "ucscBrowser": ucscBrowser,
       }
       dataOut.append(rowData)
       print(outStr, file=sys.stderr)
 
-    dataJson = {
-      "data": [[obj["taxId"], obj["asmId"], obj["genBank"], obj["refSeq"], obj["identical"], obj["sciName"], obj["comName"], obj["ucscBrowser"]] for obj in dataOut]
-    }
 #    schemaPlusData = schema + dataOut
     schemaPlusData = {
       "columns": schema,
       "data": dataOut,
     }
     jsonOut = json.dumps(schemaPlusData)
     print(jsonOut)
 
+# read three column file: db  gca  gcf
+# for UCSC db to GCA and GCF accession equivalence
+def readDbGcaGcf(tsvFile):
+    dbDict = {}
+
+    try:
+        fh = open(tsvFile, 'r', encoding='latin-1')
+    except FileNotFoundError:
+        print(f"Error: File '{tsvFile}' not found.", file=sys.stderr)
+        sys.exit(1)
+
+    for line in fh:
+        if line.startswith('#'):
+            continue
+        db, gca, gcf, identical = line.strip().split('\t')
+        ident = False
+        if identical.lower().startswith('yes'):
+            ident = True
+        dbDict[db] = [gca, gcf, ident]
+
+    return dbDict
+
 #      print(f"asmType: '{asmType}', {sciName}, {orgName}, {yearDate}, {isolate}, {cultivar} {extraStrings}")
 
 def main():
-    site.ENABLE_USER_SITE = False
     if len(sys.argv) != 2:
 #        print(f"sys.path: {sys.path}")
         print("Usage: ./commonNames.py <filename|stdin>", file=sys.stderr)
         print("e.g.: ./commonNames.py some.asmId.list", file=sys.stderr)
         print("    where some.asmId.list is a simple list of NCBI assembly ids", file=sys.stderr)
         print("    will look up the common names for each ID from the assembly_report files", file=sys.stderr)
         sys.exit(1)
 
     # Ensure stdout and stderr use UTF-8 encoding
     setUtf8Encoding()
 
     listFile = sys.argv[1]
 
+    dbGcaGcfDict = {}
+    dbGcaGcf = "dbGcaGcf.tsv"
+    if os.path.isfile(dbGcaGcf):
+        dbGcaGcfDict = readDbGcaGcf(dbGcaGcf)
+
     if listFile == 'stdin':
         fileIn = sys.stdin
     else:
         try:
           fileIn = open(listFile, 'r', encoding='latin-1')
         except FileNotFoundError:
           print(f"Error: File '{listFile}' not found.", file=sys.stderr)
           sys.exit(1)
 
-    processList(fileIn)
+    processList(fileIn, dbGcaGcfDict)
 
 if __name__ == "__main__":
     main()