src/hg/makeDb/doc/asmHubs/tsvToJson.py 9cf3f9dfbd2770a517953c7f43ffade95be9a9de

9cf3f9dfbd2770a517953c7f43ffade95be9a9de
hiram
  Tue Sep 24 15:26:53 2024 -0700
add the ucscBrowser link to the stderr output ref #34337

diff --git src/hg/makeDb/doc/asmHubs/tsvToJson.py src/hg/makeDb/doc/asmHubs/tsvToJson.py
index 3ab387a..695c804 100755
--- src/hg/makeDb/doc/asmHubs/tsvToJson.py
+++ src/hg/makeDb/doc/asmHubs/tsvToJson.py
@@ -173,31 +173,31 @@
         extraStrings = f"{isolate}{asmType} {yearDate}"
       elif len(cultivar):
         extraStrings = f"{cultivar}{asmType} {yearDate}"
       if len(extraStrings) < 1:
         pat = re.compile(r'^ +')
         extraStrings = pat.sub('', f"{asmType} {yearDate}")
       
       return asmType, sciName, orgName, yearDate, isolate, cultivar, extraStrings, genBankAcc, refSeqAcc, identical, taxId
     except FileNotFoundError:
       print(f"Error: File '{asmRpt}' not found.", file=sys.stderr)
       sys.exit(1)
 
 ### inFh is a file handle to a list of assembly identifiers,
 ###  might be a multi column file, ignore anything after the first column
 def processList(inFh, dbGcaGcfDict):
-    outStr = f"taxId\tasmId\tgenBankAcc\trefSeqAcc\tidentical\tsciName\tcomName"
+    outStr = f"# taxId\tasmId\tgenBankAcc\trefSeqAcc\tidentical\tsciName\tcomName\tucscBrowser"
     print(outStr, file=sys.stderr)
     schema = [
         {"name": "taxId", "type": "integer"},
         {"name": "asmId", "type": "string"},
         {"name": "genBank", "type": "string"},
         {"name": "refSeq", "type": "string"},
         {"name": "identical", "type": "boolean"},
         {"name": "sciName", "type": "string"},
         {"name": "comName", "type": "string"},
         {"name": "ucscBrowser", "type": "string"},
     ]
     jsonSchema = {
       "columns": [[obj["name"], obj["type"]] for obj in schema]
     }
 ##    jsonOut = json.dumps(jsonSchema)
@@ -278,31 +278,31 @@
         asmType, sciName, orgName, yearDate, isolate, cultivar, extraStrings, genBankAcc, refSeqAcc, identical, taxId = extractNames(asmRpt, hapX)
         if len(extraStrings):
           pat = re.compile(r'[()\[\]+*]')
           extraStrings = pat.sub('', extraStrings)
           pat = re.compile(r'\?')
           extraStrings = pat.sub(' ', extraStrings)
           extraStrings = re.sub(re.escape(asmNameAbbrev) + ' ', '', extraStrings)
           extraStrings = re.sub(r'\s+', ' ', extraStrings)
           words = extraStrings.split()
           orgList = orgName.split()
           orgName = " ".join([orgWord for orgWord in orgList if orgWord not in words])
           orgName = re.sub(r'=|\s+$|^\s+', '', orgName)
           orgName = re.sub(r'\s+', ' ', orgName).strip()
           ucscBrowser = f"https://genome.ucsc.edu/h/{accession}"
 
-      outStr = f"{taxId}\t{asmId}\t{genBankAcc}\t{refSeqAcc}\t{identical}\t{sciName}\t{comName}"
+      outStr = f"{taxId}\t{asmId}\t{genBankAcc}\t{refSeqAcc}\t{identical}\t{sciName}\t{comName}\t{ucscBrowser}"
 
       rowData = {
           "taxId": int(taxId),
           "asmId": asmId,
           "genBank": genBankAcc,
           "refSeq": refSeqAcc,
           "identical": identical,
           "sciName": sciName,
           "comName": comName,
           "ucscBrowser": ucscBrowser,
       }
       dataOut.append(rowData)
       print(outStr, file=sys.stderr)
 
 #    schemaPlusData = schema + dataOut