src/hg/hubApi/assemblyList.py fa3849abfc8a9e5ac4f5e44a2f8115701446ca3f

fa3849abfc8a9e5ac4f5e44a2f8115701446ca3f
hiram
  Thu Feb 5 15:01:40 2026 -0800
now adding haplotype CSV list when available refs #34736

diff --git src/hg/hubApi/assemblyList.py src/hg/hubApi/assemblyList.py
index a21a063783b..dc73bdc62c9 100755
--- src/hg/hubApi/assemblyList.py
+++ src/hg/hubApi/assemblyList.py
@@ -1,25 +1,28 @@
 #!/cluster/software/bin/python3
 
 import subprocess
 import locale
+import gzip
 import sys
 import re
 import os
 import csv
 import requests
+from pathlib import Path
 from io import StringIO
+from datetime import datetime, UTC
 
 # priorities derived from the 'Monthly usage stats' report
 # from the qateam cron job running on the first day of the month
 # special top priorities
 topPriorities = {
     'hg38': 1,
     'mm39': 2,
     'hs1': 3,
     'hg19': 4,
     'mm10': 5,
     'dm6': 6,
     'danRer11': 7,
     'mm9': 8,
     'hetGla2': 9,
     'rn6': 10,
@@ -41,30 +44,49 @@
 
 ####################################################################
 ### this is kinda like an environment setting, it gets everything
 ### into a UTF-8 reading mode
 ####################################################################
 def set_utf8_encoding():
     """
     Set UTF-8 encoding for stdin, stdout, and stderr in Python.
     """
     if sys.stdout.encoding != 'utf-8':
         sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf-8', buffering=1)
     if sys.stderr.encoding != 'utf-8':
         sys.stderr = open(sys.stderr.fileno(), mode='w', encoding='utf-8', buffering=1)
 
 ####################################################################
+### read in haplotype CSV listing if the file exists
+####################################################################
+def readHaplotypes(filePath):
+    hapList = {}
+    hapCount = 0
+    if Path(filePath).exists():
+        with gzip.open(filePath, "rt") as fh:
+            for line in fh:
+                line = line.rstrip("\n")
+                if not line:
+                    continue
+                asmId, csvHaplotypes = line.split("\t", 1)
+                parts = asmId.split('_', 2)
+                accession = '_'.join(parts[:2])
+                hapList[accession] = csvHaplotypes
+                hapCount += 1
+    print(f"# haplotype list: {hapCount}")
+    return hapList
+
 ### add aliases to description when they are not there yet
 ####################################################################
 def addAliases(browserName, aliasData, descr):
     description = descr
     if browserName in aliasData:
         aliasList = aliasData[browserName]
         descriptionSetLc = set(word.lower() for word in description.split())
         for alias in aliasList:
             if alias.lower() not in descriptionSetLc:
                 descriptionSetLc.add(alias.lower())
                 description += " " + alias
     return description
 
 ####################################################################
 ### reading hgcentral.asmAlias table
@@ -785,30 +807,31 @@
         print("    UCSC database names to GenArk clades, in source tree hubApi/")
         print("This script is going to read the dbDb.hgcentral table, and the file")
         print("    UCSC_GI.assemblyHubList.txt from hgdownload.")
         print("Writing an output file assemblyList.tsv to be loaded into")
         print("    assemblyList.hgcentral.  The output file needs to be sorted")
         print("    sort -k2,2n assemblyList.tsv before table load.")
         sys.exit(255)
 
     # Ensure stdout and stderr use UTF-8 encoding
     set_utf8_encoding()
 
     initCladePriority()
 
     dbDbNameCladeFile = sys.argv[1]
 
+    csvHaplotypes = readHaplotypes("/hive/data/outside/ncbi/genomes/reports/haploscan/csvHaplotypes.tsv.gz")
     # the correspondence of dbDb names to GenArk clade categories
     dbDbClades, dbDbYears, dbDbNcbi = dbDbCladeList(dbDbNameCladeFile)
     # Get the dbDb.hgcentral table data
     rawData = dbDbData()
     dbDbItems = processDbDbData(rawData, dbDbClades, dbDbYears, dbDbNcbi)
     aliasData = asmAliasData()
 
     # read the GenArk data from hgdownload into a list of dictionaries
     genArkUrl = "https://hgdownload.soe.ucsc.edu/hubs/UCSC_GI.assemblyHubList.txt"
     genArkItems = readGenArkData(genArkUrl)
 
     establishPriorities(dbDbItems, genArkItems)
 
     asmIdClade = readAsmIdClade()
     commonNames = allCommonNames()
@@ -859,85 +882,90 @@
         if "na" not in gcAccession:
             if gcAccession in allStatus:
               stat = allStatus[gcAccession]
               refSeqCategory = stat['refSeqCategory'].lower()
               versionStatus = stat['versionStatus'].lower()
               assemblyLevel = stat['assemblyLevel'].lower()
             if gcAccession not in description:
               description += " " + gcAccession
         ### add alias names to description if they are not already there
         description = addAliases(dbDbName, aliasData, description)
 
         descr = f"{entry['sourceName']} {entry['taxId']} {description}"
         if year not in organism and year not in descr:
             descr = f"{entry['sourceName']} {entry['taxId']} {entry['description']} {year}"
         description = re.sub(r'\s+', ' ', descr).strip()
-        outLine =f"{entry['name']}\t{priority}\t{organism}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\t\n"
+        haplotypes = csvHaplotypes.get(entry['name'], "")
+        outLine =f"{entry['name']}\t{priority}\t{organism}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\t{haplotypes}\n"
         fileOut.write(outLine)
         itemCount += 1
 
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\tdbDb count: {itemCount:4}")
 
     itemCount = 0
     # Print the GenArk data
     for entry in genArkItems:
         gcAccession = entry['gcAccession']
         if gcAccession in allPriorities:
             priority = allPriorities[gcAccession]
         else:
             print("no priority for ", gcAccession)
             sys.exit(255)
 
         hubPath = genarkPath(gcAccession)
         commonName = entry['commonName']
         clade = entry['clade']
         year = entry['year']
+        if year == 0:
+           year = datetime.now(UTC).strftime("%Y-%m-%d")
         descr = f"{entry['asmName']} {entry['taxId']}"
         if year not in commonName and year not in descr:
             descr = f"{entry['asmName']} {entry['taxId']} {year}"
         ### add alias names to description if they are not already there
         descr = addAliases(gcAccession, aliasData, descr)
         description = re.sub(r'\s+', ' ', descr).strip()
         refSeqCategory = entry['refSeqCategory'].lower()
         versionStatus = entry['versionStatus'].lower()
         assemblyLevel = entry['assemblyLevel'].lower()
-        outLine = f"{entry['gcAccession']}\t{priority}\t{commonName.encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t{hubPath}\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\t\n"
+        haplotypes = csvHaplotypes.get(entry['gcAccession'], "")
+        outLine = f"{entry['gcAccession']}\t{priority}\t{commonName.encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t{hubPath}\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\t{haplotypes}\n"
         fileOut.write(outLine)
         itemCount += 1
 
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\tgenArk count: {itemCount:4}")
 
     incrementPriority = len(allPriorities) + 1
     print("# incrementing priorities from: ", incrementPriority)
 
     itemCount = 0
     # Print the refSeq/genBank data
     for entry in refSeqGenBankSorted:
         gcAccession = entry['gcAccession']
         commonName = entry['commonName']
         scientificName = entry['scientificName']
         asmName = entry['asmName']
         clade = entry['clade']
         year = entry['year']
         refSeqCategory = entry['refSeqCategory'].lower()
         versionStatus = entry['versionStatus'].lower()
         assemblyLevel = entry['assemblyLevel'].lower()
         descr = f"{asmName} {entry['taxId']} {entry['other']}"
         if year not in commonName and year not in descr:
             descr = f"{asmName} {entry['taxId']} {entry['other']} {year}"
         ### add alias names to description if they are not already there
         descr = addAliases(gcAccession, aliasData, descr)
         description = re.sub(r'\s+', ' ', descr).strip()
-        outLine = f"{gcAccession}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\t\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\t\n"
+        haplotypes = csvHaplotypes.get(entry['gcAccession'], "")
+        outLine = f"{gcAccession}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\t\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\t{haplotypes}\n"
         fileOut.write(outLine)
         incrementPriority += 1
         itemCount += 1
 
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\trefSeq + genbank count: {itemCount:4}")
 
     fileOut.close()
 
 if __name__ == "__main__":
     main()