72dbdcd6baa45774ed51ce6640d2cb19c7f5c583
hiram
  Thu Jul 31 15:12:01 2025 -0700
add in assembly alias names in the "description" so they can also be matched refs #33720

diff --git src/hg/hubApi/assemblyList.py src/hg/hubApi/assemblyList.py
index 89ca7fa50f5..50de7c34705 100755
--- src/hg/hubApi/assemblyList.py
+++ src/hg/hubApi/assemblyList.py
@@ -26,30 +26,79 @@
 cladePrio = {}
 
 ####################################################################
 ### this is kinda like an environment setting, it gets everything
 ### into a UTF-8 reading mode
 ####################################################################
 def set_utf8_encoding():
     """
     Set UTF-8 encoding for stdin, stdout, and stderr in Python.
     """
     if sys.stdout.encoding != 'utf-8':
         sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf-8', buffering=1)
     if sys.stderr.encoding != 'utf-8':
         sys.stderr = open(sys.stderr.fileno(), mode='w', encoding='utf-8', buffering=1)
 
+####################################################################
+### add aliases to description when they are not there yet
+####################################################################
+def addAliases(browserName, aliasData, descr):
+    description = descr
+    if browserName in aliasData:
+        aliasList = aliasData[browserName]
+        descriptionSetLc = set(word.lower() for word in description.split())
+        for alias in aliasList:
+            if alias.lower() not in descriptionSetLc:
+                descriptionSetLc.add(alias.lower())
+                description += " " + alias
+    return description
+
+####################################################################
+### reading hgcentral.asmAlias table
+####################################################################
+def asmAliasData():
+    # Run the MySQL command and capture the output as bytes
+    result = subprocess.run(
+        ["hgsql", "-hgenome-centdb", "-N", "-e", "SELECT alias,browser FROM asmAlias;", "hgcentral"],
+        stdout=subprocess.PIPE, stderr=subprocess.PIPE
+    )
+    if result.returncode != 0:
+        print(f"Error executing MySQL command: {result.stderr.decode('utf-8')}")
+        exit(1)
+    # Decode the output from bytes to string using utf-8
+    decoded = result.stdout.decode('latin-1')
+    # Split the data into lines (rows)
+    rows = decoded.strip().split('\n')
+
+    aliasData = {}
+
+    aliasCount = 0
+    browserCount = 0
+
+    for row in rows:
+        columns = row.split('\t')	# Split each row into columns
+        alias = columns[0]
+        browser = columns[1]
+        if browser not in aliasData:
+            aliasData[browser] = []
+            browserCount += 1
+        aliasData[browser].append(alias)
+        aliasCount += 1
+
+    print(f"# counted {aliasCount} aliases for {browserCount} 'browsers'")
+    return aliasData
+
 ####################################################################
 ### reading hgcentral.dbDb table
 ####################################################################
 def dbDbData():
     # Run the MySQL command and capture the output as bytes
     result = subprocess.run(
         ["hgsql", "-hgenome-centdb", "-N", "-e", "SELECT name,scientificName,organism,taxId,sourceName,description FROM dbDb WHERE active=1;", "hgcentral"],
         stdout=subprocess.PIPE, stderr=subprocess.PIPE
     )
     if result.returncode != 0:
         print(f"Error executing MySQL command: {result.stderr.decode('utf-8')}")
         exit(1)
     # Decode the output from bytes to string using utf-8
     return result.stdout.decode('latin-1')
 
@@ -510,31 +559,31 @@
     mammalList = extractClade('mammals', genArk)
 
     versionScan = {}	# key is dbDb name without number version extension,
                         # value is highest version number seen for this bare
                         # name
     highestVersion = {}	# key is dbDb name without number version extension,
 			# value is the full dbDb name for the highest version
                         # of this dbDb name
     allDbDbNames = {}	# key is the full dbDb name, value is its version
 
     itemCount = 0
     # scanning the dbDb entries, figure out the highest version number
     #    of each name
     for item in dbDb:
         dbDbName = item['name']
-        splitMatch = re.match("([a-zA-Z]+)(\d+)", dbDbName)
+        splitMatch = re.match(r"([a-zA-Z]+)(\d+)", dbDbName)
         if splitMatch:
             allDbDbNames[dbDbName] = splitMatch.group(2)
             itemCount += 1
             if splitMatch.group(1) in versionScan:
                 if float(splitMatch.group(2)) > float(versionScan[splitMatch.group(1)]):
                     versionScan[splitMatch.group(1)] = splitMatch.group(2)
                     highestVersion[splitMatch.group(1)] = dbDbName
             else:
                 versionScan[splitMatch.group(1)] = splitMatch.group(2)
                 highestVersion[splitMatch.group(1)] = dbDbName
         else:
             print("### dbDb name does not split: ", dbDbName)
             allDbDbNames[dbDbName] = 0
             itemCount += 1
 
@@ -648,31 +697,31 @@
         gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1]
         if not gcAcc.startswith("GCA_"):
             continue
         if gcAcc not in allPriorities:
             allPriorities[gcAcc] = priorityCounter
             priorityCounter += 1
             itemCount += 1
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\tgenArk GCA mammals count: {itemCount:4}")
 
     itemCount = 0
     # dbDb is in cladeOrder, process in that order, find highest versions
     for item in dbDb:
         dbDbName = item['name']
         if dbDbName not in allPriorities:
-            splitMatch = re.match("([a-zA-Z]+)(\d+)", dbDbName)
+            splitMatch = re.match(r"([a-zA-Z]+)(\d+)", dbDbName)
             if splitMatch:
                 noVersion = splitMatch.group(1)
                 version = allDbDbNames[dbDbName]
                 highVersion = versionScan[noVersion]
                 if highVersion == version:
                     allPriorities[dbDbName] = priorityCounter
                     priorityCounter += 1
                     itemCount += 1
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\tdbDb highest versions count: {itemCount:4}")
 
     itemCount = 0
     # GCF RefSeq from GenArk next priority
     for item in genArk:
         gcAccession = item['gcAccession']
@@ -727,30 +776,31 @@
         print("    sort -k2,2n assemblyList.tsv before table load.")
         sys.exit(255)
 
     # Ensure stdout and stderr use UTF-8 encoding
     set_utf8_encoding()
 
     initCladePriority()
 
     dbDbNameCladeFile = sys.argv[1]
 
     # the correspondence of dbDb names to GenArk clade categories
     dbDbClades, dbDbYears, dbDbNcbi = dbDbCladeList(dbDbNameCladeFile)
     # Get the dbDb.hgcentral table data
     rawData = dbDbData()
     dbDbItems = processDbDbData(rawData, dbDbClades, dbDbYears, dbDbNcbi)
+    aliasData = asmAliasData()
 
     # read the GenArk data from hgdownload into a list of dictionaries
     genArkUrl = "https://hgdownload.soe.ucsc.edu/hubs/UCSC_GI.assemblyHubList.txt"
     genArkItems = readGenArkData(genArkUrl)
 
     establishPriorities(dbDbItems, genArkItems)
 
     asmIdClade = readAsmIdClade()
     commonNames = allCommonNames()
     print("# all common names: ", len(commonNames))
 
     refSeqList, refSeqYears, refSeqStatus = readAsmSummary("refseq.txt", allPriorities, commonNames, asmIdClade)
     print("# refSeq assemblies: ", len(refSeqList))
     refSeqListHist, refSeqYearsHist, refSeqStatusHist = readAsmSummary("refseq_historical.txt", allPriorities, commonNames, asmIdClade)
     print("# refSeq historical assemblies: ", len(refSeqListHist))
@@ -788,86 +838,92 @@
         year = entry['year']
         gcAccession = entry['gcAccession']
         description = entry['description']
         refSeqCategory = ""
         versionStatus = ""
         assemblyLevel = ""
         organism = entry['organism']
         if "na" not in gcAccession:
             if gcAccession in allStatus:
               stat = allStatus[gcAccession]
               refSeqCategory = stat['refSeqCategory'].lower()
               versionStatus = stat['versionStatus'].lower()
               assemblyLevel = stat['assemblyLevel'].lower()
             if gcAccession not in description:
               description += " " + gcAccession
+        ### add alias names to description if they are not already there
+        description = addAliases(dbDbName, aliasData, description)
 
         descr = f"{entry['sourceName']} {entry['taxId']} {description}"
         if year not in organism and year not in descr:
             descr = f"{entry['sourceName']} {entry['taxId']} {entry['description']} {year}"
         description = re.sub(r'\s+', ' ', descr).strip()
         outLine =f"{entry['name']}\t{priority}\t{organism}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\n"
         fileOut.write(outLine)
         itemCount += 1
 
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\tdbDb count: {itemCount:4}")
 
     itemCount = 0
     # Print the GenArk data
     for entry in genArkItems:
         gcAccession = entry['gcAccession']
         if gcAccession in allPriorities:
             priority = allPriorities[gcAccession]
         else:
             print("no priority for ", gcAccession)
             sys.exit(255)
 
         hubPath = genarkPath(gcAccession)
         commonName = entry['commonName']
         clade = entry['clade']
         year = entry['year']
         descr = f"{entry['asmName']} {entry['taxId']}"
         if year not in commonName and year not in descr:
             descr = f"{entry['asmName']} {entry['taxId']} {year}"
+        ### add alias names to description if they are not already there
+        descr = addAliases(gcAccession, aliasData, descr)
         description = re.sub(r'\s+', ' ', descr).strip()
         refSeqCategory = entry['refSeqCategory'].lower()
         versionStatus = entry['versionStatus'].lower()
         assemblyLevel = entry['assemblyLevel'].lower()
         outLine = f"{entry['gcAccession']}\t{priority}\t{commonName.encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t{hubPath}\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\n"
         fileOut.write(outLine)
         itemCount += 1
 
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\tgenArk count: {itemCount:4}")
 
     incrementPriority = len(allPriorities) + 1
     print("# incrementing priorities from: ", incrementPriority)
 
     itemCount = 0
     # Print the refSeq/genBank data
     for entry in refSeqGenBankSorted:
         gcAccession = entry['gcAccession']
         commonName = entry['commonName']
         scientificName = entry['scientificName']
         asmName = entry['asmName']
         clade = entry['clade']
         year = entry['year']
         refSeqCategory = entry['refSeqCategory'].lower()
         versionStatus = entry['versionStatus'].lower()
         assemblyLevel = entry['assemblyLevel'].lower()
         descr = f"{asmName} {entry['taxId']} {entry['other']}"
         if year not in commonName and year not in descr:
             descr = f"{asmName} {entry['taxId']} {entry['other']} {year}"
+        ### add alias names to description if they are not already there
+        descr = addAliases(gcAccession, aliasData, descr)
         description = re.sub(r'\s+', ' ', descr).strip()
         outLine = f"{gcAccession}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\t\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\n"
         fileOut.write(outLine)
         incrementPriority += 1
         itemCount += 1
 
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\trefSeq + genbank count: {itemCount:4}")
 
     fileOut.close()
 
 if __name__ == "__main__":
     main()