src/hg/hubApi/genomePriority.py 7009c18457198f53021f03b5f8ad4e20f7b51baa

7009c18457198f53021f03b5f8ad4e20f7b51baa
hiram
  Wed Jul 10 11:04:03 2024 -0700
this is most likely complete, it does not output in priority order, outtput needs to be sorted refs #32897

diff --git src/hg/hubApi/genomePriority.py src/hg/hubApi/genomePriority.py
index 4f96c97..3760c80 100755
--- src/hg/hubApi/genomePriority.py
+++ src/hg/hubApi/genomePriority.py
@@ -4,31 +4,31 @@
 import locale
 import sys
 import re
 import os
 import csv
 import requests
 from io import StringIO
 
 # special top priorities 
 topPriorities = {
     'hg38': 1,
     'mm39': 2,
     'hs1': 3,
 }
 
-### key will be dbDb name, value will be priority number
+### key will be dbDb/GCx name, value will be priority number
 allPriorities = {}
 
 priorityCounter = len(topPriorities) + 1
 
 # key is clade, value is priority, will be initialized
 # by initCladePriority() function
 cladePrio = {}
 
 ####################################################################
 ### this is kinda like an environment setting, it gets everything
 ### into a UTF-8 reading mode
 ####################################################################
 def set_utf8_encoding():
     """
     Set UTF-8 encoding for stdin, stdout, and stderr in Python.
@@ -557,36 +557,41 @@
 
     itemCount = 0
     # and the GCA/GenBank mammals
     for asmId, commonName in mammalList.items():
         gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1]
         if not gcAcc.startswith("GCA_"):
             continue
         if gcAcc not in allPriorities:
             allPriorities[gcAcc] = priorityCounter
             priorityCounter += 1
             itemCount += 1
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\tgenArk GCA mammals count: {itemCount:4}")
  
     itemCount = 0
-    # the rest of the highest versions of each unique dbDb name
-    sortByValue = sorted(versionScan.items(), key=lambda x: x[1], reverse=True)
-    for key in sortByValue:
-        highVersion = highestVersion[key[0]]
-        if highVersion not in allPriorities:
-            allPriorities[highVersion] = priorityCounter
+    # dbDb is in cladeOrder, process in that order, find highest versions
+    for item in dbDb:
+        dbDbName = item['name']
+        if dbDbName not in allPriorities:
+            splitMatch = re.match("([a-zA-Z]+)(\d+)", dbDbName)
+            if splitMatch:
+                noVersion = splitMatch.group(1)
+                version = allDbDbNames[dbDbName]
+                highVersion = versionScan[noVersion]
+                if highVersion == version:
+                    allPriorities[dbDbName] = priorityCounter
                     priorityCounter += 1
                     itemCount += 1
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\tdbDb highest versions count: {itemCount:4}")
 
     itemCount = 0
     # GCF RefSeq from GenArk next priority
     for item in genArk:
         gcAccession = item['gcAccession']
         if not gcAccession.startswith("GCF_"):
             continue
         if gcAccession not in allPriorities:
             allPriorities[gcAccession] = priorityCounter
             priorityCounter += 1
             itemCount += 1
@@ -618,44 +623,44 @@
 
 def notUsed():
     itemCount = 0
     # finally the rest of the database genomes names
     for dbName in sorted(allDbDbNames):
         if dbName not in allPriorities:
             allPriorities[dbName] = priorityCounter
             priorityCounter += 1
             itemCount += 1
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\tthe rest of dbDb count: {itemCount:4}")
 
 
 
 ####################################################################
+
 """
 table load procedure:
 
   hgsql -e 'DROP TABLE IF EXISTS genomePriority;' hgcentraltest
   hgsql hgcentraltest < ../lib/genomePriority.sql
   hgsql -e 'LOAD DATA LOCAL INFILE "genomePriority.tsv" INTO
 TABLE genomePriority;' hgcentraltest
   hgsql -e 'ALTER TABLE genomePriority
 ADD FULLTEXT INDEX gdIx
 (name, commonName, scientificName, clade, description);' \
   hgcentraltest
 
 """
-####################################################################
 
 ####################################################################
 def main():
     global priorityCounter
     global allPriorities
 
     if len(sys.argv) != 2:
         print("genomePriority.py - prepare genomePriority.tsv file from")
         print("    dbDb.hgcentral and UCSC_GI.assemblyHubList.txt file.\n")
         print("Usage: genomePriority.py dbDb.name.clade.tsv\n")
         print("the dbDb.name.clade.tsv file is a manually curated file to relate")
         print("    UCSC database names to GenArk clades, in source tree hubApi/")
         print("This script is going to read the dbDb.hgcentral table, and the file")
         print("    UCSC_GI.assemblyHubList.txt from hgdownload.")
         print("Writing an output file genomePriority.tsv to be loaded into")
@@ -697,50 +702,52 @@
     print("# sorted refSeq + genBank assemblies: ", len(refSeqGenBankSorted))
 
 
     outFile = "genomePriority.tsv"
     fileOut = open(outFile, 'w')
 
     totalItemCount = 0
     itemCount = 0
     # Print the dbDb data
     for entry in dbDbItems:
         dbDbName = entry['name']
         if dbDbName in allPriorities:
             priority = allPriorities[dbDbName]
         else:
             print("no priority for ", dbDbName)
+            sys.exit(255)
 
         clade = entry['clade']
 
         descr = f"{entry['sourceName']} {entry['description']}\n"
         description = re.sub(r'\s+', ' ', descr).strip()
         outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\n"
         fileOut.write(outLine)
         itemCount += 1
 
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\tdbDb count: {itemCount:4}")
 
     itemCount = 0
     # Print the GenArk data
     for entry in genArkItems:
         gcAccession = entry['gcAccession']
         if gcAccession in allPriorities:
             priority = allPriorities[gcAccession]
         else:
             print("no priority for ", gcAccession)
+            sys.exit(255)
 
         cleanName = removeNonAlphanumeric(entry['commonName'])
         clade = entry['clade']
         descr = f"{entry['asmName']}"
         description = re.sub(r'\s+', ' ', descr).strip()
         outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\n"
         fileOut.write(outLine)
         itemCount += 1
 
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\tgenArk count: {itemCount:4}")
 
     incrementPriority = len(allPriorities)
     print("# incrementing priorities from: ", incrementPriority)