src/hg/hubApi/genomePriority.py d84ccb1cf4dfe34ec488134fa7e57cd588d369e2

d84ccb1cf4dfe34ec488134fa7e57cd588d369e2
hiram
  Mon Jul 8 19:16:17 2024 -0700
this is now ready to clean up refs #32897

diff --git src/hg/hubApi/genomePriority.py src/hg/hubApi/genomePriority.py
index addcb207..8a2b13e 100755
--- src/hg/hubApi/genomePriority.py
+++ src/hg/hubApi/genomePriority.py
@@ -33,30 +33,58 @@
 
 ####################################################################
 def dbDbData():
     # Run the MySQL command and capture the output as bytes
     result = subprocess.run(
         ["hgsql", "-hgenome-centdb", "-N", "-e", "SELECT name,scientificName,organism,taxId,sourceName,description FROM dbDb WHERE active=1;", "hgcentral"],
         stdout=subprocess.PIPE, stderr=subprocess.PIPE
     )
     if result.returncode != 0:
         print(f"Error executing MySQL command: {result.stderr.decode('utf-8')}")
         exit(1)
     # Decode the output from bytes to string using utf-8
     return result.stdout.decode('latin-1')
 
 ####################################################################
+def readAsmIdClade():
+    asmIdClade = {}
+
+    filePath = "/hive/data/outside/ncbi/genomes/reports/newAsm/asmId.clade.tsv"
+
+    with open(filePath, 'r', encoding='utf-8') as file:
+        reader = csv.reader(file, delimiter='\t')
+        for row in reader:
+            asmIdClade[row[0]] = row[1]
+
+    return asmIdClade
+
+####################################################################
+def allCommonNames():
+    commonNames = {}
+
+    filePath = "/hive/data/outside/ncbi/genomes/reports/allCommonNames/asmId.commonName.all.txt"
+
+    with open(filePath, 'r', encoding='utf-8') as file:
+        reader = csv.reader(file, delimiter='\t')
+        for row in reader:
+            asmId = row[0]
+            gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1]
+            commonNames[gcAcc] = row[1]
+
+    return commonNames
+
+####################################################################
 def readCommonNames(ncbiType):
     commonNames = {}
 
     filePath = "/hive/data/outside/ncbi/genomes/reports/" + ncbiType + "/allAssemblies.commonNames.tsv"
 
     with open(filePath, 'r', encoding='utf-8') as file:
         reader = csv.reader(file, delimiter='\t')
         for row in reader:
             commonNames[row[1]] = row[3]
 
     return commonNames
 
 ####################################################################
 """
      header definitions from assembly_summary_refseq.txt
@@ -85,95 +113,119 @@
 hasn't changed.
 
    2175 archaea
  360585 bacteria
     607 fungi
     414 invertebrate
     184 plant
      96 protozoa
     231 vertebrate_mammalian
     405 vertebrate_other
   14992 viral
 
 
 """
 
-def readAsmSummary(suffix, prioExists, comNames):
+def readAsmSummary(suffix, prioExists, comNames, asmIdClade):
     # read one of the NCBI files from
     # /hive/data/outside/ncbi/genomes/reports/assembly_summary_{suffix}
     # Initialize a list to hold the dictionaries
     dataList = []
     keys = [
+"primates",
+"mammals",
 "vertebrate_mammalian",
+"birds",
+"fish",
+"vertebrate",
 "vertebrate_other",
 "invertebrate",
-"plant",
+"plants",
 "fungi",
 "protozoa",
 "viral",
 "bacteria",
 "archaea",
+"other",
+"metagenomes",
     ]
     values = [
-'1',
-'2',
-'3',
-'4',
-'5',
-'6',
-'7',
-'8',
-'0',
+'01',
+'02',
+'03',
+'04',
+'05',
+'06',
+'07',
+'08',
+'09',
+'10',
+'11',
+'12',
+'13',
+'14',
+'15',
+'16',
     ]
     cladePrio = dict(zip(keys, values))
 
     filePath = "/hive/data/outside/ncbi/genomes/reports/assembly_summary_" + suffix
 
     with open(filePath, 'r', encoding='utf-8') as file:
         reader = csv.reader(file, delimiter='\t')
         for row in reader:
             if len(row) < 1:
                continue
             if row[0].startswith('#'):
                continue
             if row[0] in prioExists:
                continue
             if len(row) != 38:
                 print(f"ERROR: incorrect number of fields in {file}")
                 sys.exit(1)
             gcAccession = row[0]
             strain = re.sub(r'breed=', '', row[8])
             s0 = re.sub(r'cultivar=', '', strain)
             strain = re.sub(r'ecotype=', '', s0)
             s0 = re.sub(r'strain=', '', strain)
             strain = re.sub(r'na', '', s0)
             year = re.sub(r'/.*', '', row[14])
+            asmName = row[15]
+            asmId = gcAccession + "_" + asmName
             asmSubmitter = row[16]
             asmType = row[23]
 #            commonName = row[7]
             commonName = "n/a"
             if gcAccession in comNames:
                 commonName = comNames[gcAccession]
+            clade = row[24]	# almost like GenArk clades
+            if asmId in asmIdClade:	# specific GenArk clade
+                clade = asmIdClade[asmId]
+            if clade == "plant":
+                clade = "plants"
+            if clade not in cladePrio:
+                print("missing clade ", clade, " in cladePrio")
+                sys.exit(-1)
             dataDict = {
                 "gcAccession": gcAccession,
-                "asmName": row[15],
+                "asmName": asmName,
                 "scientificName": row[7],
                 "commonName": commonName,
                 "taxId": row[5],
-                "clade": row[24],	# almost like GenArk clades
+                "clade": clade,
                 "other": asmSubmitter + " " + strain + " " + asmType + " " + year,
-                "sortOrder": cladePrio[row[24]]
+                "sortOrder": cladePrio[clade],
             }
 
             utf8Encoded= {k: v.encode('utf-8', 'ignore').decode('utf-8') if isinstance(v, str) else v for k, v in dataDict.items()}
             # Append the dictionary to the list
             dataList.append(utf8Encoded)
     
     return sorted(dataList, key=lambda x: x['sortOrder'])
 
 ####################################################################
 ### given a URL to hgdownload file: /hubs/UCSC_GI.assemblyHubList.txt
 def readGenArkData(url):
     # Initialize a list to hold the dictionaries
     dataList = []
 
     response = requests.get(url)
@@ -556,100 +608,111 @@
     # Ensure stdout and stderr use UTF-8 encoding
     set_utf8_encoding()
 
     # the correspondence of dbDb names to GenArk clade categories
     dbDbClades = dbDbCladeList(dbDbNameCladeFile)
     # Get the dbDb.hgcentral table data
     rawData = dbDbData()
     dbDbItems = processDbDbData(rawData, dbDbClades)
 
     # read the GenArk data from hgdownload into a list of dictionaries
     genArkUrl = "https://hgdownload.soe.ucsc.edu/hubs/UCSC_GI.assemblyHubList.txt"
     genArkItems = readGenArkData(genArkUrl)
 
     establishPriorities(dbDbItems, genArkItems)
 
-    refSeqCommonNames = readCommonNames("refseq")
-    print("# refseq common names: ", len(refSeqCommonNames))
-
-    refSeqList = readAsmSummary("refseq.txt", allPriorities, refSeqCommonNames)
+    asmIdClade = readAsmIdClade()
+    commonNames = allCommonNames()
+    print("# all common names: ", len(commonNames))
+#    genBankCommonNames = readCommonNames("genbank")
+#    print("# genBank common names: ", len(genBankCommonNames))
 
+    refSeqList = readAsmSummary("refseq.txt", allPriorities, commonNames, asmIdClade)
     print("# refSeq assemblies: ", len(refSeqList))
+    genBankList = readAsmSummary("genbank.txt", allPriorities, commonNames, asmIdClade)
+    print("# genBank assemblies: ", len(genBankList))
+
+    refSeqGenBankList = refSeqList + genBankList
+    print("# refSeq + genBank assemblies: ", len(refSeqGenBankList))
+
+    refSeqGenBankSorted =  sorted(refSeqGenBankList, key=lambda x: x['sortOrder'])
+    print("# sorted refSeq + genBank assemblies: ", len(refSeqGenBankSorted))
+
 
     outFile = "genomePriority.tsv"
     fileOut = open(outFile, 'w')
 
     totalItemCount = 0
     itemCount = 0
     # Print the dbDb data
     for entry in dbDbItems:
         dbDbName = entry['name']
         if dbDbName in allPriorities:
             priority = allPriorities[dbDbName]
         else:
             print("no priority for ", dbDbName)
 
         clade = entry['clade']
 
-        descr = f"{entry['sourceName']} {clade} {entry['description']}\n"
+        descr = f"{entry['sourceName']} {entry['description']}\n"
         description = re.sub(r'\s+', ' ', descr).strip()
-        outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{description}\t1\n"
+        outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\n"
         fileOut.write(outLine)
         itemCount += 1
 
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\tdbDb count: {itemCount:4}")
 
     itemCount = 0
     # Print the GenArk data
     for entry in genArkItems:
         gcAccession = entry['gcAccession']
         if gcAccession in allPriorities:
             priority = allPriorities[gcAccession]
         else:
             print("no priority for ", gcAccession)
 
         cleanName = removeNonAlphanumeric(entry['commonName'])
         clade = entry['clade']
-        descr = f"{entry['asmName']} {clade}"
+        descr = f"{entry['asmName']}"
         description = re.sub(r'\s+', ' ', descr).strip()
-        outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{description}\t1\n"
+        outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\n"
         fileOut.write(outLine)
         itemCount += 1
 
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\tgenArk count: {itemCount:4}")
 
-    itemCount = 0
     incrementPriority = len(allPriorities)
     print("# incrementing priorities from: ", incrementPriority)
-    # Print the dbDb data
-    for entry in refSeqList:
+
+    itemCount = 0
+    # Print the refSeq data
+    for entry in refSeqGenBankSorted:
         gcAccession = entry['gcAccession']
         commonName = entry['commonName']
         scientificName = entry['scientificName']
-        descr = f"{entry['other']} {entry['clade']}"
+        clade = entry['clade']
+        descr = f"{entry['other']}"
         description = re.sub(r'\s+', ' ', descr).strip()
-        outLine = f"{entry['gcAccession']}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\n"
+        outLine = f"{entry['gcAccession']}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\n"
         fileOut.write(outLine)
         incrementPriority += 1
         itemCount += 1
 
     totalItemCount += itemCount
-    print(f"{totalItemCount:4} - total\trefSeq count: {itemCount:4}")
-
-# def removeNonAlphanumeric(s):
+    print(f"{totalItemCount:4} - total\trefSeq + genbank count: {itemCount:4}")
 
     fileOut.close()
 
 if __name__ == "__main__":
     main()
 
 """
                 "gcAccession": row[0],
                 "asmName": row[15],
                 "scientificName": row[7],
                 "commonName": row[3],
                 "taxId": row[5],
                 "clade": row[24],	# almost like GenArk clades
                 "other": asmSubmitter + " " + strain + " " + asmType,
 """