src/hg/hubApi/genomePriority.py c1f3643c3e308c7819a375ee5ea4170a45c679e5

c1f3643c3e308c7819a375ee5ea4170a45c679e5
hiram
  Wed Jun 26 14:51:38 2024 -0700
clean up cruft refs #32897

diff --git src/hg/hubApi/genomePriority.py src/hg/hubApi/genomePriority.py
index 65e4830..fe7245a 100755
--- src/hg/hubApi/genomePriority.py
+++ src/hg/hubApi/genomePriority.py
@@ -173,31 +173,32 @@
             # If not seen, add it to the result list and mark as seen
             resultWords.append(word)
             seenWords.add(lowerWord)
 
     # Join the words back into a single string
     return ' '.join(resultWords)
 
 ####################################################################
 def establishPriorities(dbDb, genArk):
     global topPriorities
     global allPriorities
     global priorityCounter
 
     totalItemCount = 0
 
-    print(f"### setting priorities, {len(dbDb):4} dbDb genomes, {len(genArk):4} genArk genomes")
+    expectedTotal = len(dbDb) + len(genArk)
+    print(f"### expected total: {expectedTotal:4} = {len(dbDb):4} dbDb genomes + {len(genArk):4} genArk genomes")
     itemCount = 0
     for name, priority in topPriorities.items():
        allPriorities[name] = priority
        itemCount += 1
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\ttopPriorities count: {itemCount:4}")
 
     primateList = readOrderTsv('primates')
     mammalList = readOrderTsv('mammals')
 
     versionScan = {}	# key is dbDb name without number version extension,
                         # value is highest version number seen for this bare
                         # name
     highestVersion = {}	# key is dbDb name without number version extension,
 			# value is the full dbDb name for the highest version
@@ -291,33 +292,31 @@
         gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1]
         if not gcAcc.startswith("GCA_"):
             continue
         if gcAcc not in allPriorities:
             allPriorities[gcAcc] = priorityCounter
             priorityCounter += 1
             itemCount += 1
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\tgenArk GCA primates count: {itemCount:4}")
  
     itemCount = 0
     # the highest versions of each unique dbDb name get the top priorities
     sortByValue = sorted(versionScan.items(), key=lambda x: x[1], reverse=True)
     for key in sortByValue:
         highVersion = highestVersion[key[0]]
-#         if key[0] not in allPriorities:
         if highVersion not in allPriorities:
-#            allPriorities[key[0]] = priorityCounter
             allPriorities[highVersion] = priorityCounter
             priorityCounter += 1
             itemCount += 1
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\tdbDb highest versions count: {itemCount:4}")
 
     itemCount = 0
     # the mammals, GCF/RefSeq first
     for asmId, commonName in mammalList.items():
         gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1]
         if not gcAcc.startswith("GCF_"):
             continue
         if gcAcc not in allPriorities:
             allPriorities[gcAcc] = priorityCounter
             priorityCounter += 1
@@ -414,82 +413,52 @@
     rawData = dbDbData()
     dbDbItems = processDbDbData(rawData)
     # and the correspondence of dbDb names to GenArk clade categories
     dbDbClades = dbDbCladeList(dbDbNameCladeFile)
 
     # read the GenArk data from hgdownload into a list of dictionaries
     genArkUrl = "https://hgdownload.soe.ucsc.edu/hubs/UCSC_GI.assemblyHubList.txt"
     genArkItems = readGenArkData(genArkUrl)
 
     establishPriorities(dbDbItems, genArkItems)
 
     outFile = "genomePriority.tsv"
     fileOut = open(outFile, 'w')
 
     itemCount = 0
-    # name,scientificName,organism,taxId,sourceName,description
     # Print the dbDb data
     for entry in dbDbItems:
-        # Encode each entry value to UTF-8 before printing
         dbDbName = entry['name']
         if dbDbName in allPriorities:
             priority = allPriorities[dbDbName]
         else:
             print("no priority for ", dbDbName)
 
         clade = dbDbClades.get(entry['name'], "n/a")
 
-        indexString = entry['name']
-        indexString += " " + removeNonAlphanumeric(entry['scientificName'])
-        indexString += " " + removeNonAlphanumeric(entry['organism'])
-        indexString += " " + removeNonAlphanumeric(entry['description'])
-        indexString += " " + removeNonAlphanumeric(entry['sourceName'])
-        indexString += " " + entry['taxId']
-        noDups = eliminateDupWords(indexString)
         descr = f"{entry['sourceName']} {clade} {entry['taxId']} {entry['description']}\n"
         description = re.sub(r'\s+', ' ', descr).strip()
-#        outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{description}\t{noDups}\n"
         outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{description}\n"
         fileOut.write(outLine)
         itemCount += 1
 
     itemCount = 0
     # Print the GenArk data
     for entry in genArkItems:
         gcAccession = entry['gcAccession']
         if gcAccession in allPriorities:
             priority = allPriorities[gcAccession]
         else:
             print("no priority for ", gcAccession)
 
         cleanName = removeNonAlphanumeric(entry['commonName'])
         clade = re.sub(r'\(L\)$', '', entry['clade'])
-        indexString = entry['gcAccession']
-        indexString += " " + removeNonAlphanumeric(entry['scientificName'])
-        indexString += " " + cleanName
-        indexString += " " + clade
-        indexString += " " + removeNonAlphanumeric(entry['asmName'])
-        indexString += " " + entry['taxId']
-        noDups = eliminateDupWords(indexString)
-#        print(priority, entry['gcAccession'], entry['scientificName'], entry['commonName'], entry['taxId'], entry['asmName'], "'" + noDups + "'")
-#        print(priority, entry['gcAccession'], entry['scientificName'], entry['commonName'], entry['taxId'], "'" + noDups + "'")
-#        outLine =f"{priority}\t{entry['gcAccession']}\t{entry['scientificName']}\t{entry['taxId']}\t{entry['commonName']}\t{noDups}\n"
-#        outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{entry['asmName']}\t{noDups}\n"
         descr = f"{entry['asmName']} {clade} {entry['taxId']}\n"
         description = re.sub(r'\s+', ' ', descr).strip()
         outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{description}\n"
         fileOut.write(outLine)
         itemCount += 1
 
     fileOut.close()
 
-#        print(removeNonAlphanumeric(entry['asmName']))
-#        print(removeNonAlphanumeric(entry['commonName']))
-#        gcAccession,scientificName,commonName,taxId,asmName
-
-
-#        cleanString = {k: removeNonAlphanumeric(v) for k, v in entry.items()}
-#        print(cleanString)
-
-
 if __name__ == "__main__":
     main()