src/hg/hubApi/genomePriority.py 2d41a88dd1243e8aa51f32e001a80a529c40ea4a

2d41a88dd1243e8aa51f32e001a80a529c40ea4a
hiram
  Thu Jul 18 13:41:34 2024 -0700
adding hubUrl to the table ready for transition to assemblyList.py refs #33720

diff --git src/hg/hubApi/genomePriority.py src/hg/hubApi/genomePriority.py
index 33de658..4d9ffec 100755
--- src/hg/hubApi/genomePriority.py
+++ src/hg/hubApi/genomePriority.py
@@ -387,30 +387,47 @@
 
     for word in words:
         # Convert word to lowercase for case-insensitive comparison
         lowerWord = word.lower()
 
         # Check if the lowercase version of the word has been seen before
         if lowerWord not in seenWords:
             # If not seen, add it to the result list and mark as seen
             resultWords.append(word)
             seenWords.add(lowerWord)
 
     # Join the words back into a single string
     return ' '.join(resultWords)
 
 ####################################################################
+### given a genark accession, return pathname to hub.txt
+###  given:   GCA_000001905.1
+###  returns: GCA/000/001/905/GCA_000001905.1/hub.txt
+####################################################################
+def genarkPath(gcAccession):
+    # Split on '_' into prefix and numeric part, e.g. 'GCA' and '000001905.1'; the two-name unpack requires exactly one '_'
+    prefix, numPart = gcAccession.split('_')
+
+    # Slice the numeric part into three-character chunks, e.g. '000', '001', '905', '.1'
+    parts = [numPart[i:i+3] for i in range(0, len(numPart), 3)]
+
+    # Only the first three chunks become directory levels; the full accession names the last directory before hub.txt
+    path = f"{prefix}/{parts[0]}/{parts[1]}/{parts[2]}/{gcAccession}/hub.txt"
+
+    return path
+
+####################################################################
 ### for the genArk set, establish some ad-hoc priorities
 ####################################################################
 def establishPriorities(dbDb, genArk):
     global topPriorities
     global allPriorities
     global priorityCounter
 
     totalItemCount = 0
 
     expectedTotal = len(dbDb) + len(genArk)
     print(f"### expected total: {expectedTotal:4} = {len(dbDb):4} dbDb genomes + {len(genArk):4} genArk genomes")
 
     # first priority are the specific manually selected top items
     itemCount = 0
     for name, priority in topPriorities.items():
@@ -695,77 +712,69 @@
     totalItemCount = 0
     itemCount = 0
     # Print the dbDb data
     for entry in dbDbItems:
         dbDbName = entry['name']
         if dbDbName in allPriorities:
             priority = allPriorities[dbDbName]
         else:
             print("no priority for ", dbDbName)
             sys.exit(255)
 
         clade = entry['clade']
 
         descr = f"{entry['sourceName']} {entry['description']}\n"
         description = re.sub(r'\s+', ' ', descr).strip()
-        outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\n"
+        outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t\n"
         fileOut.write(outLine)
         itemCount += 1
 
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\tdbDb count: {itemCount:4}")
 
     itemCount = 0
     # Print the GenArk data
     for entry in genArkItems:
         gcAccession = entry['gcAccession']
         if gcAccession in allPriorities:
             priority = allPriorities[gcAccession]
         else:
             print("no priority for ", gcAccession)
             sys.exit(255)
 
+        hubPath = genarkPath(gcAccession)
         cleanName = removeNonAlphanumeric(entry['commonName'])
         clade = entry['clade']
         descr = f"{entry['asmName']}"
         description = re.sub(r'\s+', ' ', descr).strip()
-        outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\n"
+        outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t{hubPath}\n"
         fileOut.write(outLine)
         itemCount += 1
 
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\tgenArk count: {itemCount:4}")
 
     incrementPriority = len(allPriorities) + 1
     print("# incrementing priorities from: ", incrementPriority)
 
     itemCount = 0
     # Print the refSeq/genBank data
     for entry in refSeqGenBankSorted:
         gcAccession = entry['gcAccession']
         commonName = entry['commonName']
         scientificName = entry['scientificName']
+        asmName = entry['asmName']
         clade = entry['clade']
-        descr = f"{entry['other']}"
+        descr = f"{asmName} {entry['other']}"
         description = re.sub(r'\s+', ' ', descr).strip()
-        outLine = f"{entry['gcAccession']}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\n"
+        outLine = f"{entry['gcAccession']}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\t\n"
         fileOut.write(outLine)
         incrementPriority += 1
         itemCount += 1
 
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\trefSeq + genbank count: {itemCount:4}")
 
     fileOut.close()
 
 if __name__ == "__main__":
     main()
-
-"""
-                "gcAccession": row[0],
-                "asmName": row[15],
-                "scientificName": row[7],
-                "commonName": row[3],
-                "taxId": row[5],
-                "clade": row[24],	# almost like GenArk clades
-                "other": asmSubmitter + " " + strain + " " + asmType,
-"""