####################################################################
### given a genark accession, return pathname to hub.txt
###    given: GCA_000001905.1
###  returns: GCA/000/001/905/GCA_000001905.1/hub.txt
###   raises: ValueError when the accession is not shaped like
###           '<prefix>_<nine digits>[.version]'
####################################################################
def genarkPath(gcAccession):
    """Return the relative hub.txt path for an assembly accession.

    The accession is split at its single underscore into a prefix and a
    numeric part; the first nine digits of the numeric part become three
    3-digit directory levels, followed by the full accession and 'hub.txt'.

    Args:
        gcAccession: accession string such as 'GCA_000001905.1'.

    Returns:
        A path string such as 'GCA/000/001/905/GCA_000001905.1/hub.txt'.

    Raises:
        ValueError: if the accession has no prefix, does not contain exactly
            one underscore, or does not have nine decimal digits directly
            after the underscore.
    """
    prefix, sep, numPart = gcAccession.partition('_')

    # Only the first nine characters feed the directory levels; whatever
    # follows (for example the '.1' version suffix) is kept solely as part
    # of the accession directory name.
    digits = numPart[:9]

    wellFormed = (
        bool(prefix)
        and bool(sep)
        and '_' not in numPart
        and len(digits) == 9
        and set(digits) <= set("0123456789")
    )
    if not wellFormed:
        # Fail loudly here rather than emit a bogus path into the table.
        raise ValueError(f"malformed assembly accession: {gcAccession!r}")

    return f"{prefix}/{digits[0:3]}/{digits[3:6]}/{digits[6:9]}/{gcAccession}/hub.txt"
totalItemCount = 0 itemCount = 0 # Print the dbDb data for entry in dbDbItems: dbDbName = entry['name'] if dbDbName in allPriorities: priority = allPriorities[dbDbName] else: print("no priority for ", dbDbName) sys.exit(255) clade = entry['clade'] descr = f"{entry['sourceName']} {entry['description']}\n" description = re.sub(r'\s+', ' ', descr).strip() - outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\n" + outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t\n" fileOut.write(outLine) itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tdbDb count: {itemCount:4}") itemCount = 0 # Print the GenArk data for entry in genArkItems: gcAccession = entry['gcAccession'] if gcAccession in allPriorities: priority = allPriorities[gcAccession] else: print("no priority for ", gcAccession) sys.exit(255) + hubPath = genarkPath(gcAccession) cleanName = removeNonAlphanumeric(entry['commonName']) clade = entry['clade'] descr = f"{entry['asmName']}" description = re.sub(r'\s+', ' ', descr).strip() - outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\n" + outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t{hubPath}\n" fileOut.write(outLine) itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tgenArk count: {itemCount:4}") incrementPriority = len(allPriorities) + 1 print("# incrementing priorities from: ", incrementPriority) itemCount = 0 # Print the refSeq/genBank data for entry in refSeqGenBankSorted: gcAccession = entry['gcAccession'] commonName = entry['commonName'] scientificName = entry['scientificName'] + 
asmName = entry['asmName'] clade = entry['clade'] - descr = f"{entry['other']}" + descr = f"{asmName} {entry['other']}" description = re.sub(r'\s+', ' ', descr).strip() - outLine = f"{entry['gcAccession']}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\n" + outLine = f"{entry['gcAccession']}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\t\n" fileOut.write(outLine) incrementPriority += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\trefSeq + genbank count: {itemCount:4}") fileOut.close() if __name__ == "__main__": main() - -""" - "gcAccession": row[0], - "asmName": row[15], - "scientificName": row[7], - "commonName": row[3], - "taxId": row[5], - "clade": row[24], # almost like GenArk clades - "other": asmSubmitter + " " + strain + " " + asmType, -"""