c1f3643c3e308c7819a375ee5ea4170a45c679e5 hiram Wed Jun 26 14:51:38 2024 -0700 clean up cruft refs #32897 diff --git src/hg/hubApi/genomePriority.py src/hg/hubApi/genomePriority.py index 65e4830..fe7245a 100755 --- src/hg/hubApi/genomePriority.py +++ src/hg/hubApi/genomePriority.py @@ -173,31 +173,32 @@ # If not seen, add it to the result list and mark as seen resultWords.append(word) seenWords.add(lowerWord) # Join the words back into a single string return ' '.join(resultWords) #################################################################### def establishPriorities(dbDb, genArk): global topPriorities global allPriorities global priorityCounter totalItemCount = 0 - print(f"### setting priorities, {len(dbDb):4} dbDb genomes, {len(genArk):4} genArk genomes") + expectedTotal = len(dbDb) + len(genArk) + print(f"### expected total: {expectedTotal:4} = {len(dbDb):4} dbDb genomes + {len(genArk):4} genArk genomes") itemCount = 0 for name, priority in topPriorities.items(): allPriorities[name] = priority itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\ttopPriorities count: {itemCount:4}") primateList = readOrderTsv('primates') mammalList = readOrderTsv('mammals') versionScan = {} # key is dbDb name without number version extension, # value is highest version number seen for this bare # name highestVersion = {} # key is dbDb name without number version extension, # value is the full dbDb name for the highest version @@ -291,33 +292,31 @@ gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1] if not gcAcc.startswith("GCA_"): continue if gcAcc not in allPriorities: allPriorities[gcAcc] = priorityCounter priorityCounter += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tgenArk GCA primates count: {itemCount:4}") itemCount = 0 # the highest versions of each unique dbDb name get the top priorities sortByValue = sorted(versionScan.items(), key=lambda x: x[1], reverse=True) for key in sortByValue: highVersion = highestVersion[key[0]] -# if key[0] not in allPriorities: if highVersion not in allPriorities: -# allPriorities[key[0]] = priorityCounter allPriorities[highVersion] = priorityCounter priorityCounter += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tdbDb highest versions count: {itemCount:4}") itemCount = 0 # the mammals, GCF/RefSeq first for asmId, commonName in mammalList.items(): gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1] if not gcAcc.startswith("GCF_"): continue if gcAcc not in allPriorities: allPriorities[gcAcc] = priorityCounter priorityCounter += 1 @@ -414,82 +413,52 @@ rawData = dbDbData() dbDbItems = processDbDbData(rawData) # and the correspondence of dbDb names to GenArk clade categories dbDbClades = dbDbCladeList(dbDbNameCladeFile) # read the GenArk data from hgdownload into a list of dictionaries genArkUrl = "https://hgdownload.soe.ucsc.edu/hubs/UCSC_GI.assemblyHubList.txt" genArkItems = readGenArkData(genArkUrl) establishPriorities(dbDbItems, genArkItems) outFile = "genomePriority.tsv" fileOut = open(outFile, 'w') itemCount = 0 - # name,scientificName,organism,taxId,sourceName,description # Print the dbDb data for entry in dbDbItems: - # Encode each entry value to UTF-8 before printing dbDbName = entry['name'] if dbDbName in allPriorities: priority = allPriorities[dbDbName] else: print("no priority for ", dbDbName) clade = dbDbClades.get(entry['name'], "n/a") - indexString = entry['name'] - indexString += " " + removeNonAlphanumeric(entry['scientificName']) - indexString += " " + removeNonAlphanumeric(entry['organism']) - indexString += " " + removeNonAlphanumeric(entry['description']) - indexString += " " + removeNonAlphanumeric(entry['sourceName']) - indexString += " " + entry['taxId'] - noDups = eliminateDupWords(indexString) descr = f"{entry['sourceName']} {clade} {entry['taxId']} {entry['description']}\n" description = re.sub(r'\s+', ' ', descr).strip() -# outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{description}\t{noDups}\n" outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{description}\n" fileOut.write(outLine) itemCount += 1 itemCount = 0 # Print the GenArk data for entry in genArkItems: gcAccession = entry['gcAccession'] if gcAccession in allPriorities: priority = allPriorities[gcAccession] else: print("no priority for ", gcAccession) cleanName = removeNonAlphanumeric(entry['commonName']) clade = re.sub(r'\(L\)$', '', entry['clade']) - indexString = entry['gcAccession'] - indexString += " " + removeNonAlphanumeric(entry['scientificName']) - indexString += " " + cleanName - indexString += " " + clade - indexString += " " + removeNonAlphanumeric(entry['asmName']) - indexString += " " + entry['taxId'] - noDups = eliminateDupWords(indexString) -# print(priority, entry['gcAccession'], entry['scientificName'], entry['commonName'], entry['taxId'], entry['asmName'], "'" + noDups + "'") -# print(priority, entry['gcAccession'], entry['scientificName'], entry['commonName'], entry['taxId'], "'" + noDups + "'") -# outLine =f"{priority}\t{entry['gcAccession']}\t{entry['scientificName']}\t{entry['taxId']}\t{entry['commonName']}\t{noDups}\n" -# outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{entry['asmName']}\t{noDups}\n" descr = f"{entry['asmName']} {clade} {entry['taxId']}\n" description = re.sub(r'\s+', ' ', descr).strip() outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{description}\n" fileOut.write(outLine) itemCount += 1 fileOut.close() -# print(removeNonAlphanumeric(entry['asmName'])) -# print(removeNonAlphanumeric(entry['commonName'])) -# gcAccession,scientificName,commonName,taxId,asmName - - -# cleanString = {k: removeNonAlphanumeric(v) for k, v in entry.items()} -# print(cleanString) - - if __name__ == "__main__": main()