7009c18457198f53021f03b5f8ad4e20f7b51baa hiram Wed Jul 10 11:04:03 2024 -0700 this is most likely complete, it does not output in priority order, outtput needs to be sorted refs #32897 diff --git src/hg/hubApi/genomePriority.py src/hg/hubApi/genomePriority.py index 4f96c97..3760c80 100755 --- src/hg/hubApi/genomePriority.py +++ src/hg/hubApi/genomePriority.py @@ -4,31 +4,31 @@ import locale import sys import re import os import csv import requests from io import StringIO # special top priorities topPriorities = { 'hg38': 1, 'mm39': 2, 'hs1': 3, } -### key will be dbDb name, value will be priority number +### key will be dbDb/GCx name, value will be priority number allPriorities = {} priorityCounter = len(topPriorities) + 1 # key is clade, value is priority, will be initialized # by initCladePriority() function cladePrio = {} #################################################################### ### this is kinda like an environment setting, it gets everything ### into a UTF-8 reading mode #################################################################### def set_utf8_encoding(): """ Set UTF-8 encoding for stdin, stdout, and stderr in Python. @@ -557,36 +557,41 @@ itemCount = 0 # and the GCA/GenBank mammals for asmId, commonName in mammalList.items(): gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1] if not gcAcc.startswith("GCA_"): continue if gcAcc not in allPriorities: allPriorities[gcAcc] = priorityCounter priorityCounter += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tgenArk GCA mammals count: {itemCount:4}") itemCount = 0 - # the rest of the highest versions of each unique dbDb name - sortByValue = sorted(versionScan.items(), key=lambda x: x[1], reverse=True) - for key in sortByValue: - highVersion = highestVersion[key[0]] - if highVersion not in allPriorities: - allPriorities[highVersion] = priorityCounter + # dbDb is in cladeOrder, process in that order, find highest versions + for item in dbDb: + dbDbName = item['name'] + if dbDbName not in allPriorities: + splitMatch = re.match("([a-zA-Z]+)(\d+)", dbDbName) + if splitMatch: + noVersion = splitMatch.group(1) + version = allDbDbNames[dbDbName] + highVersion = versionScan[noVersion] + if highVersion == version: + allPriorities[dbDbName] = priorityCounter priorityCounter += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tdbDb highest versions count: {itemCount:4}") itemCount = 0 # GCF RefSeq from GenArk next priority for item in genArk: gcAccession = item['gcAccession'] if not gcAccession.startswith("GCF_"): continue if gcAccession not in allPriorities: allPriorities[gcAccession] = priorityCounter priorityCounter += 1 itemCount += 1 @@ -618,44 +623,44 @@ def notUsed(): itemCount = 0 # finally the rest of the database genomes names for dbName in sorted(allDbDbNames): if dbName not in allPriorities: allPriorities[dbName] = priorityCounter priorityCounter += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tthe rest of dbDb count: {itemCount:4}") #################################################################### + """ table load procedure: hgsql -e 'DROP TABLE IF EXISTS genomePriority;' hgcentraltest hgsql hgcentraltest < ../lib/genomePriority.sql hgsql -e 'LOAD DATA LOCAL INFILE "genomePriority.tsv" INTO TABLE genomePriority;' hgcentraltest hgsql -e 'ALTER TABLE genomePriority ADD FULLTEXT INDEX gdIx (name, commonName, scientificName, clade, description);' \ hgcentraltest """ -#################################################################### #################################################################### def main(): global priorityCounter global allPriorities if len(sys.argv) != 2: print("genomePriority.py - prepare genomePriority.tsv file from") print(" dbDb.hgcentral and UCSC_GI.assemblyHubList.txt file.\n") print("Usage: genomePriority.py dbDb.name.clade.tsv\n") print("the dbDb.name.clade.tsv file is a manually curated file to relate") print(" UCSC database names to GenArk clades, in source tree hubApi/") print("This script is going to read the dbDb.hgcentral table, and the file") print(" UCSC_GI.assemblyHubList.txt from hgdownload.") print("Writing an output file genomePriority.tsv to be loaded into") @@ -697,50 +702,52 @@ print("# sorted refSeq + genBank assemblies: ", len(refSeqGenBankSorted)) outFile = "genomePriority.tsv" fileOut = open(outFile, 'w') totalItemCount = 0 itemCount = 0 # Print the dbDb data for entry in dbDbItems: dbDbName = entry['name'] if dbDbName in allPriorities: priority = allPriorities[dbDbName] else: print("no priority for ", dbDbName) + sys.exit(255) clade = entry['clade'] descr = f"{entry['sourceName']} {entry['description']}\n" description = re.sub(r'\s+', ' ', descr).strip() outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\n" fileOut.write(outLine) itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tdbDb count: {itemCount:4}") itemCount = 0 # Print the GenArk data for entry in genArkItems: gcAccession = entry['gcAccession'] if gcAccession in allPriorities: priority = allPriorities[gcAccession] else: print("no priority for ", gcAccession) + sys.exit(255) cleanName = removeNonAlphanumeric(entry['commonName']) clade = entry['clade'] descr = f"{entry['asmName']}" description = re.sub(r'\s+', ' ', descr).strip() outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\n" fileOut.write(outLine) itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tgenArk count: {itemCount:4}") incrementPriority = len(allPriorities) print("# incrementing priorities from: ", incrementPriority)