1b4ea94ed661ef98b094665947b23b55c13ac989 hiram Wed Jul 10 09:41:42 2024 -0700 one last set of dbDb to get sorted by clade properly refs #32897 diff --git src/hg/hubApi/genomePriority.py src/hg/hubApi/genomePriority.py index 8a2b13e..4f96c97 100755 --- src/hg/hubApi/genomePriority.py +++ src/hg/hubApi/genomePriority.py @@ -9,95 +9,97 @@ import requests from io import StringIO # special top priorities topPriorities = { 'hg38': 1, 'mm39': 2, 'hs1': 3, } ### key will be dbDb name, value will be priority number allPriorities = {} priorityCounter = len(topPriorities) + 1 +# key is clade, value is priority, will be initialized +# by initCladePriority() function +cladePrio = {} + +#################################################################### +### this is kinda like an environment setting, it gets everything +### into a UTF-8 reading mode #################################################################### def set_utf8_encoding(): """ Set UTF-8 encoding for stdin, stdout, and stderr in Python. """ if sys.stdout.encoding != 'utf-8': sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf-8', buffering=1) if sys.stderr.encoding != 'utf-8': sys.stderr = open(sys.stderr.fileno(), mode='w', encoding='utf-8', buffering=1) #################################################################### +### reading hgcentral.dbDb table +#################################################################### def dbDbData(): # Run the MySQL command and capture the output as bytes result = subprocess.run( ["hgsql", "-hgenome-centdb", "-N", "-e", "SELECT name,scientificName,organism,taxId,sourceName,description FROM dbDb WHERE active=1;", "hgcentral"], stdout=subprocess.PIPE, stderr=subprocess.PIPE ) if result.returncode != 0: print(f"Error executing MySQL command: {result.stderr.decode('utf-8')}") exit(1) # Decode the output from bytes to string using utf-8 return result.stdout.decode('latin-1') #################################################################### +### this clade file is setup via weekly cron jobs +### 05 00 * * 3 /hive/data/outside/ncbi/genomes/cronUpdates/ncbiRsync.sh GCA +### 05 00 * * 6 /hive/data/outside/ncbi/genomes/cronUpdates/ncbiRsync.sh GCF +#################################################################### def readAsmIdClade(): asmIdClade = {} filePath = "/hive/data/outside/ncbi/genomes/reports/newAsm/asmId.clade.tsv" with open(filePath, 'r', encoding='utf-8') as file: reader = csv.reader(file, delimiter='\t') for row in reader: asmIdClade[row[0]] = row[1] return asmIdClade #################################################################### +### this common name file is setup weekly via a cron job +#################################################################### def allCommonNames(): commonNames = {} filePath = "/hive/data/outside/ncbi/genomes/reports/allCommonNames/asmId.commonName.all.txt" with open(filePath, 'r', encoding='utf-8') as file: reader = csv.reader(file, delimiter='\t') for row in reader: asmId = row[0] gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1] commonNames[gcAcc] = row[1] return commonNames #################################################################### -def readCommonNames(ncbiType): - commonNames = {} - - filePath = "/hive/data/outside/ncbi/genomes/reports/" + ncbiType + "/allAssemblies.commonNames.tsv" - - with open(filePath, 'r', encoding='utf-8') as file: - reader = csv.reader(file, delimiter='\t') - for row in reader: - commonNames[row[1]] = row[3] - - return commonNames - -#################################################################### """ header definitions from assembly_summary_refseq.txt 1 #assembly_accession 20 ftp_path 2 bioproject 21 excluded_from_refseq 3 biosample 22 relation_to_type_material 4 wgs_master 23 asm_not_live_date 5 refseq_category 24 assembly_type 6 taxid 25 group 7 species_taxid 26 genome_size 8 organism_name 27 genome_size_ungapped 9 infraspecific_name 28 gc_percent 10 isolate 29 replicon_count 11 version_status 30 scaffold_count 12 assembly_level 31 contig_count @@ -113,229 +115,269 @@ hasn't changed. 2175 archaea 360585 bacteria 607 fungi 414 invertebrate 184 plant 96 protozoa 231 vertebrate_mammalian 405 vertebrate_other 14992 viral """ -def readAsmSummary(suffix, prioExists, comNames, asmIdClade): - # read one of the NCBI files from - # /hive/data/outside/ncbi/genomes/reports/assembly_summary_{suffix} - # Initialize a list to hold the dictionaries - dataList = [] +#################################################################### +### the various listings are ordered by these clade priorities to get +### primates first, mammals second, and so on +#################################################################### +def initCladePriority(): + global cladePrio + keys = [ "primates", "mammals", "vertebrate_mammalian", "birds", "fish", "vertebrate", "vertebrate_other", "invertebrate", "plants", "fungi", "protozoa", "viral", "bacteria", "archaea", "other", "metagenomes", +"n/a", ] values = [ '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', +'17', ] cladePrio = dict(zip(keys, values)) +#################################################################### +### given clade c, return priority from cladePrio +#################################################################### +def cladePriority(c): + global cladePrio + if c not in cladePrio: + print(f"ERROR: missing clade {c} in cladePrio") + sys.exit(255) + + return cladePrio[c] + +#################################################################### +### read one of the NCBI files from +### /hive/data/outside/ncbi/genomes/reports/assembly_summary_{suffix} +#################################################################### +def readAsmSummary(suffix, prioExists, comNames, asmIdClade): + + # Initialize a list to hold the dictionaries + dataList = [] + filePath = "/hive/data/outside/ncbi/genomes/reports/assembly_summary_" + suffix with open(filePath, 'r', encoding='utf-8') as file: reader = csv.reader(file, delimiter='\t') for row in reader: if len(row) < 1: continue if row[0].startswith('#'): continue if row[0] in prioExists: continue if len(row) != 38: print(f"ERROR: incorrect number of fields in {file}") - sys.exit(1) + sys.exit(255) gcAccession = row[0] strain = re.sub(r'breed=', '', row[8]) s0 = re.sub(r'cultivar=', '', strain) strain = re.sub(r'ecotype=', '', s0) s0 = re.sub(r'strain=', '', strain) strain = re.sub(r'na', '', s0) year = re.sub(r'/.*', '', row[14]) asmName = row[15] asmId = gcAccession + "_" + asmName asmSubmitter = row[16] asmType = row[23] -# commonName = row[7] commonName = "n/a" if gcAccession in comNames: commonName = comNames[gcAccession] clade = row[24] # almost like GenArk clades if asmId in asmIdClade: # specific GenArk clade clade = asmIdClade[asmId] if clade == "plant": clade = "plants" - if clade not in cladePrio: - print("missing clade ", clade, " in cladePrio") - sys.exit(-1) + cladeP = cladePriority(clade) + dataDict = { "gcAccession": gcAccession, "asmName": asmName, "scientificName": row[7], "commonName": commonName, "taxId": row[5], "clade": clade, "other": asmSubmitter + " " + strain + " " + asmType + " " + year, - "sortOrder": cladePrio[clade], + "sortOrder": cladeP, } utf8Encoded= {k: v.encode('utf-8', 'ignore').decode('utf-8') if isinstance(v, str) else v for k, v in dataDict.items()} # Append the dictionary to the list dataList.append(utf8Encoded) return sorted(dataList, key=lambda x: x['sortOrder']) #################################################################### ### given a URL to hgdownload file: /hubs/UCSC_GI.assemblyHubList.txt +### this sets up the GenArk listing +#################################################################### def readGenArkData(url): # Initialize a list to hold the dictionaries dataList = [] response = requests.get(url) response.raise_for_status() fileContent = response.text fileIo = StringIO(fileContent) reader = csv.reader(fileIo, delimiter='\t') + for row in reader: if row and row[0].startswith('#'): continue + clade = re.sub(r'\(L\)$', '', row[5]) + cladeP = cladePriority(clade) dataDict = { "gcAccession": row[0], "asmName": row[1], "scientificName": row[2], "commonName": row[3], "taxId": row[4], - "clade": re.sub(r'\(L\)$', '', row[5]), + "clade": clade, + "sortOrder": cladeP, } utf8Encoded= {k: v.encode('utf-8', 'ignore').decode('utf-8') if isinstance(v, str) else v for k, v in dataDict.items()} # Append the dictionary to the list dataList.append(utf8Encoded) # reset the list so that accessions such as GCF_000001405.40 # come before GCF_000001405.39 - dataList.reverse() - return dataList +# dataList.reverse() +# return dataList + return sorted(dataList, key=lambda x: x['sortOrder']) #################################################################### +### a manually maintained clade listing for UCSC dbDb assemblies +#################################################################### def dbDbCladeList(filePath): returnList = {} with open(filePath, 'r') as file: reader = csv.reader(file, delimiter='\t') for row in reader: if len(row) < 1: continue if row[0].startswith('#'): continue returnList[row[0]] = row[1] return returnList #################################################################### +### out of the genArkData list, extract the given clade +#################################################################### def extractClade(clade, genArkData): tmpList = {} for item in genArkData: if clade != item['clade']: continue tmpList[item['gcAccession']] = item['commonName'] # return sorted list on the common name, case insensitive returnList = dict(sorted(tmpList.items(), key=lambda item:item[1].lower())) return returnList #################################################################### -# Define a key function for sorting by the first word, case insensitive +### Define a key function for sorting by the first word, case insensitive +#################################################################### def getFirstWordCaseInsensitive(row): firstWord = row.split('\t')[0] # Extract the first word return firstWord.lower() # Convert to lowercase for case-insensitive sorting #################################################################### +### process the hgcentral.dbDb data into a dictionary +#################################################################### def processDbDbData(data, clades): # Initialize a list to hold the dictionaries dataList = [] # Split the data into lines (rows) rows = data.strip().split('\n') # reverse the rows so that names such as hg19 come before hg18 sortedRows = sorted(rows, key=getFirstWordCaseInsensitive, reverse=True) for row in sortedRows: # Split each row into columns columns = row.split('\t') clade = clades.get(columns[0], "n/a") + cladeP = cladePriority(clade) # corresponds with the SELECT statement # name,scientificName,organism,taxId,sourceName,description # Create a dictionary for each row dataDict = { "name": columns[0], "scientificName": columns[1], "organism": columns[2], "taxId": columns[3], "sourceName": columns[4], "description": columns[5], "clade": clade, + "sortOrder": cladeP, } utf8Encoded= {k: v.encode('utf-8', 'ignore').decode('utf-8') if isinstance(v, str) else v for k, v in dataDict.items()} # Append the dictionary to the list dataList.append(utf8Encoded) - return dataList + return sorted(dataList, key=lambda x: x['sortOrder']) #################################################################### -# Function to remove non-alphanumeric characters +### Function to remove non-alphanumeric characters +### cleans up names to make them better indexing words +#################################################################### def removeNonAlphanumeric(s): # Ensure string type if isinstance(s, str): reSub = re.sub(r'[^a-zA-Z0-9_]', ' ', s).strip() return re.sub(r'\s+', ' ', reSub) else: return s # Return as-is for non-string types #################################################################### def eliminateDupWords(s): # Split the sentence into words words = s.split() # Initialize a set to keep track of unique words encountered seenWords = set() @@ -345,30 +387,32 @@ for word in words: # Convert word to lowercase for case-insensitive comparison lowerWord = word.lower() # Check if the lowercase version of the word has been seen before if lowerWord not in seenWords: # If not seen, add it to the result list and mark as seen resultWords.append(word) seenWords.add(lowerWord) # Join the words back into a single string return ' '.join(resultWords) #################################################################### +### for the genArk set, establish some ad-hoc priorities +#################################################################### def establishPriorities(dbDb, genArk): global topPriorities global allPriorities global priorityCounter totalItemCount = 0 expectedTotal = len(dbDb) + len(genArk) print(f"### expected total: {expectedTotal:4} = {len(dbDb):4} dbDb genomes + {len(genArk):4} genArk genomes") # first priority are the specific manually selected top items itemCount = 0 for name, priority in topPriorities.items(): allPriorities[name] = priority itemCount += 1 @@ -551,92 +595,107 @@ itemCount = 0 # GCA GenBank from GenArk next priority for item in genArk: gcAccession = item['gcAccession'] if not gcAccession.startswith("GCA_"): continue if gcAccession not in allPriorities: allPriorities[gcAccession] = priorityCounter priorityCounter += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tgenArk GCA count: {itemCount:4}") itemCount = 0 + for entry in dbDb: + dbName = entry['name'] + if dbName not in allPriorities: + allPriorities[dbName] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tthe rest of dbDb count: {itemCount:4}") + +def notUsed(): + itemCount = 0 # finally the rest of the database genomes names for dbName in sorted(allDbDbNames): if dbName not in allPriorities: allPriorities[dbName] = priorityCounter priorityCounter += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tthe rest of dbDb count: {itemCount:4}") + + #################################################################### """ table load procedure: hgsql -e 'DROP TABLE IF EXISTS genomePriority;' hgcentraltest - hgsql hgcentraltest < genomePriority.sql + hgsql hgcentraltest < ../lib/genomePriority.sql hgsql -e 'LOAD DATA LOCAL INFILE "genomePriority.tsv" INTO TABLE genomePriority;' hgcentraltest hgsql -e 'ALTER TABLE genomePriority ADD FULLTEXT INDEX gdIx -(name, commonName, scientificName, description);' hgcentraltest +(name, commonName, scientificName, clade, description);' \ + hgcentraltest + """ #################################################################### #################################################################### def main(): global priorityCounter global allPriorities if len(sys.argv) != 2: print("genomePriority.py - prepare genomePriority.tsv file from") print(" dbDb.hgcentral and UCSC_GI.assemblyHubList.txt file.\n") print("Usage: genomePriority.py dbDb.name.clade.tsv\n") print("the dbDb.name.clade.tsv file is a manually curated file to relate") print(" UCSC database names to GenArk clades, in source tree hubApi/") print("This script is going to read the dbDb.hgcentral table, and the file") print(" UCSC_GI.assemblyHubList.txt from hgdownload.") print("Writing an output file genomePriority.tsv to be loaded into") print(" genomePriority.hgcentral. See notes in this script for load procedure.") - sys.exit(-1) - - dbDbNameCladeFile = sys.argv[1] + sys.exit(255) # Ensure stdout and stderr use UTF-8 encoding set_utf8_encoding() + initCladePriority() + + dbDbNameCladeFile = sys.argv[1] + # the correspondence of dbDb names to GenArk clade categories dbDbClades = dbDbCladeList(dbDbNameCladeFile) # Get the dbDb.hgcentral table data rawData = dbDbData() dbDbItems = processDbDbData(rawData, dbDbClades) # read the GenArk data from hgdownload into a list of dictionaries genArkUrl = "https://hgdownload.soe.ucsc.edu/hubs/UCSC_GI.assemblyHubList.txt" genArkItems = readGenArkData(genArkUrl) establishPriorities(dbDbItems, genArkItems) asmIdClade = readAsmIdClade() commonNames = allCommonNames() print("# all common names: ", len(commonNames)) -# genBankCommonNames = readCommonNames("genbank") -# print("# genBank common names: ", len(genBankCommonNames)) refSeqList = readAsmSummary("refseq.txt", allPriorities, commonNames, asmIdClade) print("# refSeq assemblies: ", len(refSeqList)) genBankList = readAsmSummary("genbank.txt", allPriorities, commonNames, asmIdClade) print("# genBank assemblies: ", len(genBankList)) refSeqGenBankList = refSeqList + genBankList print("# refSeq + genBank assemblies: ", len(refSeqGenBankList)) refSeqGenBankSorted = sorted(refSeqGenBankList, key=lambda x: x['sortOrder']) print("# sorted refSeq + genBank assemblies: ", len(refSeqGenBankSorted)) outFile = "genomePriority.tsv" fileOut = open(outFile, 'w') @@ -674,31 +733,31 @@ cleanName = removeNonAlphanumeric(entry['commonName']) clade = entry['clade'] descr = f"{entry['asmName']}" description = re.sub(r'\s+', ' ', descr).strip() outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\n" fileOut.write(outLine) itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tgenArk count: {itemCount:4}") incrementPriority = len(allPriorities) print("# incrementing priorities from: ", incrementPriority) itemCount = 0 - # Print the refSeq data + # Print the refSeq/genBank data for entry in refSeqGenBankSorted: gcAccession = entry['gcAccession'] commonName = entry['commonName'] scientificName = entry['scientificName'] clade = entry['clade'] descr = f"{entry['other']}" description = re.sub(r'\s+', ' ', descr).strip() outLine = f"{entry['gcAccession']}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\n" fileOut.write(outLine) incrementPriority += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\trefSeq + genbank count: {itemCount:4}")