fa3849abfc8a9e5ac4f5e44a2f8115701446ca3f hiram Thu Feb 5 15:01:40 2026 -0800 now adding haplotype CSV list when available refs #34736 diff --git src/hg/hubApi/assemblyList.py src/hg/hubApi/assemblyList.py index a21a063783b..dc73bdc62c9 100755 --- src/hg/hubApi/assemblyList.py +++ src/hg/hubApi/assemblyList.py @@ -1,25 +1,28 @@ #!/cluster/software/bin/python3 import subprocess import locale +import gzip import sys import re import os import csv import requests +from pathlib import Path from io import StringIO +from datetime import datetime, UTC # priorities derived from the 'Monthly usage stats' report # from the qateam cron job running on the first day of the month # special top priorities topPriorities = { 'hg38': 1, 'mm39': 2, 'hs1': 3, 'hg19': 4, 'mm10': 5, 'dm6': 6, 'danRer11': 7, 'mm9': 8, 'hetGla2': 9, 'rn6': 10, @@ -41,30 +44,49 @@ #################################################################### ### this is kinda like an environment setting, it gets everything ### into a UTF-8 reading mode #################################################################### def set_utf8_encoding(): """ Set UTF-8 encoding for stdin, stdout, and stderr in Python. """ if sys.stdout.encoding != 'utf-8': sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf-8', buffering=1) if sys.stderr.encoding != 'utf-8': sys.stderr = open(sys.stderr.fileno(), mode='w', encoding='utf-8', buffering=1) #################################################################### +### read in haplotype CSV listing if the file exists +#################################################################### +def readHaplotypes(filePath): + hapList = {} + hapCount = 0 + if Path(filePath).exists(): + with gzip.open(filePath, "rt") as fh: + for line in fh: + line = line.rstrip("\n") + if not line: + continue + asmId, csvHaplotypes = line.split("\t", 1) + parts = asmId.split('_', 2) + accession = '_'.join(parts[:2]) + hapList[accession] = csvHaplotypes + hapCount += 1 + print(f"# haplotype list: {hapCount}") + return hapList + ### add aliases to description when they are not there yet #################################################################### def addAliases(browserName, aliasData, descr): description = descr if browserName in aliasData: aliasList = aliasData[browserName] descriptionSetLc = set(word.lower() for word in description.split()) for alias in aliasList: if alias.lower() not in descriptionSetLc: descriptionSetLc.add(alias.lower()) description += " " + alias return description #################################################################### ### reading hgcentral.asmAlias table @@ -785,30 +807,31 @@ print(" UCSC database names to GenArk clades, in source tree hubApi/") print("This script is going to read the dbDb.hgcentral table, and the file") print(" UCSC_GI.assemblyHubList.txt from hgdownload.") print("Writing an output file assemblyList.tsv to be loaded into") print(" assemblyList.hgcentral. The output file needs to be sorted") print(" sort -k2,2n assemblyList.tsv before table load.") sys.exit(255) # Ensure stdout and stderr use UTF-8 encoding set_utf8_encoding() initCladePriority() dbDbNameCladeFile = sys.argv[1] + csvHaplotypes = readHaplotypes("/hive/data/outside/ncbi/genomes/reports/haploscan/csvHaplotypes.tsv.gz") # the correspondence of dbDb names to GenArk clade categories dbDbClades, dbDbYears, dbDbNcbi = dbDbCladeList(dbDbNameCladeFile) # Get the dbDb.hgcentral table data rawData = dbDbData() dbDbItems = processDbDbData(rawData, dbDbClades, dbDbYears, dbDbNcbi) aliasData = asmAliasData() # read the GenArk data from hgdownload into a list of dictionaries genArkUrl = "https://hgdownload.soe.ucsc.edu/hubs/UCSC_GI.assemblyHubList.txt" genArkItems = readGenArkData(genArkUrl) establishPriorities(dbDbItems, genArkItems) asmIdClade = readAsmIdClade() commonNames = allCommonNames() @@ -859,85 +882,90 @@ if "na" not in gcAccession: if gcAccession in allStatus: stat = allStatus[gcAccession] refSeqCategory = stat['refSeqCategory'].lower() versionStatus = stat['versionStatus'].lower() assemblyLevel = stat['assemblyLevel'].lower() if gcAccession not in description: description += " " + gcAccession ### add alias names to description if they are not already there description = addAliases(dbDbName, aliasData, description) descr = f"{entry['sourceName']} {entry['taxId']} {description}" if year not in organism and year not in descr: descr = f"{entry['sourceName']} {entry['taxId']} {entry['description']} {year}" description = re.sub(r'\s+', ' ', descr).strip() - outLine =f"{entry['name']}\t{priority}\t{organism}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\t\n" + haplotypes = csvHaplotypes.get(entry['name'], "") + outLine =f"{entry['name']}\t{priority}\t{organism}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\t{haplotypes}\n" fileOut.write(outLine) itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tdbDb count: {itemCount:4}") itemCount = 0 # Print the GenArk data for entry in genArkItems: gcAccession = entry['gcAccession'] if gcAccession in allPriorities: priority = allPriorities[gcAccession] else: print("no priority for ", gcAccession) sys.exit(255) hubPath = genarkPath(gcAccession) commonName = entry['commonName'] clade = entry['clade'] year = entry['year'] + if year == 0: + year = datetime.now(UTC).strftime("%Y-%m-%d") descr = f"{entry['asmName']} {entry['taxId']}" if year not in commonName and year not in descr: descr = f"{entry['asmName']} {entry['taxId']} {year}" ### add alias names to description if they are not already there descr = addAliases(gcAccession, aliasData, descr) description = re.sub(r'\s+', ' ', descr).strip() refSeqCategory = entry['refSeqCategory'].lower() versionStatus = entry['versionStatus'].lower() assemblyLevel = entry['assemblyLevel'].lower() - outLine = f"{entry['gcAccession']}\t{priority}\t{commonName.encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t{hubPath}\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\t\n" + haplotypes = csvHaplotypes.get(entry['gcAccession'], "") + outLine = f"{entry['gcAccession']}\t{priority}\t{commonName.encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t{hubPath}\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\t{haplotypes}\n" fileOut.write(outLine) itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tgenArk count: {itemCount:4}") incrementPriority = len(allPriorities) + 1 print("# incrementing priorities from: ", incrementPriority) itemCount = 0 # Print the refSeq/genBank data for entry in refSeqGenBankSorted: gcAccession = entry['gcAccession'] commonName = entry['commonName'] scientificName = entry['scientificName'] asmName = entry['asmName'] clade = entry['clade'] year = entry['year'] refSeqCategory = entry['refSeqCategory'].lower() versionStatus = entry['versionStatus'].lower() assemblyLevel = entry['assemblyLevel'].lower() descr = f"{asmName} {entry['taxId']} {entry['other']}" if year not in commonName and year not in descr: descr = f"{asmName} {entry['taxId']} {entry['other']} {year}" ### add alias names to description if they are not already there descr = addAliases(gcAccession, aliasData, descr) description = re.sub(r'\s+', ' ', descr).strip() - outLine = f"{gcAccession}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\t\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\t\n" + haplotypes = csvHaplotypes.get(entry['gcAccession'], "") + outLine = f"{gcAccession}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\t\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\t{haplotypes}\n" fileOut.write(outLine) incrementPriority += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\trefSeq + genbank count: {itemCount:4}") fileOut.close() if __name__ == "__main__": main()