72dbdcd6baa45774ed51ce6640d2cb19c7f5c583 hiram Thu Jul 31 15:12:01 2025 -0700 add in assembly alias names in the "description" so they can also be matched refs #33720 diff --git src/hg/hubApi/assemblyList.py src/hg/hubApi/assemblyList.py index 89ca7fa50f5..50de7c34705 100755 --- src/hg/hubApi/assemblyList.py +++ src/hg/hubApi/assemblyList.py @@ -26,30 +26,79 @@ cladePrio = {} #################################################################### ### this is kinda like an environment setting, it gets everything ### into a UTF-8 reading mode #################################################################### def set_utf8_encoding(): """ Set UTF-8 encoding for stdin, stdout, and stderr in Python. """ if sys.stdout.encoding != 'utf-8': sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf-8', buffering=1) if sys.stderr.encoding != 'utf-8': sys.stderr = open(sys.stderr.fileno(), mode='w', encoding='utf-8', buffering=1) +#################################################################### +### add aliases to description when they are not there yet +#################################################################### +def addAliases(browserName, aliasData, descr): + description = descr + if browserName in aliasData: + aliasList = aliasData[browserName] + descriptionSetLc = set(word.lower() for word in description.split()) + for alias in aliasList: + if alias.lower() not in descriptionSetLc: + descriptionSetLc.add(alias.lower()) + description += " " + alias + return description + +#################################################################### +### reading hgcentral.asmAlias table +#################################################################### +def asmAliasData(): + # Run the MySQL command and capture the output as bytes + result = subprocess.run( + ["hgsql", "-hgenome-centdb", "-N", "-e", "SELECT alias,browser FROM asmAlias;", "hgcentral"], + stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + if result.returncode != 0: + print(f"Error executing MySQL command: {result.stderr.decode('utf-8')}") + exit(1) + # Decode the output from bytes to string using utf-8 + decoded = result.stdout.decode('latin-1') + # Split the data into lines (rows) + rows = decoded.strip().split('\n') + + aliasData = {} + + aliasCount = 0 + browserCount = 0 + + for row in rows: + columns = row.split('\t') # Split each row into columns + alias = columns[0] + browser = columns[1] + if browser not in aliasData: + aliasData[browser] = [] + browserCount += 1 + aliasData[browser].append(alias) + aliasCount += 1 + + print(f"# counted {aliasCount} aliases for {browserCount} 'browsers'") + return aliasData + #################################################################### ### reading hgcentral.dbDb table #################################################################### def dbDbData(): # Run the MySQL command and capture the output as bytes result = subprocess.run( ["hgsql", "-hgenome-centdb", "-N", "-e", "SELECT name,scientificName,organism,taxId,sourceName,description FROM dbDb WHERE active=1;", "hgcentral"], stdout=subprocess.PIPE, stderr=subprocess.PIPE ) if result.returncode != 0: print(f"Error executing MySQL command: {result.stderr.decode('utf-8')}") exit(1) # Decode the output from bytes to string using utf-8 return result.stdout.decode('latin-1') @@ -510,31 +559,31 @@ mammalList = extractClade('mammals', genArk) versionScan = {} # key is dbDb name without number version extension, # value is highest version number seen for this bare # name highestVersion = {} # key is dbDb name without number version extension, # value is the full dbDb name for the highest version # of this dbDb name allDbDbNames = {} # key is the full dbDb name, value is its version itemCount = 0 # scanning the dbDb entries, figure out the highest version number # of each name for item in dbDb: dbDbName = item['name'] - splitMatch = re.match("([a-zA-Z]+)(\d+)", dbDbName) + splitMatch = re.match(r"([a-zA-Z]+)(\d+)", dbDbName) if splitMatch: allDbDbNames[dbDbName] = splitMatch.group(2) itemCount += 1 if splitMatch.group(1) in versionScan: if float(splitMatch.group(2)) > float(versionScan[splitMatch.group(1)]): versionScan[splitMatch.group(1)] = splitMatch.group(2) highestVersion[splitMatch.group(1)] = dbDbName else: versionScan[splitMatch.group(1)] = splitMatch.group(2) highestVersion[splitMatch.group(1)] = dbDbName else: print("### dbDb name does not split: ", dbDbName) allDbDbNames[dbDbName] = 0 itemCount += 1 @@ -648,31 +697,31 @@ gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1] if not gcAcc.startswith("GCA_"): continue if gcAcc not in allPriorities: allPriorities[gcAcc] = priorityCounter priorityCounter += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tgenArk GCA mammals count: {itemCount:4}") itemCount = 0 # dbDb is in cladeOrder, process in that order, find highest versions for item in dbDb: dbDbName = item['name'] if dbDbName not in allPriorities: - splitMatch = re.match("([a-zA-Z]+)(\d+)", dbDbName) + splitMatch = re.match(r"([a-zA-Z]+)(\d+)", dbDbName) if splitMatch: noVersion = splitMatch.group(1) version = allDbDbNames[dbDbName] highVersion = versionScan[noVersion] if highVersion == version: allPriorities[dbDbName] = priorityCounter priorityCounter += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tdbDb highest versions count: {itemCount:4}") itemCount = 0 # GCF RefSeq from GenArk next priority for item in genArk: gcAccession = item['gcAccession'] @@ -727,30 +776,31 @@ print(" sort -k2,2n assemblyList.tsv before table load.") sys.exit(255) # Ensure stdout and stderr use UTF-8 encoding set_utf8_encoding() initCladePriority() dbDbNameCladeFile = sys.argv[1] # the correspondence of dbDb names to GenArk clade categories dbDbClades, dbDbYears, dbDbNcbi = dbDbCladeList(dbDbNameCladeFile) # Get the dbDb.hgcentral table data rawData = dbDbData() dbDbItems = processDbDbData(rawData, dbDbClades, dbDbYears, dbDbNcbi) + aliasData = asmAliasData() # read the GenArk data from hgdownload into a list of dictionaries genArkUrl = "https://hgdownload.soe.ucsc.edu/hubs/UCSC_GI.assemblyHubList.txt" genArkItems = readGenArkData(genArkUrl) establishPriorities(dbDbItems, genArkItems) asmIdClade = readAsmIdClade() commonNames = allCommonNames() print("# all common names: ", len(commonNames)) refSeqList, refSeqYears, refSeqStatus = readAsmSummary("refseq.txt", allPriorities, commonNames, asmIdClade) print("# refSeq assemblies: ", len(refSeqList)) refSeqListHist, refSeqYearsHist, refSeqStatusHist = readAsmSummary("refseq_historical.txt", allPriorities, commonNames, asmIdClade) print("# refSeq historical assemblies: ", len(refSeqListHist)) @@ -788,86 +838,92 @@ year = entry['year'] gcAccession = entry['gcAccession'] description = entry['description'] refSeqCategory = "" versionStatus = "" assemblyLevel = "" organism = entry['organism'] if "na" not in gcAccession: if gcAccession in allStatus: stat = allStatus[gcAccession] refSeqCategory = stat['refSeqCategory'].lower() versionStatus = stat['versionStatus'].lower() assemblyLevel = stat['assemblyLevel'].lower() if gcAccession not in description: description += " " + gcAccession + ### add alias names to description if they are not already there + description = addAliases(dbDbName, aliasData, description) descr = f"{entry['sourceName']} {entry['taxId']} {description}" if year not in organism and year not in descr: descr = f"{entry['sourceName']} {entry['taxId']} {entry['description']} {year}" description = re.sub(r'\s+', ' ', descr).strip() outLine =f"{entry['name']}\t{priority}\t{organism}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\n" fileOut.write(outLine) itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tdbDb count: {itemCount:4}") itemCount = 0 # Print the GenArk data for entry in genArkItems: gcAccession = entry['gcAccession'] if gcAccession in allPriorities: priority = allPriorities[gcAccession] else: print("no priority for ", gcAccession) sys.exit(255) hubPath = genarkPath(gcAccession) commonName = entry['commonName'] clade = entry['clade'] year = entry['year'] descr = f"{entry['asmName']} {entry['taxId']}" if year not in commonName and year not in descr: descr = f"{entry['asmName']} {entry['taxId']} {year}" + ### add alias names to description if they are not already there + descr = addAliases(gcAccession, aliasData, descr) description = re.sub(r'\s+', ' ', descr).strip() refSeqCategory = entry['refSeqCategory'].lower() versionStatus = entry['versionStatus'].lower() assemblyLevel = entry['assemblyLevel'].lower() outLine = f"{entry['gcAccession']}\t{priority}\t{commonName.encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t{hubPath}\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\n" fileOut.write(outLine) itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tgenArk count: {itemCount:4}") incrementPriority = len(allPriorities) + 1 print("# incrementing priorities from: ", incrementPriority) itemCount = 0 # Print the refSeq/genBank data for entry in refSeqGenBankSorted: gcAccession = entry['gcAccession'] commonName = entry['commonName'] scientificName = entry['scientificName'] asmName = entry['asmName'] clade = entry['clade'] year = entry['year'] refSeqCategory = entry['refSeqCategory'].lower() versionStatus = entry['versionStatus'].lower() assemblyLevel = entry['assemblyLevel'].lower() descr = f"{asmName} {entry['taxId']} {entry['other']}" if year not in commonName and year not in descr: descr = f"{asmName} {entry['taxId']} {entry['other']} {year}" + ### add alias names to description if they are not already there + descr = addAliases(gcAccession, aliasData, descr) description = re.sub(r'\s+', ' ', descr).strip() outLine = f"{gcAccession}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\t\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\n" fileOut.write(outLine) incrementPriority += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\trefSeq + genbank count: {itemCount:4}") fileOut.close() if __name__ == "__main__": main()