733bfccc6366eca9cdb40718caff16406b38efee hiram Wed Jun 26 14:41:14 2024 -0700 creating file to load into genomePriority.hgcentral for search purpose refs #32897 diff --git src/hg/hubApi/genomePriority.py src/hg/hubApi/genomePriority.py new file mode 100755 index 0000000..65e4830 --- /dev/null +++ src/hg/hubApi/genomePriority.py @@ -0,0 +1,495 @@ +#!/cluster/software/bin/python3 + +import subprocess +import locale +import sys +import re +import os +import csv +import requests +from io import StringIO + +# special top priorities +topPriorities = { + 'hg38': 1, + 'mm39': 2, + 'hs1': 3, +} + +### key will be dbDb name, value will be priority number +allPriorities = {} + +priorityCounter = len(topPriorities) + 1 + +#################################################################### +def set_utf8_encoding(): + """ + Set UTF-8 encoding for stdin, stdout, and stderr in Python. + """ + if sys.stdout.encoding != 'utf-8': + sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf-8', buffering=1) + if sys.stderr.encoding != 'utf-8': + sys.stderr = open(sys.stderr.fileno(), mode='w', encoding='utf-8', buffering=1) + +#################################################################### +def dbDbData(): + # Run the MySQL command and capture the output as bytes + result = subprocess.run( + ["hgsql", "-hgenome-centdb", "-N", "-e", "SELECT name,scientificName,organism,taxId,sourceName,description FROM dbDb WHERE active=1;", "hgcentral"], + stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + if result.returncode != 0: + print(f"Error executing MySQL command: {result.stderr.decode('utf-8')}") + exit(1) + # Decode the output from bytes to string using utf-8 + return result.stdout.decode('latin-1') + +#################################################################### +### given a URL to hgdownload file: /hubs/UCSC_GI.assemblyHubList.txt +def readGenArkData(url): + # Initialize a list to hold the dictionaries + dataList = [] + + response = requests.get(url) + response.raise_for_status() + fileContent = response.text + fileIo = StringIO(fileContent) + reader = csv.reader(fileIo, delimiter='\t') + for row in reader: + if row and row[0].startswith('#'): + continue + dataDict = { + "gcAccession": row[0], + "asmName": row[1], + "scientificName": row[2], + "commonName": row[3], + "taxId": row[4], + "clade": row[5], + } + + utf8Encoded= {k: v.encode('utf-8', 'ignore').decode('utf-8') if isinstance(v, str) else v for k, v in dataDict.items()} + # Append the dictionary to the list + dataList.append(utf8Encoded) + + # reset the list so that accessions such as GCF_000001405.40 + # come before GCF_000001405.39 + dataList.reverse() + return dataList + +#################################################################### +def dbDbCladeList(filePath): + returnList = {} + + with open(filePath, 'r') as file: + reader = csv.reader(file, delimiter='\t') + for row in reader: + if len(row) < 1: + continue + if row[0].startswith('#'): + continue + returnList[row[0]] = row[1] + + return returnList + +#################################################################### +def readOrderTsv(clade): + returnList = {} + + homeDir = os.environ.get('HOME') + filePath = homeDir + "/kent/src/hg/makeDb/doc/" + clade + "AsmHub/" + clade + ".orderList.tsv" + + with open(filePath, 'r') as file: + reader = csv.reader(file, delimiter='\t') + for row in reader: + returnList[row[0]] = row[1] + + return returnList + +#################################################################### +# Define a key function for sorting by the first word, case insensitive +def getFirstWordCaseInsensitive(row): + firstWord = row.split('\t')[0] # Extract the first word + return firstWord.lower() # Convert to lowercase for case-insensitive sorting + +#################################################################### +def processDbDbData(data): + # Initialize a list to hold the dictionaries + dataList = [] + + # Split the data into lines (rows) + rows = data.strip().split('\n') + # reverse the rows so that names such as hg19 come before hg18 + sortedRows = sorted(rows, key=getFirstWordCaseInsensitive, reverse=True) + + for row in sortedRows: + # Split each row into columns + columns = row.split('\t') + + # corresponds with the SELECT statement + # name,scientificName,organism,taxId,sourceName,description + # Create a dictionary for each row + dataDict = { + "name": columns[0], + "scientificName": columns[1], + "organism": columns[2], + "taxId": columns[3], + "sourceName": columns[4], + "description": columns[5], + } + + utf8Encoded= {k: v.encode('utf-8', 'ignore').decode('utf-8') if isinstance(v, str) else v for k, v in dataDict.items()} + # Append the dictionary to the list + dataList.append(utf8Encoded) + + return dataList + +#################################################################### +# Function to remove non-alphanumeric characters +def removeNonAlphanumeric(s): + # Ensure string type + if isinstance(s, str): + reSub = re.sub(r'[^a-zA-Z0-9_]', ' ', s).strip() + return re.sub(r'\s+', ' ', reSub) + else: + return s # Return as-is for non-string types + +#################################################################### +def eliminateDupWords(s): + # Split the sentence into words + words = s.split() + + # Initialize a set to keep track of unique words encountered + seenWords = set() + + # List to store the words with duplicates removed + resultWords = [] + + for word in words: + # Convert word to lowercase for case-insensitive comparison + lowerWord = word.lower() + + # Check if the lowercase version of the word has been seen before + if lowerWord not in seenWords: + # If not seen, add it to the result list and mark as seen + resultWords.append(word) + seenWords.add(lowerWord) + + # Join the words back into a single string + return ' '.join(resultWords) + +#################################################################### +def establishPriorities(dbDb, genArk): + global topPriorities + global allPriorities + global priorityCounter + + totalItemCount = 0 + + print(f"### setting priorities, {len(dbDb):4} dbDb genomes, {len(genArk):4} genArk genomes") + itemCount = 0 + for name, priority in topPriorities.items(): + allPriorities[name] = priority + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\ttopPriorities count: {itemCount:4}") + + primateList = readOrderTsv('primates') + mammalList = readOrderTsv('mammals') + + versionScan = {} # key is dbDb name without number version extension, + # value is highest version number seen for this bare + # name + highestVersion = {} # key is dbDb name without number version extension, + # value is the full dbDb name for the highest version + # of this dbDb name + allDbDbNames = {} # key is the full dbDb name, value is its version + + itemCount = 0 + # scanning the dbDb entries, figure out the highest version number + # of each name + for item in dbDb: + dbDbName = item['name'] + splitMatch = re.match("([a-zA-Z]+)(\d+)", dbDbName) + if splitMatch: + allDbDbNames[dbDbName] = splitMatch.group(2) + itemCount += 1 + if splitMatch.group(1) in versionScan: + if splitMatch.group(2) > versionScan[splitMatch.group(1)]: + versionScan[splitMatch.group(1)] = splitMatch.group(2) + highestVersion[splitMatch.group(1)] = dbDbName + else: + versionScan[splitMatch.group(1)] = splitMatch.group(2) + highestVersion[splitMatch.group(1)] = dbDbName + else: + print("### dbDb name does not split: ", dbDbName) + allDbDbNames[dbDbName] = 0 + itemCount += 1 + + dbDbLen = len(dbDb) + + itemCount = 0 + # homo sapiens come before anything else + for item in dbDb: + dbDbName = item['name'] + if dbDbName not in allPriorities: + sciName = item['scientificName'] + if sciName.lower() == "homo sapiens": + allPriorities[dbDbName] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tdbDb homo sapiens count: {itemCount:3}") + + itemCount = 0 + # and now the GenArk homo sapiens should be lined up here next, + # GCF/RefSeq first + for item in genArk: + gcAccession = item['gcAccession'] + if not gcAccession.startswith("GCF_"): + continue + if gcAccession not in allPriorities: + sciName = item['scientificName'] + if sciName.lower() == "homo sapiens": + allPriorities[gcAccession] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tgenArk GCF homo sapiens count: {itemCount:4}") + + itemCount = 0 + # and now the GenArk homo sapiens should be lined up here next, + # GCA/GenBank second + for item in genArk: + gcAccession = item['gcAccession'] + if not gcAccession.startswith("GCA_"): + continue + if gcAccession not in allPriorities: + sciName = item['scientificName'] + if sciName.lower() == "homo sapiens": + allPriorities[gcAccession] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tgenArk GCA homo sapiens count: {itemCount:4}") + + itemCount = 0 + # the primates, GCF/RefSeq first + for asmId, commonName in primateList.items(): + gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1] + if not gcAcc.startswith("GCF_"): + continue + if gcAcc not in allPriorities: + allPriorities[gcAcc] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tgenArk GCF primates count: {itemCount:4}") + + itemCount = 0 + # and the GCA/GenBank primates + for asmId, commonName in primateList.items(): + gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1] + if not gcAcc.startswith("GCA_"): + continue + if gcAcc not in allPriorities: + allPriorities[gcAcc] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tgenArk GCA primates count: {itemCount:4}") + + itemCount = 0 + # the highest versions of each unique dbDb name get the top priorities + sortByValue = sorted(versionScan.items(), key=lambda x: x[1], reverse=True) + for key in sortByValue: + highVersion = highestVersion[key[0]] +# if key[0] not in allPriorities: + if highVersion not in allPriorities: +# allPriorities[key[0]] = priorityCounter + allPriorities[highVersion] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tdbDb highest versions count: {itemCount:4}") + + itemCount = 0 + # the mammals, GCF/RefSeq first + for asmId, commonName in mammalList.items(): + gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1] + if not gcAcc.startswith("GCF_"): + continue + if gcAcc not in allPriorities: + allPriorities[gcAcc] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tgenArk GCF mammals count: {itemCount:4}") + + itemCount = 0 + # and the GCA/GenBank mammals + for asmId, commonName in mammalList.items(): + gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1] + if not gcAcc.startswith("GCA_"): + continue + if gcAcc not in allPriorities: + allPriorities[gcAcc] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tgenArk GCA mammals count: {itemCount:4}") + + itemCount = 0 + # GCF RefSeq from GenArk next priority + for item in genArk: + gcAccession = item['gcAccession'] + if not gcAccession.startswith("GCF_"): + continue + if gcAccession not in allPriorities: + allPriorities[gcAccession] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tgenArk GCF count: {itemCount:4}") + + itemCount = 0 + # GCA GenBank from GenArk next priority + for item in genArk: + gcAccession = item['gcAccession'] + if not gcAccession.startswith("GCA_"): + continue + if gcAccession not in allPriorities: + allPriorities[gcAccession] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tgenArk GCA count: {itemCount:4}") + + itemCount = 0 + # the rest of the names just get incrementing priorities + for dbName in sorted(allDbDbNames): + if dbName not in allPriorities: + allPriorities[dbName] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tthe rest of dbDb count: {itemCount:4}") + +#################################################################### +""" +table load procedure: + + hgsql -e 'DROP TABLE IF EXISTS genomePriority;' hgcentraltest + hgsql hgcentraltest < genomePriority.sql + hgsql -e 'LOAD DATA LOCAL INFILE "genomePriority.tsv" INTO +TABLE genomePriority;' hgcentraltest + hgsql -e 'ALTER TABLE genomePriority +ADD FULLTEXT INDEX gdIx +(name, commonName, scientificName, description);' hgcentraltest +""" +#################################################################### + +#################################################################### +def main(): + global priorityCounter + global allPriorities + + if len(sys.argv) != 2: + print("genomePriority.py - prepare genomePriority.tsv file from") + print(" dbDb.hgcentral and UCSC_GI.assemblyHubList.txt file.\n") + print("Usage: genomePriority.py dbDb.name.clade.tsv\n") + print("the dbDb.name.clade.tsv file is a manually curated file to relate") + print(" UCSC database names to GenArk clades, in source tree hubApi/") + print("This script is going to read the dbDb.hgcentral table, and the file") + print(" UCSC_GI.assemblyHubList.txt from hgdownload.") + print("Writing an output file genomePriority.tsv to be loaded into") + print(" genomePriority.hgcentral. See notes in this script for load procedure.") + sys.exit(-1) + + dbDbNameCladeFile = sys.argv[1] + + # Ensure stdout and stderr use UTF-8 encoding + set_utf8_encoding() + + # Get the dbDb.hgcentral table data + rawData = dbDbData() + dbDbItems = processDbDbData(rawData) + # and the correspondence of dbDb names to GenArk clade categories + dbDbClades = dbDbCladeList(dbDbNameCladeFile) + + # read the GenArk data from hgdownload into a list of dictionaries + genArkUrl = "https://hgdownload.soe.ucsc.edu/hubs/UCSC_GI.assemblyHubList.txt" + genArkItems = readGenArkData(genArkUrl) + + establishPriorities(dbDbItems, genArkItems) + + outFile = "genomePriority.tsv" + fileOut = open(outFile, 'w') + + itemCount = 0 + # name,scientificName,organism,taxId,sourceName,description + # Print the dbDb data + for entry in dbDbItems: + # Encode each entry value to UTF-8 before printing + dbDbName = entry['name'] + if dbDbName in allPriorities: + priority = allPriorities[dbDbName] + else: + print("no priority for ", dbDbName) + + clade = dbDbClades.get(entry['name'], "n/a") + + indexString = entry['name'] + indexString += " " + removeNonAlphanumeric(entry['scientificName']) + indexString += " " + removeNonAlphanumeric(entry['organism']) + indexString += " " + removeNonAlphanumeric(entry['description']) + indexString += " " + removeNonAlphanumeric(entry['sourceName']) + indexString += " " + entry['taxId'] + noDups = eliminateDupWords(indexString) + descr = f"{entry['sourceName']} {clade} {entry['taxId']} {entry['description']}\n" + description = re.sub(r'\s+', ' ', descr).strip() +# outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{description}\t{noDups}\n" + outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{description}\n" + fileOut.write(outLine) + itemCount += 1 + + itemCount = 0 + # Print the GenArk data + for entry in genArkItems: + gcAccession = entry['gcAccession'] + if gcAccession in allPriorities: + priority = allPriorities[gcAccession] + else: + print("no priority for ", gcAccession) + + cleanName = removeNonAlphanumeric(entry['commonName']) + clade = re.sub(r'\(L\)$', '', entry['clade']) + indexString = entry['gcAccession'] + indexString += " " + removeNonAlphanumeric(entry['scientificName']) + indexString += " " + cleanName + indexString += " " + clade + indexString += " " + removeNonAlphanumeric(entry['asmName']) + indexString += " " + entry['taxId'] + noDups = eliminateDupWords(indexString) +# print(priority, entry['gcAccession'], entry['scientificName'], entry['commonName'], entry['taxId'], entry['asmName'], "'" + noDups + "'") +# print(priority, entry['gcAccession'], entry['scientificName'], entry['commonName'], entry['taxId'], "'" + noDups + "'") +# outLine =f"{priority}\t{entry['gcAccession']}\t{entry['scientificName']}\t{entry['taxId']}\t{entry['commonName']}\t{noDups}\n" +# outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{entry['asmName']}\t{noDups}\n" + descr = f"{entry['asmName']} {clade} {entry['taxId']}\n" + description = re.sub(r'\s+', ' ', descr).strip() + outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{description}\n" + fileOut.write(outLine) + itemCount += 1 + + fileOut.close() + +# print(removeNonAlphanumeric(entry['asmName'])) +# print(removeNonAlphanumeric(entry['commonName'])) +# gcAccession,scientificName,commonName,taxId,asmName + + +# cleanString = {k: removeNonAlphanumeric(v) for k, v in entry.items()} +# print(cleanString) + + +if __name__ == "__main__": + main()