54748ed558ead689973cb71209e4367d16f8450b hiram Thu Jun 27 23:20:53 2024 -0700 more specific sorting of primates and mammals database vs GenArk listings refs #32897 diff --git src/hg/hubApi/genomePriority.py src/hg/hubApi/genomePriority.py index fe7245a..7f22cad 100755 --- src/hg/hubApi/genomePriority.py +++ src/hg/hubApi/genomePriority.py @@ -52,101 +52,102 @@ response = requests.get(url) response.raise_for_status() fileContent = response.text fileIo = StringIO(fileContent) reader = csv.reader(fileIo, delimiter='\t') for row in reader: if row and row[0].startswith('#'): continue dataDict = { "gcAccession": row[0], "asmName": row[1], "scientificName": row[2], "commonName": row[3], "taxId": row[4], - "clade": row[5], + "clade": re.sub(r'\(L\)$', '', row[5]), } utf8Encoded= {k: v.encode('utf-8', 'ignore').decode('utf-8') if isinstance(v, str) else v for k, v in dataDict.items()} # Append the dictionary to the list dataList.append(utf8Encoded) # reset the list so that accessions such as GCF_000001405.40 # come before GCF_000001405.39 dataList.reverse() return dataList #################################################################### def dbDbCladeList(filePath): returnList = {} with open(filePath, 'r') as file: reader = csv.reader(file, delimiter='\t') for row in reader: if len(row) < 1: continue if row[0].startswith('#'): continue returnList[row[0]] = row[1] return returnList #################################################################### -def readOrderTsv(clade): - returnList = {} - - homeDir = os.environ.get('HOME') - filePath = homeDir + "/kent/src/hg/makeDb/doc/" + clade + "AsmHub/" + clade + ".orderList.tsv" +def extractClade(clade, genArkData): + tmpList = {} + for item in genArkData: + if clade != item['clade']: + continue + tmpList[item['gcAccession']] = item['commonName'] - with open(filePath, 'r') as file: - reader = csv.reader(file, delimiter='\t') - for row in reader: - returnList[row[0]] = row[1] + # return sorted list on the common name, case insensitive + returnList = dict(sorted(tmpList.items(), key=lambda item:item[1].lower())) return returnList #################################################################### # Define a key function for sorting by the first word, case insensitive def getFirstWordCaseInsensitive(row): firstWord = row.split('\t')[0] # Extract the first word return firstWord.lower() # Convert to lowercase for case-insensitive sorting #################################################################### -def processDbDbData(data): +def processDbDbData(data, clades): # Initialize a list to hold the dictionaries dataList = [] # Split the data into lines (rows) rows = data.strip().split('\n') # reverse the rows so that names such as hg19 come before hg18 sortedRows = sorted(rows, key=getFirstWordCaseInsensitive, reverse=True) for row in sortedRows: # Split each row into columns columns = row.split('\t') + clade = clades.get(columns[0], "n/a") # corresponds with the SELECT statement # name,scientificName,organism,taxId,sourceName,description # Create a dictionary for each row dataDict = { "name": columns[0], "scientificName": columns[1], "organism": columns[2], "taxId": columns[3], "sourceName": columns[4], "description": columns[5], + "clade": clade, } utf8Encoded= {k: v.encode('utf-8', 'ignore').decode('utf-8') if isinstance(v, str) else v for k, v in dataDict.items()} # Append the dictionary to the list dataList.append(utf8Encoded) return dataList #################################################################### # Function to remove non-alphanumeric characters def removeNonAlphanumeric(s): # Ensure string type if isinstance(s, str): reSub = re.sub(r'[^a-zA-Z0-9_]', ' ', s).strip() return re.sub(r'\s+', ' ', reSub) @@ -175,39 +176,41 @@ seenWords.add(lowerWord) # Join the words back into a single string return ' '.join(resultWords) #################################################################### def establishPriorities(dbDb, genArk): global topPriorities global allPriorities global priorityCounter totalItemCount = 0 expectedTotal = len(dbDb) + len(genArk) print(f"### expected total: {expectedTotal:4} = {len(dbDb):4} dbDb genomes + {len(genArk):4} genArk genomes") + + # first priority are the specific manually selected top items itemCount = 0 for name, priority in topPriorities.items(): allPriorities[name] = priority itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\ttopPriorities count: {itemCount:4}") - primateList = readOrderTsv('primates') - mammalList = readOrderTsv('mammals') + primateList = extractClade('primates', genArk) + mammalList = extractClade('mammals', genArk) versionScan = {} # key is dbDb name without number version extension, # value is highest version number seen for this bare # name highestVersion = {} # key is dbDb name without number version extension, # value is the full dbDb name for the highest version # of this dbDb name allDbDbNames = {} # key is the full dbDb name, value is its version itemCount = 0 # scanning the dbDb entries, figure out the highest version number # of each name for item in dbDb: dbDbName = item['name'] splitMatch = re.match("([a-zA-Z]+)(\d+)", dbDbName) @@ -216,61 +219,64 @@ itemCount += 1 if splitMatch.group(1) in versionScan: if splitMatch.group(2) > versionScan[splitMatch.group(1)]: versionScan[splitMatch.group(1)] = splitMatch.group(2) highestVersion[splitMatch.group(1)] = dbDbName else: versionScan[splitMatch.group(1)] = splitMatch.group(2) highestVersion[splitMatch.group(1)] = dbDbName else: print("### dbDb name does not split: ", dbDbName) allDbDbNames[dbDbName] = 0 itemCount += 1 dbDbLen = len(dbDb) + # second priority are the highest versioned database primates + # but not homo sapiens since we already have hg38 in topPriorities itemCount = 0 - # homo sapiens come before anything else - for item in dbDb: - dbDbName = item['name'] - if dbDbName not in allPriorities: - sciName = item['scientificName'] - if sciName.lower() == "homo sapiens": - allPriorities[dbDbName] = priorityCounter + sortByValue = sorted(versionScan.items(), key=lambda x: x[1], reverse=True) + for key in sortByValue: + highVersion = highestVersion[key[0]] + if highVersion not in allPriorities: + # find the element in the dbDb list that matches this highVersion name + highDict = next((d for d in dbDb if d.get('name') == highVersion), None) + if highDict['clade'] == "primates": + if highDict['scientificName'].lower() != "homo sapiens": + allPriorities[highVersion] = priorityCounter priorityCounter += 1 itemCount += 1 totalItemCount += itemCount - print(f"{totalItemCount:4} - total\tdbDb homo sapiens count: {itemCount:3}") + print(f"{totalItemCount:4} - total\tdbDb highest version primates count: {itemCount:4}") itemCount = 0 - # and now the GenArk homo sapiens should be lined up here next, - # GCF/RefSeq first + # and now the GenArk GCF/RefSeq homo sapiens should be lined up here next for item in genArk: gcAccession = item['gcAccession'] if not gcAccession.startswith("GCF_"): continue if gcAccession not in allPriorities: sciName = item['scientificName'] if sciName.lower() == "homo sapiens": allPriorities[gcAccession] = priorityCounter priorityCounter += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tgenArk GCF homo sapiens count: {itemCount:4}") itemCount = 0 - # and now the GenArk homo sapiens should be lined up here next, + # and now the GenArk GCA/GenBank homo sapiens should be lined up here next # GCA/GenBank second for item in genArk: gcAccession = item['gcAccession'] if not gcAccession.startswith("GCA_"): continue if gcAccession not in allPriorities: sciName = item['scientificName'] if sciName.lower() == "homo sapiens": allPriorities[gcAccession] = priorityCounter priorityCounter += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tgenArk GCA homo sapiens count: {itemCount:4}") itemCount = 0 @@ -287,96 +293,111 @@ print(f"{totalItemCount:4} - total\tgenArk GCF primates count: {itemCount:4}") itemCount = 0 # and the GCA/GenBank primates for asmId, commonName in primateList.items(): gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1] if not gcAcc.startswith("GCA_"): continue if gcAcc not in allPriorities: allPriorities[gcAcc] = priorityCounter priorityCounter += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tgenArk GCA primates count: {itemCount:4}") + # next are the highest versioned database mammals itemCount = 0 - # the highest versions of each unique dbDb name get the top priorities sortByValue = sorted(versionScan.items(), key=lambda x: x[1], reverse=True) for key in sortByValue: highVersion = highestVersion[key[0]] if highVersion not in allPriorities: + # find the element in the dbDb list that matches this highVersion name + highDict = next((d for d in dbDb if d.get('name') == highVersion), None) + if highDict['clade'] == "mammals": allPriorities[highVersion] = priorityCounter priorityCounter += 1 itemCount += 1 totalItemCount += itemCount - print(f"{totalItemCount:4} - total\tdbDb highest versions count: {itemCount:4}") + print(f"{totalItemCount:4} - total\tdbDb highest version mammals count: {itemCount:4}") itemCount = 0 # the mammals, GCF/RefSeq first for asmId, commonName in mammalList.items(): gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1] if not gcAcc.startswith("GCF_"): continue if gcAcc not in allPriorities: allPriorities[gcAcc] = priorityCounter priorityCounter += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tgenArk GCF mammals count: {itemCount:4}") itemCount = 0 # and the GCA/GenBank mammals for asmId, commonName in mammalList.items(): gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1] if not gcAcc.startswith("GCA_"): continue if gcAcc not in allPriorities: allPriorities[gcAcc] = priorityCounter priorityCounter += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tgenArk GCA mammals count: {itemCount:4}") itemCount = 0 + # the rest of the highest versions of each unique dbDb name + sortByValue = sorted(versionScan.items(), key=lambda x: x[1], reverse=True) + for key in sortByValue: + highVersion = highestVersion[key[0]] + if highVersion not in allPriorities: + allPriorities[highVersion] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tdbDb highest versions count: {itemCount:4}") + + itemCount = 0 # GCF RefSeq from GenArk next priority for item in genArk: gcAccession = item['gcAccession'] if not gcAccession.startswith("GCF_"): continue if gcAccession not in allPriorities: allPriorities[gcAccession] = priorityCounter priorityCounter += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tgenArk GCF count: {itemCount:4}") itemCount = 0 # GCA GenBank from GenArk next priority for item in genArk: gcAccession = item['gcAccession'] if not gcAccession.startswith("GCA_"): continue if gcAccession not in allPriorities: allPriorities[gcAccession] = priorityCounter priorityCounter += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tgenArk GCA count: {itemCount:4}") itemCount = 0 - # the rest of the names just get incrementing priorities + # finally the rest of the database genomes names for dbName in sorted(allDbDbNames): if dbName not in allPriorities: allPriorities[dbName] = priorityCounter priorityCounter += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tthe rest of dbDb count: {itemCount:4}") #################################################################### """ table load procedure: hgsql -e 'DROP TABLE IF EXISTS genomePriority;' hgcentraltest hgsql hgcentraltest < genomePriority.sql hgsql -e 'LOAD DATA LOCAL INFILE "genomePriority.tsv" INTO @@ -397,68 +418,68 @@ print(" dbDb.hgcentral and UCSC_GI.assemblyHubList.txt file.\n") print("Usage: genomePriority.py dbDb.name.clade.tsv\n") print("the dbDb.name.clade.tsv file is a manually curated file to relate") print(" UCSC database names to GenArk clades, in source tree hubApi/") print("This script is going to read the dbDb.hgcentral table, and the file") print(" UCSC_GI.assemblyHubList.txt from hgdownload.") print("Writing an output file genomePriority.tsv to be loaded into") print(" genomePriority.hgcentral. See notes in this script for load procedure.") sys.exit(-1) dbDbNameCladeFile = sys.argv[1] # Ensure stdout and stderr use UTF-8 encoding set_utf8_encoding() + # the correspondence of dbDb names to GenArk clade categories + dbDbClades = dbDbCladeList(dbDbNameCladeFile) # Get the dbDb.hgcentral table data rawData = dbDbData() - dbDbItems = processDbDbData(rawData) - # and the correspondence of dbDb names to GenArk clade categories - dbDbClades = dbDbCladeList(dbDbNameCladeFile) + dbDbItems = processDbDbData(rawData, dbDbClades) # read the GenArk data from hgdownload into a list of dictionaries genArkUrl = "https://hgdownload.soe.ucsc.edu/hubs/UCSC_GI.assemblyHubList.txt" genArkItems = readGenArkData(genArkUrl) establishPriorities(dbDbItems, genArkItems) outFile = "genomePriority.tsv" fileOut = open(outFile, 'w') itemCount = 0 # Print the dbDb data for entry in dbDbItems: dbDbName = entry['name'] if dbDbName in allPriorities: priority = allPriorities[dbDbName] else: print("no priority for ", dbDbName) - clade = dbDbClades.get(entry['name'], "n/a") + clade = entry['clade'] descr = f"{entry['sourceName']} {clade} {entry['taxId']} {entry['description']}\n" description = re.sub(r'\s+', ' ', descr).strip() outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{description}\n" fileOut.write(outLine) itemCount += 1 itemCount = 0 # Print the GenArk data for entry in genArkItems: gcAccession = entry['gcAccession'] if gcAccession in allPriorities: priority = allPriorities[gcAccession] else: print("no priority for ", gcAccession) cleanName = removeNonAlphanumeric(entry['commonName']) - clade = re.sub(r'\(L\)$', '', entry['clade']) + clade = entry['clade'] descr = f"{entry['asmName']} {clade} {entry['taxId']}\n" description = re.sub(r'\s+', ' ', descr).strip() outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{description}\n" fileOut.write(outLine) itemCount += 1 fileOut.close() if __name__ == "__main__": main()