d84ccb1cf4dfe34ec488134fa7e57cd588d369e2 hiram Mon Jul 8 19:16:17 2024 -0700 this is now ready to clean up refs #32897 diff --git src/hg/hubApi/genomePriority.py src/hg/hubApi/genomePriority.py index addcb207..8a2b13e 100755 --- src/hg/hubApi/genomePriority.py +++ src/hg/hubApi/genomePriority.py @@ -33,30 +33,58 @@ #################################################################### def dbDbData(): # Run the MySQL command and capture the output as bytes result = subprocess.run( ["hgsql", "-hgenome-centdb", "-N", "-e", "SELECT name,scientificName,organism,taxId,sourceName,description FROM dbDb WHERE active=1;", "hgcentral"], stdout=subprocess.PIPE, stderr=subprocess.PIPE ) if result.returncode != 0: print(f"Error executing MySQL command: {result.stderr.decode('utf-8')}") exit(1) # Decode the output from bytes to string using utf-8 return result.stdout.decode('latin-1') #################################################################### +def readAsmIdClade(): + asmIdClade = {} + + filePath = "/hive/data/outside/ncbi/genomes/reports/newAsm/asmId.clade.tsv" + + with open(filePath, 'r', encoding='utf-8') as file: + reader = csv.reader(file, delimiter='\t') + for row in reader: + asmIdClade[row[0]] = row[1] + + return asmIdClade + +#################################################################### +def allCommonNames(): + commonNames = {} + + filePath = "/hive/data/outside/ncbi/genomes/reports/allCommonNames/asmId.commonName.all.txt" + + with open(filePath, 'r', encoding='utf-8') as file: + reader = csv.reader(file, delimiter='\t') + for row in reader: + asmId = row[0] + gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1] + commonNames[gcAcc] = row[1] + + return commonNames + +#################################################################### def readCommonNames(ncbiType): commonNames = {} filePath = "/hive/data/outside/ncbi/genomes/reports/" + ncbiType + "/allAssemblies.commonNames.tsv" with open(filePath, 'r', encoding='utf-8') as file: reader = csv.reader(file, delimiter='\t') for row in reader: commonNames[row[1]] = row[3] return commonNames #################################################################### """ header definitions from assembly_summary_refseq.txt @@ -85,95 +113,119 @@ hasn't changed. 2175 archaea 360585 bacteria 607 fungi 414 invertebrate 184 plant 96 protozoa 231 vertebrate_mammalian 405 vertebrate_other 14992 viral """ -def readAsmSummary(suffix, prioExists, comNames): +def readAsmSummary(suffix, prioExists, comNames, asmIdClade): # read one of the NCBI files from # /hive/data/outside/ncbi/genomes/reports/assembly_summary_{suffix} # Initialize a list to hold the dictionaries dataList = [] keys = [ +"primates", +"mammals", "vertebrate_mammalian", +"birds", +"fish", +"vertebrate", "vertebrate_other", "invertebrate", -"plant", +"plants", "fungi", "protozoa", "viral", "bacteria", "archaea", +"other", +"metagenomes", ] values = [ -'1', -'2', -'3', -'4', -'5', -'6', -'7', -'8', -'0', +'01', +'02', +'03', +'04', +'05', +'06', +'07', +'08', +'09', +'10', +'11', +'12', +'13', +'14', +'15', +'16', ] cladePrio = dict(zip(keys, values)) filePath = "/hive/data/outside/ncbi/genomes/reports/assembly_summary_" + suffix with open(filePath, 'r', encoding='utf-8') as file: reader = csv.reader(file, delimiter='\t') for row in reader: if len(row) < 1: continue if row[0].startswith('#'): continue if row[0] in prioExists: continue if len(row) != 38: print(f"ERROR: incorrect number of fields in {file}") sys.exit(1) gcAccession = row[0] strain = re.sub(r'breed=', '', row[8]) s0 = re.sub(r'cultivar=', '', strain) strain = re.sub(r'ecotype=', '', s0) s0 = re.sub(r'strain=', '', strain) strain = re.sub(r'na', '', s0) year = re.sub(r'/.*', '', row[14]) + asmName = row[15] + asmId = gcAccession + "_" + asmName asmSubmitter = row[16] asmType = row[23] # commonName = row[7] commonName = "n/a" if gcAccession in comNames: commonName = comNames[gcAccession] + clade = row[24] # almost like GenArk clades + if asmId in asmIdClade: # specific GenArk clade + clade = asmIdClade[asmId] + if clade == "plant": + clade = "plants" + if clade not in cladePrio: + print("missing clade ", clade, " in cladePrio") + sys.exit(-1) dataDict = { "gcAccession": gcAccession, - "asmName": row[15], + "asmName": asmName, "scientificName": row[7], "commonName": commonName, "taxId": row[5], - "clade": row[24], # almost like GenArk clades + "clade": clade, "other": asmSubmitter + " " + strain + " " + asmType + " " + year, - "sortOrder": cladePrio[row[24]] + "sortOrder": cladePrio[clade], } utf8Encoded= {k: v.encode('utf-8', 'ignore').decode('utf-8') if isinstance(v, str) else v for k, v in dataDict.items()} # Append the dictionary to the list dataList.append(utf8Encoded) return sorted(dataList, key=lambda x: x['sortOrder']) #################################################################### ### given a URL to hgdownload file: /hubs/UCSC_GI.assemblyHubList.txt def readGenArkData(url): # Initialize a list to hold the dictionaries dataList = [] response = requests.get(url) @@ -556,100 +608,111 @@ # Ensure stdout and stderr use UTF-8 encoding set_utf8_encoding() # the correspondence of dbDb names to GenArk clade categories dbDbClades = dbDbCladeList(dbDbNameCladeFile) # Get the dbDb.hgcentral table data rawData = dbDbData() dbDbItems = processDbDbData(rawData, dbDbClades) # read the GenArk data from hgdownload into a list of dictionaries genArkUrl = "https://hgdownload.soe.ucsc.edu/hubs/UCSC_GI.assemblyHubList.txt" genArkItems = readGenArkData(genArkUrl) establishPriorities(dbDbItems, genArkItems) - refSeqCommonNames = readCommonNames("refseq") - print("# refseq common names: ", len(refSeqCommonNames)) - - refSeqList = readAsmSummary("refseq.txt", allPriorities, refSeqCommonNames) + asmIdClade = readAsmIdClade() + commonNames = allCommonNames() + print("# all common names: ", len(commonNames)) +# genBankCommonNames = readCommonNames("genbank") +# print("# genBank common names: ", len(genBankCommonNames)) + refSeqList = readAsmSummary("refseq.txt", allPriorities, commonNames, asmIdClade) print("# refSeq assemblies: ", len(refSeqList)) + genBankList = readAsmSummary("genbank.txt", allPriorities, commonNames, asmIdClade) + print("# genBank assemblies: ", len(genBankList)) + + refSeqGenBankList = refSeqList + genBankList + print("# refSeq + genBank assemblies: ", len(refSeqGenBankList)) + + refSeqGenBankSorted = sorted(refSeqGenBankList, key=lambda x: x['sortOrder']) + print("# sorted refSeq + genBank assemblies: ", len(refSeqGenBankSorted)) + outFile = "genomePriority.tsv" fileOut = open(outFile, 'w') totalItemCount = 0 itemCount = 0 # Print the dbDb data for entry in dbDbItems: dbDbName = entry['name'] if dbDbName in allPriorities: priority = allPriorities[dbDbName] else: print("no priority for ", dbDbName) clade = entry['clade'] - descr = f"{entry['sourceName']} {clade} {entry['description']}\n" + descr = f"{entry['sourceName']} {entry['description']}\n" description = re.sub(r'\s+', ' ', descr).strip() - outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{description}\t1\n" + outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\n" fileOut.write(outLine) itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tdbDb count: {itemCount:4}") itemCount = 0 # Print the GenArk data for entry in genArkItems: gcAccession = entry['gcAccession'] if gcAccession in allPriorities: priority = allPriorities[gcAccession] else: print("no priority for ", gcAccession) cleanName = removeNonAlphanumeric(entry['commonName']) clade = entry['clade'] - descr = f"{entry['asmName']} {clade}" + descr = f"{entry['asmName']}" description = re.sub(r'\s+', ' ', descr).strip() - outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{description}\t1\n" + outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\n" fileOut.write(outLine) itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tgenArk count: {itemCount:4}") - itemCount = 0 incrementPriority = len(allPriorities) print("# incrementing priorities from: ", incrementPriority) - # Print the dbDb data - for entry in refSeqList: + + itemCount = 0 + # Print the refSeq data + for entry in refSeqGenBankSorted: gcAccession = entry['gcAccession'] commonName = entry['commonName'] scientificName = entry['scientificName'] - descr = f"{entry['other']} {entry['clade']}" + clade = entry['clade'] + descr = f"{entry['other']}" description = re.sub(r'\s+', ' ', descr).strip() - outLine = f"{entry['gcAccession']}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\n" + outLine = f"{entry['gcAccession']}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\n" fileOut.write(outLine) incrementPriority += 1 itemCount += 1 totalItemCount += itemCount - print(f"{totalItemCount:4} - total\trefSeq count: {itemCount:4}") - -# def removeNonAlphanumeric(s): + print(f"{totalItemCount:4} - total\trefSeq + genbank count: {itemCount:4}") fileOut.close() if __name__ == "__main__": main() """ "gcAccession": row[0], "asmName": row[15], "scientificName": row[7], "commonName": row[3], "taxId": row[5], "clade": row[24], # almost like GenArk clades "other": asmSubmitter + " " + strain + " " + asmType, """