455920818135586578c0382cf43a165aaafdcb8e hiram Fri Jul 19 11:09:42 2024 -0700 transition the prototype genomePriority.py to this production script to be used by otto cronjob refs #33720 diff --git src/hg/hubApi/assemblyList.py src/hg/hubApi/assemblyList.py new file mode 100755 index 0000000..6a547ad --- /dev/null +++ src/hg/hubApi/assemblyList.py @@ -0,0 +1,766 @@ +#!/cluster/software/bin/python3 + +import subprocess +import locale +import sys +import re +import os +import csv +import requests +from io import StringIO + +# special top priorities +topPriorities = { + 'hg38': 1, + 'mm39': 2, + 'hs1': 3, +} + +### key will be dbDb/GCx name, value will be priority number +allPriorities = {} + +priorityCounter = len(topPriorities) + 1 + +# key is clade, value is priority, will be initialized +# by initCladePriority() function +cladePrio = {} + +#################################################################### +### this is kinda like an environment setting, it gets everything +### into a UTF-8 reading mode +#################################################################### +def set_utf8_encoding(): + """ + Set UTF-8 encoding for stdin, stdout, and stderr in Python. + """ + if sys.stdout.encoding != 'utf-8': + sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf-8', buffering=1) + if sys.stderr.encoding != 'utf-8': + sys.stderr = open(sys.stderr.fileno(), mode='w', encoding='utf-8', buffering=1) + +#################################################################### +### reading hgcentral.dbDb table +#################################################################### +def dbDbData(): + # Run the MySQL command and capture the output as bytes + result = subprocess.run( + ["hgsql", "-hgenome-centdb", "-N", "-e", "SELECT name,scientificName,organism,taxId,sourceName,description FROM dbDb WHERE active=1;", "hgcentral"], + stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + if result.returncode != 0: + print(f"Error executing MySQL command: {result.stderr.decode('utf-8')}") + exit(1) + # Decode the output from bytes to string using utf-8 + return result.stdout.decode('latin-1') + +#################################################################### +### this clade file is setup via weekly cron jobs +### 05 00 * * 3 /hive/data/outside/ncbi/genomes/cronUpdates/ncbiRsync.sh GCA +### 05 00 * * 6 /hive/data/outside/ncbi/genomes/cronUpdates/ncbiRsync.sh GCF +#################################################################### +def readAsmIdClade(): + asmIdClade = {} + + filePath = "/hive/data/outside/ncbi/genomes/reports/newAsm/asmId.clade.tsv" + + with open(filePath, 'r', encoding='utf-8') as file: + reader = csv.reader(file, delimiter='\t') + for row in reader: + asmIdClade[row[0]] = row[1] + + return asmIdClade + +#################################################################### +### this common name file is setup weekly via a cron job +#################################################################### +def allCommonNames(): + commonNames = {} + + filePath = "/hive/data/outside/ncbi/genomes/reports/allCommonNames/asmId.commonName.all.txt" + + with open(filePath, 'r', encoding='utf-8') as file: + reader = csv.reader(file, delimiter='\t') + for row in reader: + asmId = row[0] + gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1] + commonNames[gcAcc] = row[1] + + return commonNames + +#################################################################### +""" + header definitions from assembly_summary_refseq.txt + + 1 #assembly_accession 20 ftp_path + 2 bioproject 21 excluded_from_refseq + 3 biosample 22 relation_to_type_material + 4 wgs_master 23 asm_not_live_date + 5 refseq_category 24 assembly_type + 6 taxid 25 group + 7 species_taxid 26 genome_size + 8 organism_name 27 genome_size_ungapped + 9 infraspecific_name 28 gc_percent + 10 isolate 29 replicon_count + 11 version_status 30 scaffold_count + 12 assembly_level 31 contig_count + 13 release_type 32 annotation_provider + 14 genome_rep 33 annotation_name + 15 seq_rel_date 34 annotation_date + 16 asm_name 35 total_gene_count + 17 asm_submitter 36 protein_coding_gene_count + 18 gbrs_paired_asm 37 non_coding_gene_count + 19 paired_asm_comp 38 pubmed_id + +Would be good to verify this in the readAsmSummary to make sure it +hasn't changed. + + 2175 archaea + 360585 bacteria + 607 fungi + 414 invertebrate + 184 plant + 96 protozoa + 231 vertebrate_mammalian + 405 vertebrate_other + 14992 viral + + +""" + +#################################################################### +### the various listings are ordered by these clade priorities to get +### primates first, mammals second, and so on +#################################################################### +def initCladePriority(): + global cladePrio + + keys = [ +"primates", +"mammals", +"vertebrate_mammalian", +"birds", +"fish", +"vertebrate", +"vertebrate_other", +"invertebrate", +"plants", +"fungi", +"protozoa", +"viral", +"bacteria", +"archaea", +"other", +"metagenomes", +"n/a", + ] + values = [ +'01', +'02', +'03', +'04', +'05', +'06', +'07', +'08', +'09', +'10', +'11', +'12', +'13', +'14', +'15', +'16', +'17', + ] + cladePrio = dict(zip(keys, values)) + +#################################################################### +### given clade c, return priority from cladePrio +#################################################################### +def cladePriority(c): + global cladePrio + if c not in cladePrio: + print(f"ERROR: missing clade {c} in cladePrio") + sys.exit(255) + + return cladePrio[c] + +#################################################################### +### read one of the NCBI files from +### /hive/data/outside/ncbi/genomes/reports/assembly_summary_{suffix} +#################################################################### +def readAsmSummary(suffix, prioExists, comNames, asmIdClade): + + # Initialize a list to hold the dictionaries + dataList = [] + + filePath = "/hive/data/outside/ncbi/genomes/reports/assembly_summary_" + suffix + + with open(filePath, 'r', encoding='utf-8') as file: + reader = csv.reader(file, delimiter='\t') + for row in reader: + if len(row) < 1: + continue + if row[0].startswith('#'): + continue + if row[0] in prioExists: + continue + if len(row) != 38: + print(f"ERROR: incorrect number of fields in {file}") + sys.exit(255) + gcAccession = row[0] + strain = re.sub(r'breed=', '', row[8]) + s0 = re.sub(r'cultivar=', '', strain) + strain = re.sub(r'ecotype=', '', s0) + s0 = re.sub(r'strain=', '', strain) + strain = re.sub(r'na', '', s0) + year = re.sub(r'/.*', '', row[14]) + asmName = row[15] + asmId = gcAccession + "_" + asmName + asmSubmitter = row[16] + asmType = row[23] + commonName = "n/a" + if gcAccession in comNames: + commonName = comNames[gcAccession] + clade = row[24] # almost like GenArk clades + if asmId in asmIdClade: # specific GenArk clade + clade = asmIdClade[asmId] + if clade == "plant": + clade = "plants" + cladeP = cladePriority(clade) + + dataDict = { + "gcAccession": gcAccession, + "asmName": asmName, + "scientificName": row[7], + "commonName": commonName, + "taxId": row[5], + "clade": clade, + "other": asmSubmitter + " " + strain + " " + asmType + " " + year, + "sortOrder": cladeP, + } + + utf8Encoded= {k: v.encode('utf-8', 'ignore').decode('utf-8') if isinstance(v, str) else v for k, v in dataDict.items()} + # Append the dictionary to the list + dataList.append(utf8Encoded) + + return sorted(dataList, key=lambda x: x['sortOrder']) + +#################################################################### +### given a URL to hgdownload file: /hubs/UCSC_GI.assemblyHubList.txt +### this sets up the GenArk listing +#################################################################### +def readGenArkData(url): + # Initialize a list to hold the dictionaries + dataList = [] + + response = requests.get(url) + response.raise_for_status() + fileContent = response.text + fileIo = StringIO(fileContent) + reader = csv.reader(fileIo, delimiter='\t') + + for row in reader: + if row and row[0].startswith('#'): + continue + clade = re.sub(r'\(L\)$', '', row[5]) + cladeP = cladePriority(clade) + dataDict = { + "gcAccession": row[0], + "asmName": row[1], + "scientificName": row[2], + "commonName": row[3], + "taxId": row[4], + "clade": clade, + "sortOrder": cladeP, + } + + utf8Encoded= {k: v.encode('utf-8', 'ignore').decode('utf-8') if isinstance(v, str) else v for k, v in dataDict.items()} + # Append the dictionary to the list + dataList.append(utf8Encoded) + + # reset the list so that accessions such as GCF_000001405.40 + # come before GCF_000001405.39 +# dataList.reverse() +# return dataList + return sorted(dataList, key=lambda x: x['sortOrder']) + +#################################################################### +### a manually maintained clade listing for UCSC dbDb assemblies +#################################################################### +def dbDbCladeList(filePath): + returnList = {} + + with open(filePath, 'r') as file: + reader = csv.reader(file, delimiter='\t') + for row in reader: + if len(row) < 1: + continue + if row[0].startswith('#'): + continue + returnList[row[0]] = row[1] + + return returnList + +#################################################################### +### out of the genArkData list, extract the given clade +#################################################################### +def extractClade(clade, genArkData): + tmpList = {} + for item in genArkData: + if clade != item['clade']: + continue + tmpList[item['gcAccession']] = item['commonName'] + + # return sorted list on the common name, case insensitive + returnList = dict(sorted(tmpList.items(), key=lambda item:item[1].lower())) + + return returnList + +#################################################################### +### Define a key function for sorting by the first word, case insensitive +#################################################################### +def getFirstWordCaseInsensitive(row): + firstWord = row.split('\t')[0] # Extract the first word + return firstWord.lower() # Convert to lowercase for case-insensitive sorting + +#################################################################### +### process the hgcentral.dbDb data into a dictionary +#################################################################### +def processDbDbData(data, clades): + # Initialize a list to hold the dictionaries + dataList = [] + + # Split the data into lines (rows) + rows = data.strip().split('\n') + # reverse the rows so that names such as hg19 come before hg18 + sortedRows = sorted(rows, key=getFirstWordCaseInsensitive, reverse=True) + + for row in sortedRows: + # Split each row into columns + columns = row.split('\t') + clade = clades.get(columns[0], "n/a") + cladeP = cladePriority(clade) + + # corresponds with the SELECT statement + # name,scientificName,organism,taxId,sourceName,description + # Create a dictionary for each row + dataDict = { + "name": columns[0], + "scientificName": columns[1], + "organism": columns[2], + "taxId": columns[3], + "sourceName": columns[4], + "description": columns[5], + "clade": clade, + "sortOrder": cladeP, + } + + utf8Encoded= {k: v.encode('utf-8', 'ignore').decode('utf-8') if isinstance(v, str) else v for k, v in dataDict.items()} + # Append the dictionary to the list + dataList.append(utf8Encoded) + + return sorted(dataList, key=lambda x: x['sortOrder']) + +#################################################################### +### Function to remove non-alphanumeric characters +### cleans up names to make them better indexing words +#################################################################### +def removeNonAlphanumeric(s): + # Ensure string type + if isinstance(s, str): + reSub = re.sub(r'[^a-zA-Z0-9_]', ' ', s).strip() + return re.sub(r'\s+', ' ', reSub) + else: + return s # Return as-is for non-string types + +#################################################################### +def eliminateDupWords(s): + # Split the sentence into words + words = s.split() + + # Initialize a set to keep track of unique words encountered + seenWords = set() + + # List to store the words with duplicates removed + resultWords = [] + + for word in words: + # Convert word to lowercase for case-insensitive comparison + lowerWord = word.lower() + + # Check if the lowercase version of the word has been seen before + if lowerWord not in seenWords: + # If not seen, add it to the result list and mark as seen + resultWords.append(word) + seenWords.add(lowerWord) + + # Join the words back into a single string + return ' '.join(resultWords) + +#################################################################### +### given a genark accession, return pathname to hub.txt +### given: GCA_000001905.1 +### returns: GCA/000/001/905/GCA_000001905.1/hub.txt +#################################################################### +def genarkPath(gcAccession): + # Extract the prefix and the numeric part + prefix, numPart = gcAccession.split('_') + + # Break the numeric part into chunks of three digits + parts = [numPart[i:i+3] for i in range(0, len(numPart), 3)] + + # Join the parts to form the path + path = f"{prefix}/{parts[0]}/{parts[1]}/{parts[2]}/{gcAccession}/hub.txt" + + return path + +#################################################################### +### for the genArk set, establish some ad-hoc priorities +#################################################################### +def establishPriorities(dbDb, genArk): + global topPriorities + global allPriorities + global priorityCounter + + totalItemCount = 0 + + expectedTotal = len(dbDb) + len(genArk) + print(f"### expected total: {expectedTotal:4} = {len(dbDb):4} dbDb genomes + {len(genArk):4} genArk genomes") + + # first priority are the specific manually selected top items + itemCount = 0 + for name, priority in topPriorities.items(): + allPriorities[name] = priority + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\ttopPriorities count: {itemCount:4}") + + primateList = extractClade('primates', genArk) + mammalList = extractClade('mammals', genArk) + + versionScan = {} # key is dbDb name without number version extension, + # value is highest version number seen for this bare + # name + highestVersion = {} # key is dbDb name without number version extension, + # value is the full dbDb name for the highest version + # of this dbDb name + allDbDbNames = {} # key is the full dbDb name, value is its version + + itemCount = 0 + # scanning the dbDb entries, figure out the highest version number + # of each name + for item in dbDb: + dbDbName = item['name'] + splitMatch = re.match("([a-zA-Z]+)(\d+)", dbDbName) + if splitMatch: + allDbDbNames[dbDbName] = splitMatch.group(2) + itemCount += 1 + if splitMatch.group(1) in versionScan: + if float(splitMatch.group(2)) > float(versionScan[splitMatch.group(1)]): + versionScan[splitMatch.group(1)] = splitMatch.group(2) + highestVersion[splitMatch.group(1)] = dbDbName + else: + versionScan[splitMatch.group(1)] = splitMatch.group(2) + highestVersion[splitMatch.group(1)] = dbDbName + else: + print("### dbDb name does not split: ", dbDbName) + allDbDbNames[dbDbName] = 0 + itemCount += 1 + + dbDbLen = len(dbDb) + + # second priority are the highest versioned database primates + # but not homo sapiens since we already have hg38 in topPriorities + itemCount = 0 + sortByValue = sorted(versionScan.items(), key=lambda x: x[1], reverse=True) + for key in sortByValue: + highVersion = highestVersion[key[0]] + if highVersion not in allPriorities: + # find the element in the dbDb list that matches this highVersion name + highDict = next((d for d in dbDb if d.get('name') == highVersion), None) + if highDict['clade'] == "primates": + if highDict['scientificName'].lower() != "homo sapiens": + allPriorities[highVersion] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tdbDb highest version primates count: {itemCount:4}") + + itemCount = 0 + # and now the GenArk GCF/RefSeq homo sapiens should be lined up here next + for item in genArk: + gcAccession = item['gcAccession'] + if not gcAccession.startswith("GCF_"): + continue + if gcAccession not in allPriorities: + sciName = item['scientificName'] + if sciName.lower() == "homo sapiens": + allPriorities[gcAccession] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tgenArk GCF homo sapiens count: {itemCount:4}") + + itemCount = 0 + # and now the GenArk GCA/GenBank homo sapiens should be lined up here next + # GCA/GenBank second + for item in genArk: + gcAccession = item['gcAccession'] + if not gcAccession.startswith("GCA_"): + continue + if gcAccession not in allPriorities: + sciName = item['scientificName'] + if sciName.lower() == "homo sapiens": + allPriorities[gcAccession] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tgenArk GCA homo sapiens count: {itemCount:4}") + + itemCount = 0 + # the primates, GCF/RefSeq first + for asmId, commonName in primateList.items(): + gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1] + if not gcAcc.startswith("GCF_"): + continue + if gcAcc not in allPriorities: + allPriorities[gcAcc] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tgenArk GCF primates count: {itemCount:4}") + + itemCount = 0 + # and the GCA/GenBank primates + for asmId, commonName in primateList.items(): + gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1] + if not gcAcc.startswith("GCA_"): + continue + if gcAcc not in allPriorities: + allPriorities[gcAcc] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tgenArk GCA primates count: {itemCount:4}") + + # next are the highest versioned database mammals + itemCount = 0 + sortByValue = sorted(versionScan.items(), key=lambda x: x[1], reverse=True) + for key in sortByValue: + highVersion = highestVersion[key[0]] + if highVersion not in allPriorities: + # find the element in the dbDb list that matches this highVersion name + highDict = next((d for d in dbDb if d.get('name') == highVersion), None) + if highDict['clade'] == "mammals": + allPriorities[highVersion] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tdbDb highest version mammals count: {itemCount:4}") + + itemCount = 0 + # the mammals, GCF/RefSeq first + for asmId, commonName in mammalList.items(): + gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1] + if not gcAcc.startswith("GCF_"): + continue + if gcAcc not in allPriorities: + allPriorities[gcAcc] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tgenArk GCF mammals count: {itemCount:4}") + + itemCount = 0 + # and the GCA/GenBank mammals + for asmId, commonName in mammalList.items(): + gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1] + if not gcAcc.startswith("GCA_"): + continue + if gcAcc not in allPriorities: + allPriorities[gcAcc] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tgenArk GCA mammals count: {itemCount:4}") + + itemCount = 0 + # dbDb is in cladeOrder, process in that order, find highest versions + for item in dbDb: + dbDbName = item['name'] + if dbDbName not in allPriorities: + splitMatch = re.match("([a-zA-Z]+)(\d+)", dbDbName) + if splitMatch: + noVersion = splitMatch.group(1) + version = allDbDbNames[dbDbName] + highVersion = versionScan[noVersion] + if highVersion == version: + allPriorities[dbDbName] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tdbDb highest versions count: {itemCount:4}") + + itemCount = 0 + # GCF RefSeq from GenArk next priority + for item in genArk: + gcAccession = item['gcAccession'] + if not gcAccession.startswith("GCF_"): + continue + if gcAccession not in allPriorities: + allPriorities[gcAccession] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tgenArk GCF count: {itemCount:4}") + + itemCount = 0 + # GCA GenBank from GenArk next priority + for item in genArk: + gcAccession = item['gcAccession'] + if not gcAccession.startswith("GCA_"): + continue + if gcAccession not in allPriorities: + allPriorities[gcAccession] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tgenArk GCA count: {itemCount:4}") + + itemCount = 0 + for entry in dbDb: + dbName = entry['name'] + if dbName not in allPriorities: + allPriorities[dbName] = priorityCounter + priorityCounter += 1 + itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tthe rest of dbDb count: {itemCount:4}") + +#################################################################### + +def main(): + global priorityCounter + global allPriorities + + if len(sys.argv) != 2: + print("assemblyList.py - prepare assemblyList.tsv file from") + print(" dbDb.hgcentral and UCSC_GI.assemblyHubList.txt file.\n") + print("Usage: assemblyList.py dbDb.name.clade.tsv\n") + print("the dbDb.name.clade.tsv file is a manually curated file to relate") + print(" UCSC database names to GenArk clades, in source tree hubApi/") + print("This script is going to read the dbDb.hgcentral table, and the file") + print(" UCSC_GI.assemblyHubList.txt from hgdownload.") + print("Writing an output file assemblyList.tsv to be loaded into") + print(" assemblyList.hgcentral. The output file needs to be sorted") + print(" sort -k2,2n assemblyList.tsv before table load.") + sys.exit(255) + + # Ensure stdout and stderr use UTF-8 encoding + set_utf8_encoding() + + initCladePriority() + + dbDbNameCladeFile = sys.argv[1] + + # the correspondence of dbDb names to GenArk clade categories + dbDbClades = dbDbCladeList(dbDbNameCladeFile) + # Get the dbDb.hgcentral table data + rawData = dbDbData() + dbDbItems = processDbDbData(rawData, dbDbClades) + + # read the GenArk data from hgdownload into a list of dictionaries + genArkUrl = "https://hgdownload.soe.ucsc.edu/hubs/UCSC_GI.assemblyHubList.txt" + genArkItems = readGenArkData(genArkUrl) + + establishPriorities(dbDbItems, genArkItems) + + asmIdClade = readAsmIdClade() + commonNames = allCommonNames() + print("# all common names: ", len(commonNames)) + + refSeqList = readAsmSummary("refseq.txt", allPriorities, commonNames, asmIdClade) + print("# refSeq assemblies: ", len(refSeqList)) + genBankList = readAsmSummary("genbank.txt", allPriorities, commonNames, asmIdClade) + print("# genBank assemblies: ", len(genBankList)) + + refSeqGenBankList = refSeqList + genBankList + print("# refSeq + genBank assemblies: ", len(refSeqGenBankList)) + + refSeqGenBankSorted = sorted(refSeqGenBankList, key=lambda x: x['sortOrder']) + print("# sorted refSeq + genBank assemblies: ", len(refSeqGenBankSorted)) + + + outFile = "assemblyList.tsv" + fileOut = open(outFile, 'w') + + totalItemCount = 0 + itemCount = 0 + # Print the dbDb data + for entry in dbDbItems: + dbDbName = entry['name'] + if dbDbName in allPriorities: + priority = allPriorities[dbDbName] + else: + print("no priority for ", dbDbName) + sys.exit(255) + + clade = entry['clade'] + + descr = f"{entry['sourceName']} {entry['taxId']} {entry['description']}\n" + description = re.sub(r'\s+', ' ', descr).strip() + outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t\n" + fileOut.write(outLine) + itemCount += 1 + + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tdbDb count: {itemCount:4}") + + itemCount = 0 + # Print the GenArk data + for entry in genArkItems: + gcAccession = entry['gcAccession'] + if gcAccession in allPriorities: + priority = allPriorities[gcAccession] + else: + print("no priority for ", gcAccession) + sys.exit(255) + + hubPath = genarkPath(gcAccession) + cleanName = removeNonAlphanumeric(entry['commonName']) + clade = entry['clade'] + descr = f"{entry['asmName']} {entry['taxId']}" + description = re.sub(r'\s+', ' ', descr).strip() + outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t{hubPath}\n" + fileOut.write(outLine) + itemCount += 1 + + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tgenArk count: {itemCount:4}") + + incrementPriority = len(allPriorities) + 1 + print("# incrementing priorities from: ", incrementPriority) + + itemCount = 0 + # Print the refSeq/genBank data + for entry in refSeqGenBankSorted: + gcAccession = entry['gcAccession'] + commonName = entry['commonName'] + scientificName = entry['scientificName'] + asmName = entry['asmName'] + clade = entry['clade'] + descr = f"{asmName} {entry['taxId']} {entry['other']}" + description = re.sub(r'\s+', ' ', descr).strip() + outLine = f"{entry['gcAccession']}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\t\n" + fileOut.write(outLine) + incrementPriority += 1 + itemCount += 1 + + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\trefSeq + genbank count: {itemCount:4}") + + fileOut.close() + +if __name__ == "__main__": + main()