aff75303759349db45579315630cb17e9370f86f hiram Fri Jul 5 16:07:27 2024 -0700 now with assembly exist signal and including all RefSeq potential assemblies refs #32897 diff --git src/hg/hubApi/genomePriority.py src/hg/hubApi/genomePriority.py index 7f22cad..addcb207 100755 --- src/hg/hubApi/genomePriority.py +++ src/hg/hubApi/genomePriority.py @@ -33,30 +33,156 @@ #################################################################### def dbDbData(): # Run the MySQL command and capture the output as bytes result = subprocess.run( ["hgsql", "-hgenome-centdb", "-N", "-e", "SELECT name,scientificName,organism,taxId,sourceName,description FROM dbDb WHERE active=1;", "hgcentral"], stdout=subprocess.PIPE, stderr=subprocess.PIPE ) if result.returncode != 0: print(f"Error executing MySQL command: {result.stderr.decode('utf-8')}") exit(1) # Decode the output from bytes to string using utf-8 return result.stdout.decode('latin-1') #################################################################### +def readCommonNames(ncbiType): + commonNames = {} + + filePath = "/hive/data/outside/ncbi/genomes/reports/" + ncbiType + "/allAssemblies.commonNames.tsv" + + with open(filePath, 'r', encoding='utf-8') as file: + reader = csv.reader(file, delimiter='\t') + for row in reader: + commonNames[row[1]] = row[3] + + return commonNames + +#################################################################### +""" + header definitions from assembly_summary_refseq.txt + + 1 #assembly_accession 20 ftp_path + 2 bioproject 21 excluded_from_refseq + 3 biosample 22 relation_to_type_material + 4 wgs_master 23 asm_not_live_date + 5 refseq_category 24 assembly_type + 6 taxid 25 group + 7 species_taxid 26 genome_size + 8 organism_name 27 genome_size_ungapped + 9 infraspecific_name 28 gc_percent + 10 isolate 29 replicon_count + 11 version_status 30 scaffold_count + 12 assembly_level 31 contig_count + 13 release_type 32 annotation_provider + 14 genome_rep 33 annotation_name + 15 seq_rel_date 34 annotation_date + 16 asm_name 35 total_gene_count + 17 asm_submitter 36 protein_coding_gene_count + 18 gbrs_paired_asm 37 non_coding_gene_count + 19 paired_asm_comp 38 pubmed_id + +Would be good to verify this in the readAsmSummary to make sure it +hasn't changed. + + 2175 archaea + 360585 bacteria + 607 fungi + 414 invertebrate + 184 plant + 96 protozoa + 231 vertebrate_mammalian + 405 vertebrate_other + 14992 viral + + +""" + +def readAsmSummary(suffix, prioExists, comNames): + # read one of the NCBI files from + # /hive/data/outside/ncbi/genomes/reports/assembly_summary_{suffix} + # Initialize a list to hold the dictionaries + dataList = [] + keys = [ +"vertebrate_mammalian", +"vertebrate_other", +"invertebrate", +"plant", +"fungi", +"protozoa", +"viral", +"bacteria", +"archaea", + ] + values = [ +'1', +'2', +'3', +'4', +'5', +'6', +'7', +'8', +'0', + ] + cladePrio = dict(zip(keys, values)) + + filePath = "/hive/data/outside/ncbi/genomes/reports/assembly_summary_" + suffix + + with open(filePath, 'r', encoding='utf-8') as file: + reader = csv.reader(file, delimiter='\t') + for row in reader: + if len(row) < 1: + continue + if row[0].startswith('#'): + continue + if row[0] in prioExists: + continue + if len(row) != 38: + print(f"ERROR: incorrect number of fields in {file}") + sys.exit(1) + gcAccession = row[0] + strain = re.sub(r'breed=', '', row[8]) + s0 = re.sub(r'cultivar=', '', strain) + strain = re.sub(r'ecotype=', '', s0) + s0 = re.sub(r'strain=', '', strain) + strain = re.sub(r'na', '', s0) + year = re.sub(r'/.*', '', row[14]) + asmSubmitter = row[16] + asmType = row[23] +# commonName = row[7] + commonName = "n/a" + if gcAccession in comNames: + commonName = comNames[gcAccession] + dataDict = { + "gcAccession": gcAccession, + "asmName": row[15], + "scientificName": row[7], + "commonName": commonName, + "taxId": row[5], + "clade": row[24], # almost like GenArk clades + "other": asmSubmitter + " " + strain + " " + asmType + " " + year, + "sortOrder": cladePrio[row[24]] + } + + utf8Encoded= {k: v.encode('utf-8', 'ignore').decode('utf-8') if isinstance(v, str) else v for k, v in dataDict.items()} + # Append the dictionary to the list + dataList.append(utf8Encoded) + + return sorted(dataList, key=lambda x: x['sortOrder']) + +#################################################################### ### given a URL to hgdownload file: /hubs/UCSC_GI.assemblyHubList.txt def readGenArkData(url): # Initialize a list to hold the dictionaries dataList = [] response = requests.get(url) response.raise_for_status() fileContent = response.text fileIo = StringIO(fileContent) reader = csv.reader(fileIo, delimiter='\t') for row in reader: if row and row[0].startswith('#'): continue dataDict = { "gcAccession": row[0], @@ -430,56 +556,100 @@ # Ensure stdout and stderr use UTF-8 encoding set_utf8_encoding() # the correspondence of dbDb names to GenArk clade categories dbDbClades = dbDbCladeList(dbDbNameCladeFile) # Get the dbDb.hgcentral table data rawData = dbDbData() dbDbItems = processDbDbData(rawData, dbDbClades) # read the GenArk data from hgdownload into a list of dictionaries genArkUrl = "https://hgdownload.soe.ucsc.edu/hubs/UCSC_GI.assemblyHubList.txt" genArkItems = readGenArkData(genArkUrl) establishPriorities(dbDbItems, genArkItems) + refSeqCommonNames = readCommonNames("refseq") + print("# refseq common names: ", len(refSeqCommonNames)) + + refSeqList = readAsmSummary("refseq.txt", allPriorities, refSeqCommonNames) + + print("# refSeq assemblies: ", len(refSeqList)) + outFile = "genomePriority.tsv" fileOut = open(outFile, 'w') + totalItemCount = 0 itemCount = 0 # Print the dbDb data for entry in dbDbItems: dbDbName = entry['name'] if dbDbName in allPriorities: priority = allPriorities[dbDbName] else: print("no priority for ", dbDbName) clade = entry['clade'] - descr = f"{entry['sourceName']} {clade} {entry['taxId']} {entry['description']}\n" + descr = f"{entry['sourceName']} {clade} {entry['description']}\n" description = re.sub(r'\s+', ' ', descr).strip() - outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{description}\n" + outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{description}\t1\n" fileOut.write(outLine) itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tdbDb count: {itemCount:4}") + itemCount = 0 # Print the GenArk data for entry in genArkItems: gcAccession = entry['gcAccession'] if gcAccession in allPriorities: priority = allPriorities[gcAccession] else: print("no priority for ", gcAccession) cleanName = removeNonAlphanumeric(entry['commonName']) clade = entry['clade'] - descr = f"{entry['asmName']} {clade} {entry['taxId']}\n" + descr = f"{entry['asmName']} {clade}" + description = re.sub(r'\s+', ' ', descr).strip() + outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{description}\t1\n" + fileOut.write(outLine) + itemCount += 1 + + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\tgenArk count: {itemCount:4}") + + itemCount = 0 + incrementPriority = len(allPriorities) + print("# incrementing priorities from: ", incrementPriority) + # Print the dbDb data + for entry in refSeqList: + gcAccession = entry['gcAccession'] + commonName = entry['commonName'] + scientificName = entry['scientificName'] + descr = f"{entry['other']} {entry['clade']}" description = re.sub(r'\s+', ' ', descr).strip() - outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{description}\n" + outLine = f"{entry['gcAccession']}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\n" fileOut.write(outLine) + incrementPriority += 1 itemCount += 1 + totalItemCount += itemCount + print(f"{totalItemCount:4} - total\trefSeq count: {itemCount:4}") + +# def removeNonAlphanumeric(s): + fileOut.close() if __name__ == "__main__": main() + +""" + "gcAccession": row[0], + "asmName": row[15], + "scientificName": row[7], + "commonName": row[3], + "taxId": row[5], + "clade": row[24], # almost like GenArk clades + "other": asmSubmitter + " " + strain + " " + asmType, +"""