7d62fef94fc8e61b8c4f2b66229d81596e19a44d hiram Mon Aug 19 13:50:38 2024 -0700 augment the description on GenArk assemblies to include the year and the assembly status refs #33720 diff --git src/hg/hubApi/assemblyList.py src/hg/hubApi/assemblyList.py index 6a547ad..5ff3d4f 100755 --- src/hg/hubApi/assemblyList.py +++ src/hg/hubApi/assemblyList.py @@ -73,30 +73,34 @@ #################################################################### ### this common name file is setup weekly via a cron job #################################################################### def allCommonNames(): commonNames = {} filePath = "/hive/data/outside/ncbi/genomes/reports/allCommonNames/asmId.commonName.all.txt" with open(filePath, 'r', encoding='utf-8') as file: reader = csv.reader(file, delimiter='\t') for row in reader: asmId = row[0] gcAcc = asmId.split('_')[0] + "_" + asmId.split('_')[1] commonNames[gcAcc] = row[1] + if len(commonNames) < 1000000: + sys.stderr.write(f"ERROR: assemblyList.py: allCommonNames is failing\n") + sys.exit(255) + return commonNames #################################################################### """ header definitions from assembly_summary_refseq.txt 1 #assembly_accession 20 ftp_path 2 bioproject 21 excluded_from_refseq 3 biosample 22 relation_to_type_material 4 wgs_master 23 asm_not_live_date 5 refseq_category 24 assembly_type 6 taxid 25 group 7 species_taxid 26 genome_size 8 organism_name 27 genome_size_ungapped 9 infraspecific_name 28 gc_percent @@ -176,87 +180,103 @@ #################################################################### ### given clade c, return priority from cladePrio #################################################################### def cladePriority(c): global cladePrio if c not in cladePrio: print(f"ERROR: missing clade {c} in cladePrio") sys.exit(255) return cladePrio[c] #################################################################### ### read one of the NCBI files from ### /hive/data/outside/ncbi/genomes/reports/assembly_summary_{suffix} +### returning three items: +### 1. sorted list of dictionaries representing all the data read in +### 2. dictionary: key gcAccession and value year of assembly +### 3. dictionary: key gcAccession and value genome version and refseq status #################################################################### def readAsmSummary(suffix, prioExists, comNames, asmIdClade): # Initialize a list to hold the dictionaries dataList = [] + yearDict = {} # key is gcAccession, value is year + statusDict = {} + # key is gcAccession, value is version_status and refseq_category filePath = "/hive/data/outside/ncbi/genomes/reports/assembly_summary_" + suffix with open(filePath, 'r', encoding='utf-8') as file: reader = csv.reader(file, delimiter='\t') for row in reader: if len(row) < 1: continue if row[0].startswith('#'): continue - if row[0] in prioExists: + gcAccession = row[0] + ### record the year and status here before bailing out so + ### these can get into genArk if needed + asmName = row[15] + year = re.sub(r'/.*', '', row[14]) + yearDict[gcAccession] = year + versionStatus = row[10] + refseqCategory = re.sub(r'genome', '', row[4]) + if refseqCategory == "na": + refseqCategory = "" + genomeStatus = refseqCategory + versionStatus + statusDict[gcAccession] = genomeStatus + if gcAccession in prioExists: continue if len(row) != 38: print(f"ERROR: incorrect number of fields in {file}") sys.exit(255) - gcAccession = row[0] strain = re.sub(r'breed=', '', row[8]) s0 = re.sub(r'cultivar=', '', strain) strain = re.sub(r'ecotype=', '', s0) s0 = re.sub(r'strain=', '', strain) strain = re.sub(r'na', '', s0) - year = re.sub(r'/.*', '', row[14]) - asmName = row[15] asmId = gcAccession + "_" + asmName asmSubmitter = row[16] asmType = row[23] commonName = "n/a" if gcAccession in comNames: commonName = comNames[gcAccession] clade = row[24] # almost like GenArk clades if asmId in asmIdClade: # specific GenArk clade clade = asmIdClade[asmId] if clade == "plant": clade = "plants" cladeP = cladePriority(clade) dataDict = { "gcAccession": gcAccession, "asmName": asmName, "scientificName": row[7], "commonName": commonName, "taxId": row[5], "clade": clade, - "other": asmSubmitter + " " + strain + " " + asmType + " " + year, + "other": asmSubmitter + " " + strain + " " + asmType + " " + year + " " + genomeStatus, "sortOrder": cladeP, } utf8Encoded= {k: v.encode('utf-8', 'ignore').decode('utf-8') if isinstance(v, str) else v for k, v in dataDict.items()} # Append the dictionary to the list dataList.append(utf8Encoded) - return sorted(dataList, key=lambda x: x['sortOrder']) + return sorted(dataList, key=lambda x: x['sortOrder']), yearDict, statusDict #################################################################### ### given a URL to hgdownload file: /hubs/UCSC_GI.assemblyHubList.txt ### this sets up the GenArk listing #################################################################### def readGenArkData(url): # Initialize a list to hold the dictionaries dataList = [] response = requests.get(url) response.raise_for_status() fileContent = response.text fileIo = StringIO(fileContent) reader = csv.reader(fileIo, delimiter='\t') @@ -403,30 +423,51 @@ ### given: GCA_000001905.1 ### returns: GCA/000/001/905/GCA_000001905.1/hub.txt #################################################################### def genarkPath(gcAccession): # Extract the prefix and the numeric part prefix, numPart = gcAccession.split('_') # Break the numeric part into chunks of three digits parts = [numPart[i:i+3] for i in range(0, len(numPart), 3)] # Join the parts to form the path path = f"{prefix}/{parts[0]}/{parts[1]}/{parts[2]}/{gcAccession}/hub.txt" return path + +#################################################################### +### scan the genArks items, add years and status if not already there +#################################################################### +def addYearsStatus(genArks, years, status): + for item in genArks: + gcAccession = item['gcAccession'] + if gcAccession in years: + year = years[gcAccession] + pat = r'\b' + re.escape(year) + r'\b' + if not re.search(pat, item['commonName']): + if not re.search(pat, item['taxId']): + item['taxId'] += " " + year + if gcAccession in status: + stat = status[gcAccession] + pat = r'\b' + re.escape(stat) + r'\b' + if not re.search(pat, item['taxId']): + item['taxId'] += " " + stat + + return + #################################################################### ### for the genArk set, establish some ad-hoc priorities #################################################################### def establishPriorities(dbDb, genArk): global topPriorities global allPriorities global priorityCounter totalItemCount = 0 expectedTotal = len(dbDb) + len(genArk) print(f"### expected total: {expectedTotal:4} = {len(dbDb):4} dbDb genomes + {len(genArk):4} genArk genomes") # first priority are the specific manually selected top items itemCount = 0 @@ -668,34 +709,38 @@ dbDbClades = dbDbCladeList(dbDbNameCladeFile) # Get the dbDb.hgcentral table data rawData = dbDbData() dbDbItems = processDbDbData(rawData, dbDbClades) # read the GenArk data from hgdownload into a list of dictionaries genArkUrl = "https://hgdownload.soe.ucsc.edu/hubs/UCSC_GI.assemblyHubList.txt" genArkItems = readGenArkData(genArkUrl) establishPriorities(dbDbItems, genArkItems) asmIdClade = readAsmIdClade() commonNames = allCommonNames() print("# all common names: ", len(commonNames)) - refSeqList = readAsmSummary("refseq.txt", allPriorities, commonNames, asmIdClade) + refSeqList, refSeqYears, refSeqStatus = readAsmSummary("refseq.txt", allPriorities, commonNames, asmIdClade) print("# refSeq assemblies: ", len(refSeqList)) - genBankList = readAsmSummary("genbank.txt", allPriorities, commonNames, asmIdClade) + genBankList, genBankYears, genBankStatus = readAsmSummary("genbank.txt", allPriorities, commonNames, asmIdClade) print("# genBank assemblies: ", len(genBankList)) + ### dictionary unpacking, combine both dictionaries (Python 3.5+) + allYears = {**refSeqYears, **genBankYears} + allStatus = {**refSeqStatus, **genBankStatus} + addYearsStatus(genArkItems, allYears, allStatus) refSeqGenBankList = refSeqList + genBankList print("# refSeq + genBank assemblies: ", len(refSeqGenBankList)) refSeqGenBankSorted = sorted(refSeqGenBankList, key=lambda x: x['sortOrder']) print("# sorted refSeq + genBank assemblies: ", len(refSeqGenBankSorted)) outFile = "assemblyList.tsv" fileOut = open(outFile, 'w') totalItemCount = 0 itemCount = 0 # Print the dbDb data for entry in dbDbItems: @@ -740,27 +785,27 @@ print(f"{totalItemCount:4} - total\tgenArk count: {itemCount:4}") incrementPriority = len(allPriorities) + 1 print("# incrementing priorities from: ", incrementPriority) itemCount = 0 # Print the refSeq/genBank data for entry in refSeqGenBankSorted: gcAccession = entry['gcAccession'] commonName = entry['commonName'] scientificName = entry['scientificName'] asmName = entry['asmName'] clade = entry['clade'] descr = f"{asmName} {entry['taxId']} {entry['other']}" description = re.sub(r'\s+', ' ', descr).strip() - outLine = f"{entry['gcAccession']}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\t\n" + outLine = f"{gcAccession}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\t\n" fileOut.write(outLine) incrementPriority += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\trefSeq + genbank count: {itemCount:4}") fileOut.close() if __name__ == "__main__": main()