48466d5a0bfaa3fa457f5a9a07118c323de32b11 hiram Wed Aug 28 18:08:34 2024 -0700 fixed up to add the new columns refs #33720 diff --git src/hg/hubApi/assemblyList.py src/hg/hubApi/assemblyList.py index 5ff3d4f..48d2598 100755 --- src/hg/hubApi/assemblyList.py +++ src/hg/hubApi/assemblyList.py @@ -206,69 +206,83 @@ # key is gcAccession, value is version_status and refseq_category filePath = "/hive/data/outside/ncbi/genomes/reports/assembly_summary_" + suffix with open(filePath, 'r', encoding='utf-8') as file: reader = csv.reader(file, delimiter='\t') for row in reader: if len(row) < 1: continue if row[0].startswith('#'): continue gcAccession = row[0] ### record the year and status here before bailing out so ### these can get into genArk if needed asmName = row[15] + assemblyLevel = re.sub(r'genome', '', row[11], flags=re.IGNORECASE) year = re.sub(r'/.*', '', row[14]) yearDict[gcAccession] = year versionStatus = row[10] - refseqCategory = re.sub(r'genome', '', row[4]) - if refseqCategory == "na": - refseqCategory = "" - genomeStatus = refseqCategory + versionStatus - statusDict[gcAccession] = genomeStatus + refSeqCategory = re.sub(r'genome', '', row[4]) + if refSeqCategory == "na": + refSeqCategory = "" + if versionStatus == "na": + versionStatus = "" + if assemblyLevel == "na": + assemblyLevel = "" + genomeStatus = f"{refSeqCategory} {versionStatus} {assemblyLevel}" + thisStat = { + "refSeqCategory": refSeqCategory, + "versionStatus": versionStatus, + "assemblyLevel": assemblyLevel, + } + statusDict[gcAccession] = thisStat if gcAccession in prioExists: continue if len(row) != 38: print(f"ERROR: incorrect number of fields in {file}") sys.exit(255) strain = re.sub(r'breed=', '', row[8]) s0 = re.sub(r'cultivar=', '', strain) strain = re.sub(r'ecotype=', '', s0) s0 = re.sub(r'strain=', '', strain) strain = re.sub(r'na', '', s0) asmId = gcAccession + "_" + asmName asmSubmitter = row[16] asmType = row[23] commonName = "n/a" if gcAccession in comNames: commonName = comNames[gcAccession] clade = row[24] # almost like GenArk clades if asmId in asmIdClade: # specific GenArk clade clade = asmIdClade[asmId] if clade == "plant": clade = "plants" cladeP = cladePriority(clade) dataDict = { "gcAccession": gcAccession, "asmName": asmName, "scientificName": row[7], "commonName": commonName, "taxId": row[5], "clade": clade, - "other": asmSubmitter + " " + strain + " " + asmType + " " + year + " " + genomeStatus, + "other": f"{asmSubmitter} {strain} {asmType} {year} {genomeStatus}", + "year": year, + "refSeqCategory": refSeqCategory, + "versionStatus": versionStatus, + "assemblyLevel": assemblyLevel, "sortOrder": cladeP, } utf8Encoded= {k: v.encode('utf-8', 'ignore').decode('utf-8') if isinstance(v, str) else v for k, v in dataDict.items()} # Append the dictionary to the list dataList.append(utf8Encoded) return sorted(dataList, key=lambda x: x['sortOrder']), yearDict, statusDict #################################################################### ### given a URL to hgdownload file: /hubs/UCSC_GI.assemblyHubList.txt ### this sets up the GenArk listing #################################################################### def readGenArkData(url): # Initialize a list to hold the dictionaries @@ -280,111 +294,123 @@ fileIo = StringIO(fileContent) reader = csv.reader(fileIo, delimiter='\t') for row in reader: if row and row[0].startswith('#'): continue clade = re.sub(r'\(L\)$', '', row[5]) cladeP = cladePriority(clade) dataDict = { "gcAccession": row[0], "asmName": row[1], "scientificName": row[2], "commonName": row[3], "taxId": row[4], "clade": clade, + "year": 0, + "refSeqCategory": "na", + "versionStatus": "na", + "assemblyLevel": "na", "sortOrder": cladeP, } utf8Encoded= {k: v.encode('utf-8', 'ignore').decode('utf-8') if isinstance(v, str) else v for k, v in dataDict.items()} # Append the dictionary to the list dataList.append(utf8Encoded) # reset the list so that accessions such as GCF_000001405.40 # come before GCF_000001405.39 # dataList.reverse() # return dataList return sorted(dataList, key=lambda x: x['sortOrder']) #################################################################### ### a manually maintained clade listing for UCSC dbDb assemblies #################################################################### def dbDbCladeList(filePath): - returnList = {} + returnClade = {} + returnYear = {} + returnNcbi = {} with open(filePath, 'r') as file: reader = csv.reader(file, delimiter='\t') for row in reader: if len(row) < 1: continue if row[0].startswith('#'): continue - returnList[row[0]] = row[1] + returnClade[row[0]] = row[1] + returnYear[row[0]] = row[2] + returnNcbi[row[0]] = row[3] - return returnList + return returnClade, returnYear, returnNcbi #################################################################### ### out of the genArkData list, extract the given clade #################################################################### def extractClade(clade, genArkData): tmpList = {} for item in genArkData: if clade != item['clade']: continue tmpList[item['gcAccession']] = item['commonName'] # return sorted list on the common name, case insensitive returnList = dict(sorted(tmpList.items(), key=lambda item:item[1].lower())) return returnList #################################################################### ### Define a key function for sorting by the first word, case insensitive #################################################################### def getFirstWordCaseInsensitive(row): firstWord = row.split('\t')[0] # Extract the first word return firstWord.lower() # Convert to lowercase for case-insensitive sorting #################################################################### ### process the hgcentral.dbDb data into a dictionary #################################################################### -def processDbDbData(data, clades): +def processDbDbData(data, clades, years, ncbi): # Initialize a list to hold the dictionaries dataList = [] # Split the data into lines (rows) rows = data.strip().split('\n') # reverse the rows so that names such as hg19 come before hg18 sortedRows = sorted(rows, key=getFirstWordCaseInsensitive, reverse=True) for row in sortedRows: # Split each row into columns columns = row.split('\t') clade = clades.get(columns[0], "n/a") + year = years.get(columns[0], "n/a") + gcAccession = ncbi.get(columns[0], "n/a") cladeP = cladePriority(clade) # corresponds with the SELECT statement # name,scientificName,organism,taxId,sourceName,description # Create a dictionary for each row dataDict = { "name": columns[0], "scientificName": columns[1], "organism": columns[2], "taxId": columns[3], "sourceName": columns[4], "description": columns[5], "clade": clade, + "year": year, + "gcAccession": gcAccession, "sortOrder": cladeP, } utf8Encoded= {k: v.encode('utf-8', 'ignore').decode('utf-8') if isinstance(v, str) else v for k, v in dataDict.items()} # Append the dictionary to the list dataList.append(utf8Encoded) return sorted(dataList, key=lambda x: x['sortOrder']) #################################################################### ### Function to remove non-alphanumeric characters ### cleans up names to make them better indexing words #################################################################### def removeNonAlphanumeric(s): # Ensure string type @@ -432,39 +458,43 @@ # Join the parts to form the path path = f"{prefix}/{parts[0]}/{parts[1]}/{parts[2]}/{gcAccession}/hub.txt" return path #################################################################### ### scan the genArks items, add years and status if not already there #################################################################### def addYearsStatus(genArks, years, status): for item in genArks: gcAccession = item['gcAccession'] if gcAccession in years: year = years[gcAccession] + item['year'] = year pat = r'\b' + re.escape(year) + r'\b' if not re.search(pat, item['commonName']): if not re.search(pat, item['taxId']): item['taxId'] += " " + year if gcAccession in status: stat = status[gcAccession] - pat = r'\b' + re.escape(stat) + r'\b' - if not re.search(pat, item['taxId']): - item['taxId'] += " " + stat + item['refSeqCategory'] = stat['refSeqCategory'] + item['versionStatus'] = stat['versionStatus'] + item['assemblyLevel'] = stat['assemblyLevel'] +## pat = r'\b' + re.escape(stat) + r'\b' +## if not re.search(pat, item['taxId']): +## item['taxId'] += " " + stat return #################################################################### ### for the genArk set, establish some ad-hoc priorities #################################################################### def establishPriorities(dbDb, genArk): global topPriorities global allPriorities global priorityCounter totalItemCount = 0 expectedTotal = len(dbDb) + len(genArk) print(f"### expected total: {expectedTotal:4} = {len(dbDb):4} dbDb genomes + {len(genArk):4} genArk genomes") @@ -676,136 +706,159 @@ allPriorities[dbName] = priorityCounter priorityCounter += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tthe rest of dbDb count: {itemCount:4}") #################################################################### def main(): global priorityCounter global allPriorities if len(sys.argv) != 2: print("assemblyList.py - prepare assemblyList.tsv file from") print(" dbDb.hgcentral and UCSC_GI.assemblyHubList.txt file.\n") - print("Usage: assemblyList.py dbDb.name.clade.tsv\n") - print("the dbDb.name.clade.tsv file is a manually curated file to relate") + print("Usage: assemblyList.py dbDb.clade.year.acc.tsv\n") + print("the dbDb.clade.year.tsv file is a manually curated file to relate") print(" UCSC database names to GenArk clades, in source tree hubApi/") print("This script is going to read the dbDb.hgcentral table, and the file") print(" UCSC_GI.assemblyHubList.txt from hgdownload.") print("Writing an output file assemblyList.tsv to be loaded into") print(" assemblyList.hgcentral. The output file needs to be sorted") print(" sort -k2,2n assemblyList.tsv before table load.") sys.exit(255) # Ensure stdout and stderr use UTF-8 encoding set_utf8_encoding() initCladePriority() dbDbNameCladeFile = sys.argv[1] # the correspondence of dbDb names to GenArk clade categories - dbDbClades = dbDbCladeList(dbDbNameCladeFile) + dbDbClades, dbDbYears, dbDbNcbi = dbDbCladeList(dbDbNameCladeFile) # Get the dbDb.hgcentral table data rawData = dbDbData() - dbDbItems = processDbDbData(rawData, dbDbClades) + dbDbItems = processDbDbData(rawData, dbDbClades, dbDbYears, dbDbNcbi) # read the GenArk data from hgdownload into a list of dictionaries genArkUrl = "https://hgdownload.soe.ucsc.edu/hubs/UCSC_GI.assemblyHubList.txt" genArkItems = readGenArkData(genArkUrl) establishPriorities(dbDbItems, genArkItems) asmIdClade = readAsmIdClade() commonNames = allCommonNames() print("# all common names: ", len(commonNames)) refSeqList, refSeqYears, refSeqStatus = readAsmSummary("refseq.txt", allPriorities, commonNames, asmIdClade) print("# refSeq assemblies: ", len(refSeqList)) + refSeqListHist, refSeqYearsHist, refSeqStatusHist = readAsmSummary("refseq_historical.txt", allPriorities, commonNames, asmIdClade) + print("# refSeq historical assemblies: ", len(refSeqListHist)) genBankList, genBankYears, genBankStatus = readAsmSummary("genbank.txt", allPriorities, commonNames, asmIdClade) print("# genBank assemblies: ", len(genBankList)) + genBankListHist, genBankYearsHist, genBankStatusHist = readAsmSummary("genbank_historical.txt", allPriorities, commonNames, asmIdClade) + print("# genBank historical assemblies: ", len(genBankListHist)) ### dictionary unpacking, combine both dictionaries (Python 3.5+) - allYears = {**refSeqYears, **genBankYears} - allStatus = {**refSeqStatus, **genBankStatus} + allYears = {**refSeqYears, **genBankYears, **refSeqYearsHist, **genBankYearsHist} + allStatus = {**refSeqStatus, **genBankStatus, **refSeqStatusHist, **genBankStatusHist} addYearsStatus(genArkItems, allYears, allStatus) - refSeqGenBankList = refSeqList + genBankList + refSeqGenBankList = refSeqList + genBankList + refSeqListHist + genBankListHist print("# refSeq + genBank assemblies: ", len(refSeqGenBankList)) refSeqGenBankSorted = sorted(refSeqGenBankList, key=lambda x: x['sortOrder']) print("# sorted refSeq + genBank assemblies: ", len(refSeqGenBankSorted)) outFile = "assemblyList.tsv" fileOut = open(outFile, 'w') totalItemCount = 0 itemCount = 0 # Print the dbDb data for entry in dbDbItems: dbDbName = entry['name'] if dbDbName in allPriorities: priority = allPriorities[dbDbName] else: print("no priority for ", dbDbName) sys.exit(255) clade = entry['clade'] + year = entry['year'] + gcAccession = entry['gcAccession'] + refSeqCategory = "" + versionStatus = "" + assemblyLevel = "" + if "na" not in gcAccession: + if gcAccession in allStatus: + stat = allStatus[gcAccession] + refSeqCategory = stat['refSeqCategory'] + versionStatus = stat['versionStatus'] + assemblyLevel = stat['assemblyLevel'] descr = f"{entry['sourceName']} {entry['taxId']} {entry['description']}\n" description = re.sub(r'\s+', ' ', descr).strip() - outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t\n" + outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\n" fileOut.write(outLine) itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tdbDb count: {itemCount:4}") itemCount = 0 # Print the GenArk data for entry in genArkItems: gcAccession = entry['gcAccession'] if gcAccession in allPriorities: priority = allPriorities[gcAccession] else: print("no priority for ", gcAccession) sys.exit(255) hubPath = genarkPath(gcAccession) cleanName = removeNonAlphanumeric(entry['commonName']) clade = entry['clade'] descr = f"{entry['asmName']} {entry['taxId']}" description = re.sub(r'\s+', ' ', descr).strip() - outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t{hubPath}\n" + year = entry['year'] + refSeqCategory = entry['refSeqCategory'] + versionStatus = entry['versionStatus'] + assemblyLevel = entry['assemblyLevel'] + outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t{hubPath}\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\n" fileOut.write(outLine) itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tgenArk count: {itemCount:4}") incrementPriority = len(allPriorities) + 1 print("# incrementing priorities from: ", incrementPriority) itemCount = 0 # Print the refSeq/genBank data for entry in refSeqGenBankSorted: gcAccession = entry['gcAccession'] commonName = entry['commonName'] scientificName = entry['scientificName'] asmName = entry['asmName'] clade = entry['clade'] + year = entry['year'] + refSeqCategory = entry['refSeqCategory'] + versionStatus = entry['versionStatus'] + assemblyLevel = entry['assemblyLevel'] descr = f"{asmName} {entry['taxId']} {entry['other']}" description = re.sub(r'\s+', ' ', descr).strip() - outLine = f"{gcAccession}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\t\n" + outLine = f"{gcAccession}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\t\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\n" fileOut.write(outLine) incrementPriority += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\trefSeq + genbank count: {itemCount:4}") fileOut.close() if __name__ == "__main__": main()