ff0ea8e2f1223e74bcb82b2426e6976bca6473b7 hiram Thu Sep 12 20:50:47 2024 -0700 get the year in the description if it is not in other fields and correctly lowerCase the category,status and level words refs #33720 diff --git src/hg/hubApi/assemblyList.py src/hg/hubApi/assemblyList.py index 48d2598..62f9d2f 100755 --- src/hg/hubApi/assemblyList.py +++ src/hg/hubApi/assemblyList.py @@ -206,42 +206,41 @@ # key is gcAccession, value is version_status and refseq_category filePath = "/hive/data/outside/ncbi/genomes/reports/assembly_summary_" + suffix with open(filePath, 'r', encoding='utf-8') as file: reader = csv.reader(file, delimiter='\t') for row in reader: if len(row) < 1: continue if row[0].startswith('#'): continue gcAccession = row[0] ### record the year and status here before bailing out so ### these can get into genArk if needed asmName = row[15] - assemblyLevel = re.sub(r'genome', '', row[11], flags=re.IGNORECASE) + assemblyLevel = re.sub(r'genome', '', row[11], flags=re.IGNORECASE).lower() year = re.sub(r'/.*', '', row[14]) yearDict[gcAccession] = year - versionStatus = row[10] - refSeqCategory = re.sub(r'genome', '', row[4]) + versionStatus = row[10].lower() + refSeqCategory = re.sub(r'genome', '', row[4]).lower() if refSeqCategory == "na": refSeqCategory = "" if versionStatus == "na": versionStatus = "" if assemblyLevel == "na": assemblyLevel = "" - genomeStatus = f"{refSeqCategory} {versionStatus} {assemblyLevel}" thisStat = { "refSeqCategory": refSeqCategory, "versionStatus": versionStatus, "assemblyLevel": assemblyLevel, } statusDict[gcAccession] = thisStat if gcAccession in prioExists: continue if len(row) != 38: print(f"ERROR: incorrect number of fields in {file}") sys.exit(255) strain = re.sub(r'breed=', '', row[8]) s0 = re.sub(r'cultivar=', '', strain) strain = re.sub(r'ecotype=', '', s0) s0 = re.sub(r'strain=', '', strain) @@ -254,31 +253,31 @@ commonName = comNames[gcAccession] clade = row[24] # almost like GenArk clades if asmId in asmIdClade: # specific GenArk clade clade = asmIdClade[asmId] if clade == "plant": clade = "plants" cladeP = cladePriority(clade) dataDict = { "gcAccession": gcAccession, "asmName": asmName, "scientificName": row[7], "commonName": commonName, "taxId": row[5], "clade": clade, - "other": f"{asmSubmitter} {strain} {asmType} {year} {genomeStatus}", + "other": f"{asmSubmitter} {strain} {asmType} {year}", "year": year, "refSeqCategory": refSeqCategory, "versionStatus": versionStatus, "assemblyLevel": assemblyLevel, "sortOrder": cladeP, } utf8Encoded= {k: v.encode('utf-8', 'ignore').decode('utf-8') if isinstance(v, str) else v for k, v in dataDict.items()} # Append the dictionary to the list dataList.append(utf8Encoded) return sorted(dataList, key=lambda x: x['sortOrder']), yearDict, statusDict #################################################################### ### given a URL to hgdownload file: /hubs/UCSC_GI.assemblyHubList.txt @@ -295,33 +294,33 @@ reader = csv.reader(fileIo, delimiter='\t') for row in reader: if row and row[0].startswith('#'): continue clade = re.sub(r'\(L\)$', '', row[5]) cladeP = cladePriority(clade) dataDict = { "gcAccession": row[0], "asmName": row[1], "scientificName": row[2], "commonName": row[3], "taxId": row[4], "clade": clade, "year": 0, - "refSeqCategory": "na", - "versionStatus": "na", - "assemblyLevel": "na", + "refSeqCategory": "", + "versionStatus": "", + "assemblyLevel": "", "sortOrder": cladeP, } utf8Encoded= {k: v.encode('utf-8', 'ignore').decode('utf-8') if isinstance(v, str) else v for k, v in dataDict.items()} # Append the dictionary to the list dataList.append(utf8Encoded) # reset the list so that accessions such as GCF_000001405.40 # come before GCF_000001405.39 # dataList.reverse() # return dataList return sorted(dataList, key=lambda x: x['sortOrder']) #################################################################### ### a manually maintained clade listing for UCSC dbDb assemblies @@ -465,33 +464,33 @@ #################################################################### ### scan the genArks items, add years and status if not already there #################################################################### def addYearsStatus(genArks, years, status): for item in genArks: gcAccession = item['gcAccession'] if gcAccession in years: year = years[gcAccession] item['year'] = year pat = r'\b' + re.escape(year) + r'\b' if not re.search(pat, item['commonName']): if not re.search(pat, item['taxId']): item['taxId'] += " " + year if gcAccession in status: stat = status[gcAccession] - item['refSeqCategory'] = stat['refSeqCategory'] - item['versionStatus'] = stat['versionStatus'] - item['assemblyLevel'] = stat['assemblyLevel'] + item['refSeqCategory'] = stat['refSeqCategory'].lower() + item['versionStatus'] = stat['versionStatus'].lower() + item['assemblyLevel'] = stat['assemblyLevel'].lower() ## pat = r'\b' + re.escape(stat) + r'\b' ## if not re.search(pat, item['taxId']): ## item['taxId'] += " " + stat return #################################################################### ### for the genArk set, establish some ad-hoc priorities #################################################################### def establishPriorities(dbDb, genArk): global topPriorities global allPriorities global priorityCounter totalItemCount = 0 @@ -779,86 +778,96 @@ # Print the dbDb data for entry in dbDbItems: dbDbName = entry['name'] if dbDbName in allPriorities: priority = allPriorities[dbDbName] else: print("no priority for ", dbDbName) sys.exit(255) clade = entry['clade'] year = entry['year'] gcAccession = entry['gcAccession'] refSeqCategory = "" versionStatus = "" assemblyLevel = "" + organism = entry['organism'] if "na" not in gcAccession: if gcAccession in allStatus: stat = allStatus[gcAccession] - refSeqCategory = stat['refSeqCategory'] - versionStatus = stat['versionStatus'] - assemblyLevel = stat['assemblyLevel'] - - descr = f"{entry['sourceName']} {entry['taxId']} {entry['description']}\n" + refSeqCategory = stat['refSeqCategory'].lower() + versionStatus = stat['versionStatus'].lower() + assemblyLevel = stat['assemblyLevel'].lower() + + descr = f"{entry['sourceName']} {entry['taxId']} {entry['description']}" + yearSearch = r'\b{}\b'.format(year) + if not re.search(yearSearch, organism) and not re.search(yearSearch,descr): + descr = f"{entry['sourceName']} {entry['taxId']} {entry['description']} {year}" description = re.sub(r'\s+', ' ', descr).strip() - outLine =f"{entry['name']}\t{priority}\t{entry['organism']}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\n" + outLine =f"{entry['name']}\t{priority}\t{organism}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\n" fileOut.write(outLine) itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tdbDb count: {itemCount:4}") itemCount = 0 # Print the GenArk data for entry in genArkItems: gcAccession = entry['gcAccession'] if gcAccession in allPriorities: priority = allPriorities[gcAccession] else: print("no priority for ", gcAccession) sys.exit(255) hubPath = genarkPath(gcAccession) - cleanName = removeNonAlphanumeric(entry['commonName']) + commonName = entry['commonName'] clade = entry['clade'] + year = entry['year'] descr = f"{entry['asmName']} {entry['taxId']}" + yearSearch = r'\b{}\b'.format(year) + if not re.search(yearSearch, commonName) and not re.search(yearSearch, descr): + descr = f"{entry['asmName']} {entry['taxId']} {year}" description = re.sub(r'\s+', ' ', descr).strip() - year = entry['year'] - refSeqCategory = entry['refSeqCategory'] - versionStatus = entry['versionStatus'] - assemblyLevel = entry['assemblyLevel'] - outLine = f"{entry['gcAccession']}\t{priority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t{hubPath}\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\n" + refSeqCategory = entry['refSeqCategory'].lower() + versionStatus = entry['versionStatus'].lower() + assemblyLevel = entry['assemblyLevel'].lower() + outLine = f"{entry['gcAccession']}\t{priority}\t{commonName.encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t{hubPath}\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\n" fileOut.write(outLine) itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tgenArk count: {itemCount:4}") incrementPriority = len(allPriorities) + 1 print("# incrementing priorities from: ", incrementPriority) itemCount = 0 # Print the refSeq/genBank data for entry in refSeqGenBankSorted: gcAccession = entry['gcAccession'] commonName = entry['commonName'] scientificName = entry['scientificName'] asmName = entry['asmName'] clade = entry['clade'] year = entry['year'] - refSeqCategory = entry['refSeqCategory'] - versionStatus = entry['versionStatus'] - assemblyLevel = entry['assemblyLevel'] + refSeqCategory = entry['refSeqCategory'].lower() + versionStatus = entry['versionStatus'].lower() + assemblyLevel = entry['assemblyLevel'].lower() descr = f"{asmName} {entry['taxId']} {entry['other']}" + yearSearch = r'\b{}\b'.format(year) + if not re.search(yearSearch, commonName) and not re.search(yearSearch, descr): + descr = f"{asmName} {entry['taxId']} {entry['other']} {year}" description = re.sub(r'\s+', ' ', descr).strip() outLine = f"{gcAccession}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\t\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\n" fileOut.write(outLine) incrementPriority += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\trefSeq + genbank count: {itemCount:4}") fileOut.close() if __name__ == "__main__": main()