456f4a30692d5db99a4b90e76b2f7a4f06bae0e1 hiram Fri Sep 13 10:53:13 2024 -0700 simple year match is good enough refs #33720 diff --git src/hg/hubApi/assemblyList.py src/hg/hubApi/assemblyList.py index 62f9d2f..a6c8548 100755 --- src/hg/hubApi/assemblyList.py +++ src/hg/hubApi/assemblyList.py @@ -787,87 +787,84 @@ clade = entry['clade'] year = entry['year'] gcAccession = entry['gcAccession'] refSeqCategory = "" versionStatus = "" assemblyLevel = "" organism = entry['organism'] if "na" not in gcAccession: if gcAccession in allStatus: stat = allStatus[gcAccession] refSeqCategory = stat['refSeqCategory'].lower() versionStatus = stat['versionStatus'].lower() assemblyLevel = stat['assemblyLevel'].lower() descr = f"{entry['sourceName']} {entry['taxId']} {entry['description']}" - yearSearch = r'\b{}\b'.format(year) - if not re.search(yearSearch, organism) and not re.search(yearSearch,descr): + if year not in organism and year not in descr: descr = f"{entry['sourceName']} {entry['taxId']} {entry['description']} {year}" description = re.sub(r'\s+', ' ', descr).strip() outLine =f"{entry['name']}\t{priority}\t{organism}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\n" fileOut.write(outLine) itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tdbDb count: {itemCount:4}") itemCount = 0 # Print the GenArk data for entry in genArkItems: gcAccession = entry['gcAccession'] if gcAccession in allPriorities: priority = allPriorities[gcAccession] else: print("no priority for ", gcAccession) sys.exit(255) hubPath = genarkPath(gcAccession) commonName = entry['commonName'] clade = entry['clade'] year = entry['year'] descr = f"{entry['asmName']} {entry['taxId']}" - yearSearch = r'\b{}\b'.format(year) - if not re.search(yearSearch, commonName) and not re.search(yearSearch, descr): + if year not in commonName and year not in descr: descr = f"{entry['asmName']} {entry['taxId']} {year}" description = re.sub(r'\s+', ' ', descr).strip() refSeqCategory = entry['refSeqCategory'].lower() versionStatus = entry['versionStatus'].lower() assemblyLevel = entry['assemblyLevel'].lower() outLine = f"{entry['gcAccession']}\t{priority}\t{commonName.encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t{hubPath}\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\n" fileOut.write(outLine) itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\tgenArk count: {itemCount:4}") incrementPriority = len(allPriorities) + 1 print("# incrementing priorities from: ", incrementPriority) itemCount = 0 # Print the refSeq/genBank data for entry in refSeqGenBankSorted: gcAccession = entry['gcAccession'] commonName = entry['commonName'] scientificName = entry['scientificName'] asmName = entry['asmName'] clade = entry['clade'] year = entry['year'] refSeqCategory = entry['refSeqCategory'].lower() versionStatus = entry['versionStatus'].lower() assemblyLevel = entry['assemblyLevel'].lower() descr = f"{asmName} {entry['taxId']} {entry['other']}" - yearSearch = r'\b{}\b'.format(year) - if not re.search(yearSearch, commonName) and not re.search(yearSearch, descr): + if year not in commonName and year not in descr: descr = f"{asmName} {entry['taxId']} {entry['other']} {year}" description = re.sub(r'\s+', ' ', descr).strip() outLine = f"{gcAccession}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\t\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\n" fileOut.write(outLine) incrementPriority += 1 itemCount += 1 totalItemCount += itemCount print(f"{totalItemCount:4} - total\trefSeq + genbank count: {itemCount:4}") fileOut.close() if __name__ == "__main__": main()