456f4a30692d5db99a4b90e76b2f7a4f06bae0e1
hiram
  Fri Sep 13 10:53:13 2024 -0700
simple year match is good enough refs #33720

diff --git src/hg/hubApi/assemblyList.py src/hg/hubApi/assemblyList.py
index 62f9d2f..a6c8548 100755
--- src/hg/hubApi/assemblyList.py
+++ src/hg/hubApi/assemblyList.py
@@ -787,87 +787,84 @@
         clade = entry['clade']
         year = entry['year']
         gcAccession = entry['gcAccession']
         refSeqCategory = ""
         versionStatus = ""
         assemblyLevel = ""
         organism = entry['organism']
         if "na" not in gcAccession:
             if gcAccession in allStatus:
               stat = allStatus[gcAccession]
               refSeqCategory = stat['refSeqCategory'].lower()
               versionStatus = stat['versionStatus'].lower()
               assemblyLevel = stat['assemblyLevel'].lower()
 
         descr = f"{entry['sourceName']} {entry['taxId']} {entry['description']}"
-        yearSearch = r'\b{}\b'.format(year)
-        if not re.search(yearSearch, organism) and not re.search(yearSearch,descr):
+        if year not in organism and year not in descr:
             descr = f"{entry['sourceName']} {entry['taxId']} {entry['description']} {year}"
         description = re.sub(r'\s+', ' ', descr).strip()
         outLine =f"{entry['name']}\t{priority}\t{organism}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\n"
         fileOut.write(outLine)
         itemCount += 1
 
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\tdbDb count: {itemCount:4}")
 
     itemCount = 0
     # Print the GenArk data
     for entry in genArkItems:
         gcAccession = entry['gcAccession']
         if gcAccession in allPriorities:
             priority = allPriorities[gcAccession]
         else:
             print("no priority for ", gcAccession)
             sys.exit(255)
 
         hubPath = genarkPath(gcAccession)
         commonName = entry['commonName']
         clade = entry['clade']
         year = entry['year']
         descr = f"{entry['asmName']} {entry['taxId']}"
-        yearSearch = r'\b{}\b'.format(year)
-        if not re.search(yearSearch, commonName) and not re.search(yearSearch, descr):
+        if year not in commonName and year not in descr:
             descr = f"{entry['asmName']} {entry['taxId']} {year}"
         description = re.sub(r'\s+', ' ', descr).strip()
         refSeqCategory = entry['refSeqCategory'].lower()
         versionStatus = entry['versionStatus'].lower()
         assemblyLevel = entry['assemblyLevel'].lower()
         outLine = f"{entry['gcAccession']}\t{priority}\t{commonName.encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description}\t1\t{hubPath}\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\n"
         fileOut.write(outLine)
         itemCount += 1
 
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\tgenArk count: {itemCount:4}")
 
     incrementPriority = len(allPriorities) + 1
     print("# incrementing priorities from: ", incrementPriority)
 
     itemCount = 0
     # Print the refSeq/genBank data
     for entry in refSeqGenBankSorted:
         gcAccession = entry['gcAccession']
         commonName = entry['commonName']
         scientificName = entry['scientificName']
         asmName = entry['asmName']
         clade = entry['clade']
         year = entry['year']
         refSeqCategory = entry['refSeqCategory'].lower()
         versionStatus = entry['versionStatus'].lower()
         assemblyLevel = entry['assemblyLevel'].lower()
         descr = f"{asmName} {entry['taxId']} {entry['other']}"
-        yearSearch = r'\b{}\b'.format(year)
-        if not re.search(yearSearch, commonName) and not re.search(yearSearch, descr):
+        if year not in commonName and year not in descr:
             descr = f"{asmName} {entry['taxId']} {entry['other']} {year}"
         description = re.sub(r'\s+', ' ', descr).strip()
         outLine = f"{gcAccession}\t{incrementPriority}\t{entry['commonName'].encode('ascii', 'ignore').decode('ascii')}\t{entry['scientificName']}\t{entry['taxId']}\t{clade}\t{description.encode('ascii', 'ignore').decode('ascii')}\t0\t\t{year}\t{refSeqCategory}\t{versionStatus}\t{assemblyLevel}\n"
         fileOut.write(outLine)
         incrementPriority += 1
         itemCount += 1
 
     totalItemCount += itemCount
     print(f"{totalItemCount:4} - total\trefSeq + genbank count: {itemCount:4}")
 
     fileOut.close()
 
 if __name__ == "__main__":
     main()