168c7a7081021c0656d309e61e00e070c66bfd2d
hiram
  Mon Sep 1 22:57:04 2025 -0700
improved procedure refs #34917

diff --git src/hg/makeDb/doc/asmHubs/findEquiv.py src/hg/makeDb/doc/asmHubs/findEquiv.py
index 9c9f55a1a12..ee95cd5afb0 100755
--- src/hg/makeDb/doc/asmHubs/findEquiv.py
+++ src/hg/makeDb/doc/asmHubs/findEquiv.py
@@ -98,54 +98,65 @@
 ### now with all lists in dictionaries, check the equivalence
 ### return an identity equivalent dictionary with the key as the
 ### original identifier, and the value the GenArk identifier
 #############################################################################
 def relateData(identifierDict, genArk, dataDict):
     identEquiv = {}
 
     initialIdentCount = len(identifierDict)
     print(f"### checking {initialIdentCount} identifiers")
 
     genArkMatch = identifierDict.keys() & genArk.keys()
     for key in genArkMatch:
         identEquiv[key] = f"{key}_{genArk[key]}"
 
     genArkCount = len(identEquiv)
-    print(f"### genArk matched {genArkCount}")
+    print(f"### genArk perfect matched {genArkCount}")
     notMatched = identifierDict.keys() - identEquiv.keys()
     print(f"### remaining identifiers to match {len(notMatched)}")
 
     equivCount = 0
     # check for equivalent GCA/GCF in genArk
     for key in notMatched:
         if key in dataDict:
             equivKey = dataDict[key][0]
             if equivKey in genArk:
                 if equivKey not in identEquiv:
                     identEquiv[key] = f"{equivKey}_{genArk[equivKey]}"
                     equivCount += 1
                     print(f"{key}\t{identEquiv[key]}")
 
     print(f"### found an additional {equivCount} equivalents")
 
     return identEquiv
 
 #############################################################################
 ### main()
 #############################################################################
 if __name__ == "__main__":
-    if len(sys.argv) != 3:
-        sys.stderr.write("Usage: findEquiv.py <identifierFile> <resultFile>\n")
+    if len(sys.argv) != 4:
+        sys.stderr.write("Usage: findEquiv.py <identifierFile> <resultFile> <notFoundFile>\n")
         sys.exit(1)
     
     identifierFile = sys.argv[1]
     resultFile = sys.argv[2]
+    notFoundFile = sys.argv[3]
     identifierDict = readIdentifiers(identifierFile)
     genArkDict = readGenArk("UCSC_GI.assemblyHubList.txt")
     dataDict = readGrepData()
     identEquiv = relateData(identifierDict, genArkDict, dataDict)
 
     # write results, sorted
     with open(resultFile, "w", newline="\n") as file:
         writer = csv.writer(file, delimiter="\t", lineterminator="\n")
         for key, value in sorted(identEquiv.items()):
             writer.writerow([key, value])
+
+    # check if all items matched, if not write them out to notFoundFile
+    notFound = [key for key in identifierDict if key not in identEquiv]
+    if len(notFound) > 0:
+        print(f"### {len(notFound)} identifiers not found written to: {notFoundFile}")
+        with open(notFoundFile, "w", newline="\n") as file:
+            file.writelines(f"{key}\n" for key in sorted(notFound))
+    else:
+        with open(notFoundFile, "w", newline="\n") as file:
+            file.write(f"# all matched OK\n")