168c7a7081021c0656d309e61e00e070c66bfd2d hiram Mon Sep 1 22:57:04 2025 -0700 improved procedure refs #34917 diff --git src/hg/makeDb/doc/asmHubs/findEquiv.py src/hg/makeDb/doc/asmHubs/findEquiv.py index 9c9f55a1a12..ee95cd5afb0 100755 --- src/hg/makeDb/doc/asmHubs/findEquiv.py +++ src/hg/makeDb/doc/asmHubs/findEquiv.py @@ -98,54 +98,65 @@ ### now with all lists in dictionaries, check the equivalence ### return an identity equivalent dictionary with the key as the ### original identifier, and the value the GenArk identifier ############################################################################# def relateData(identifierDict, genArk, dataDict): identEquiv = {} initialIdentCount = len(identifierDict) print(f"### checking {initialIdentCount} identifiers") genArkMatch = identifierDict.keys() & genArk.keys() for key in genArkMatch: identEquiv[key] = f"{key}_{genArk[key]}" genArkCount = len(identEquiv) - print(f"### genArk matched {genArkCount}") + print(f"### genArk perfect matched {genArkCount}") notMatched = identifierDict.keys() - identEquiv.keys() print(f"### remaining identifiers to match {len(notMatched)}") equivCount = 0 # check for equivalent GCA/GCF in genArk for key in notMatched: if key in dataDict: equivKey = dataDict[key][0] if equivKey in genArk: if equivKey not in identEquiv: identEquiv[key] = f"{equivKey}_{genArk[equivKey]}" equivCount += 1 print(f"{key}\t{identEquiv[key]}") print(f"### found an additional {equivCount} equivalents") return identEquiv ############################################################################# ### main() ############################################################################# if __name__ == "__main__": - if len(sys.argv) != 3: - sys.stderr.write("Usage: findEquiv.py <identifierFile> <resultFile>\n") + if len(sys.argv) != 4: + sys.stderr.write("Usage: findEquiv.py <identifierFile> <resultFile> <notFoundFile>\n") sys.exit(1) identifierFile = sys.argv[1] resultFile = sys.argv[2] + notFoundFile = sys.argv[3] identifierDict = readIdentifiers(identifierFile) genArkDict = readGenArk("UCSC_GI.assemblyHubList.txt") dataDict = readGrepData() identEquiv = relateData(identifierDict, genArkDict, dataDict) # write results, sorted with open(resultFile, "w", newline="\n") as file: writer = csv.writer(file, delimiter="\t", lineterminator="\n") for key, value in sorted(identEquiv.items()): writer.writerow([key, value]) + + # check if all items matched, if not write them out to notFoundFile + notFound = [key for key in identifierDict if key not in identEquiv] + if len(notFound) > 0: + print(f"### {len(notFound)} identifiers not found written to: {notFoundFile}") + with open(notFoundFile, "w", newline="\n") as file: + file.writelines(f"{key}\n" for key in sorted(notFound)) + else: + with open(notFoundFile, "w", newline="\n") as file: + file.write(f"# all matched OK\n")