47ca1286ce276827df0a092c5bf3be1fbfabe1ee hiram Tue Feb 4 13:43:03 2025 -0800 output list in sorted order refs #34917 diff --git src/hg/makeDb/doc/asmHubs/findEquiv.py src/hg/makeDb/doc/asmHubs/findEquiv.py index f11a7d3164c..9c9f55a1a12 100755 --- src/hg/makeDb/doc/asmHubs/findEquiv.py +++ src/hg/makeDb/doc/asmHubs/findEquiv.py @@ -1,150 +1,151 @@ #!/usr/bin/python3 # given some list of accessions: GCx_012345678.9 # check to see if they are in GenArk or they at least # have GCF/GCA equivalents in GenArk. # # arguments are: the file list of accessions # and: file to write resulting two column tsv list: # accessionFromInputList GenArkIdentifier # where the GenArkIdentifier will be the full name: # GCx_012345678,9_asmName import csv import sys import subprocess ############################################################################# ### read the genark list: "UCSC_GI.assemblyHubList.txt" ### obtained via rsync: ### rsync -a -P \ ### rsync://hgdownload.soe.ucsc.edu/hubs/UCSC_GI.assemblyHubList.txt ./ ############################################################################# def readGenArk(hubList): genArk = {} noHeaders = ["grep", "-a", "-v", "^#", hubList] cutCommand = ["cut", "-f1,2"] onlyGc = ["grep", "^GC"] sortCommand = ["sort", "-u"] try: noHeaderProc = subprocess.Popen(noHeaders, stdout=subprocess.PIPE) cutProc = subprocess.Popen(cutCommand, stdin=noHeaderProc.stdout, stdout=subprocess.PIPE) matchGc = subprocess.Popen(onlyGc, stdin=cutProc.stdout, stdout=subprocess.PIPE) sortProcess = subprocess.Popen(sortCommand, stdin=matchGc.stdout, stdout=subprocess.PIPE, text=True) for line in sortProcess.stdout: parts = line.strip().split('\t') if len(parts) == 2: key, asmName = parts genArk[key] = asmName except Exception as e: sys.stderr.write(f"Error processing grep pipeline: {e}\n") sys.exit(1) print(f"### len(genArk): {len(genArk)}") return genArk ############################################################################# ### read in the identifier list to check ############################################################################# def readIdentifiers(filePath): identifierDict = {} try: with open(filePath, 'r') as file: for line in file: identifier = line.strip() if identifier: identifierDict[identifier] = True except Exception as e: sys.stderr.write(f"Error reading file {filePath}: {e}\n") sys.exit(1) return identifierDict ############################################################################# ### read the NCBI assembly_summary files for all RefSeq, current and historical ### return an equivalent GCF->GCA dictionary with identical/different status ############################################################################# def readGrepData(): file1 = "/hive/data/outside/ncbi/genomes/reports/assembly_summary_refseq.txt" file2 = "/hive/data/outside/ncbi/genomes/reports/assembly_summary_refseq_historical.txt" noHeaders = ["grep", "-h", "-v", "^#", file1, file2] onlyGcf = ["grep", "GCF_"] andGca = ["grep", "GCA_"] cutCommand = ["cut", "-f1,18,19"] sortCommand = ["sort", "-u"] dataDict = {} try: noHeaderProc = subprocess.Popen(noHeaders, stdout=subprocess.PIPE) gcfProc = subprocess.Popen(onlyGcf, stdin=noHeaderProc.stdout, stdout=subprocess.PIPE) andGcaProc = subprocess.Popen(andGca, stdin=gcfProc.stdout, stdout=subprocess.PIPE) cutProcess = subprocess.Popen(cutCommand, stdin=andGcaProc.stdout, stdout=subprocess.PIPE) sortProcess = subprocess.Popen(sortCommand, stdin=cutProcess.stdout, stdout=subprocess.PIPE, text=True) for line in sortProcess.stdout: parts = line.strip().split('\t') if len(parts) == 3: key, col18, col19 = parts dataDict[key] = (col18, col19) dataDict[col18] = (key, col19) except Exception as e: sys.stderr.write(f"Error processing grep pipeline: {e}\n") sys.exit(1) print(f"### len(gcaGcf): {len(dataDict)}") return dataDict ############################################################################# ### now with all lists in dictionaries, check the equivalence ### return an identity equivalent dictionary with the key as the ### original identifier, and the value the GenArk identifier ############################################################################# def relateData(identifierDict, genArk, dataDict): identEquiv = {} initialIdentCount = len(identifierDict) print(f"### checking {initialIdentCount} identifiers") genArkMatch = identifierDict.keys() & genArk.keys() for key in genArkMatch: identEquiv[key] = f"{key}_{genArk[key]}" genArkCount = len(identEquiv) print(f"### genArk matched {genArkCount}") notMatched = identifierDict.keys() - identEquiv.keys() print(f"### remaining identifiers to match {len(notMatched)}") equivCount = 0 # check for equivalent GCA/GCF in genArk for key in notMatched: if key in dataDict: equivKey = dataDict[key][0] if equivKey in genArk: if equivKey not in identEquiv: identEquiv[key] = f"{equivKey}_{genArk[equivKey]}" equivCount += 1 print(f"{key}\t{identEquiv[key]}") print(f"### found an additional {equivCount} equivalents") return identEquiv ############################################################################# ### main() ############################################################################# if __name__ == "__main__": if len(sys.argv) != 3: sys.stderr.write("Usage: findEquiv.py \n") sys.exit(1) identifierFile = sys.argv[1] resultFile = sys.argv[2] identifierDict = readIdentifiers(identifierFile) genArkDict = readGenArk("UCSC_GI.assemblyHubList.txt") dataDict = readGrepData() identEquiv = relateData(identifierDict, genArkDict, dataDict) - # write results + # write results, sorted with open(resultFile, "w", newline="\n") as file: writer = csv.writer(file, delimiter="\t", lineterminator="\n") - writer.writerows(identEquiv.items()) + for key, value in sorted(identEquiv.items()): + writer.writerow([key, value])