a4bf3820f12a2275c02f005c60fd84f56df26c09 hiram Tue Feb 4 13:34:48 2025 -0800 find GenArk equivalents given a list of accessions for Tiberius (and other data) import refs #34917 diff --git src/hg/makeDb/doc/asmHubs/findEquiv.py src/hg/makeDb/doc/asmHubs/findEquiv.py new file mode 100755 index 00000000000..f11a7d3164c --- /dev/null +++ src/hg/makeDb/doc/asmHubs/findEquiv.py @@ -0,0 +1,150 @@ +#!/usr/bin/python3 + +# given some list of accessions: GCx_012345678.9 +# check to see if they are in GenArk or they at least +# have GCF/GCA equivalents in GenArk. +# +# arguments are: the file list of accessions +# and: file to write resulting two column tsv list: +# accessionFromInputList GenArkIdentifier +# where the GenArkIdentifier will be the full name: +# GCx_012345678,9_asmName + +import csv +import sys +import subprocess + +############################################################################# +### read the genark list: "UCSC_GI.assemblyHubList.txt" +### obtained via rsync: +### rsync -a -P \ +### rsync://hgdownload.soe.ucsc.edu/hubs/UCSC_GI.assemblyHubList.txt ./ +############################################################################# +def readGenArk(hubList): + genArk = {} + noHeaders = ["grep", "-a", "-v", "^#", hubList] + cutCommand = ["cut", "-f1,2"] + onlyGc = ["grep", "^GC"] + sortCommand = ["sort", "-u"] + try: + noHeaderProc = subprocess.Popen(noHeaders, stdout=subprocess.PIPE) + cutProc = subprocess.Popen(cutCommand, stdin=noHeaderProc.stdout, stdout=subprocess.PIPE) + matchGc = subprocess.Popen(onlyGc, stdin=cutProc.stdout, stdout=subprocess.PIPE) + sortProcess = subprocess.Popen(sortCommand, stdin=matchGc.stdout, stdout=subprocess.PIPE, text=True) + + for line in sortProcess.stdout: + parts = line.strip().split('\t') + if len(parts) == 2: + key, asmName = parts + genArk[key] = asmName + except Exception as e: + sys.stderr.write(f"Error processing grep pipeline: {e}\n") + sys.exit(1) + + print(f"### len(genArk): {len(genArk)}") + return genArk + +############################################################################# +### read in the identifier list to check +############################################################################# +def readIdentifiers(filePath): + identifierDict = {} + try: + with open(filePath, 'r') as file: + for line in file: + identifier = line.strip() + if identifier: + identifierDict[identifier] = True + except Exception as e: + sys.stderr.write(f"Error reading file {filePath}: {e}\n") + sys.exit(1) + return identifierDict + +############################################################################# +### read the NCBI assembly_summary files for all RefSeq, current and historical +### return an equivalent GCF->GCA dictionary with identical/different status +############################################################################# +def readGrepData(): + file1 = "/hive/data/outside/ncbi/genomes/reports/assembly_summary_refseq.txt" + file2 = "/hive/data/outside/ncbi/genomes/reports/assembly_summary_refseq_historical.txt" + noHeaders = ["grep", "-h", "-v", "^#", file1, file2] + onlyGcf = ["grep", "GCF_"] + andGca = ["grep", "GCA_"] + cutCommand = ["cut", "-f1,18,19"] + sortCommand = ["sort", "-u"] + + dataDict = {} + try: + noHeaderProc = subprocess.Popen(noHeaders, stdout=subprocess.PIPE) + gcfProc = subprocess.Popen(onlyGcf, stdin=noHeaderProc.stdout, stdout=subprocess.PIPE) + andGcaProc = subprocess.Popen(andGca, stdin=gcfProc.stdout, stdout=subprocess.PIPE) + cutProcess = subprocess.Popen(cutCommand, stdin=andGcaProc.stdout, stdout=subprocess.PIPE) + sortProcess = subprocess.Popen(sortCommand, stdin=cutProcess.stdout, stdout=subprocess.PIPE, text=True) + + for line in sortProcess.stdout: + parts = line.strip().split('\t') + if len(parts) == 3: + key, col18, col19 = parts + dataDict[key] = (col18, col19) + dataDict[col18] = (key, col19) + except Exception as e: + sys.stderr.write(f"Error processing grep pipeline: {e}\n") + sys.exit(1) + + print(f"### len(gcaGcf): {len(dataDict)}") + return dataDict + +############################################################################# +### now with all lists in dictionaries, check the equivalence +### return an identity equivalent dictionary with the key as the +### original identifier, and the value the GenArk identifier +############################################################################# +def relateData(identifierDict, genArk, dataDict): + identEquiv = {} + + initialIdentCount = len(identifierDict) + print(f"### checking {initialIdentCount} identifiers") + + genArkMatch = identifierDict.keys() & genArk.keys() + for key in genArkMatch: + identEquiv[key] = f"{key}_{genArk[key]}" + + genArkCount = len(identEquiv) + print(f"### genArk matched {genArkCount}") + notMatched = identifierDict.keys() - identEquiv.keys() + print(f"### remaining identifiers to match {len(notMatched)}") + + equivCount = 0 + # check for equivalent GCA/GCF in genArk + for key in notMatched: + if key in dataDict: + equivKey = dataDict[key][0] + if equivKey in genArk: + if equivKey not in identEquiv: + identEquiv[key] = f"{equivKey}_{genArk[equivKey]}" + equivCount += 1 + print(f"{key}\t{identEquiv[key]}") + + print(f"### found an additional {equivCount} equivalents") + + return identEquiv + +############################################################################# +### main() +############################################################################# +if __name__ == "__main__": + if len(sys.argv) != 3: + sys.stderr.write("Usage: findEquiv.py \n") + sys.exit(1) + + identifierFile = sys.argv[1] + resultFile = sys.argv[2] + identifierDict = readIdentifiers(identifierFile) + genArkDict = readGenArk("UCSC_GI.assemblyHubList.txt") + dataDict = readGrepData() + identEquiv = relateData(identifierDict, genArkDict, dataDict) + + # write results + with open(resultFile, "w", newline="\n") as file: + writer = csv.writer(file, delimiter="\t", lineterminator="\n") + writer.writerows(identEquiv.items())