a4bf3820f12a2275c02f005c60fd84f56df26c09
hiram
  Tue Feb 4 13:34:48 2025 -0800
find GenArk equivalents given a list of accessions for Tiberius (and other data) import refs #34917

diff --git src/hg/makeDb/doc/asmHubs/findEquiv.py src/hg/makeDb/doc/asmHubs/findEquiv.py
new file mode 100755
index 00000000000..f11a7d3164c
--- /dev/null
+++ src/hg/makeDb/doc/asmHubs/findEquiv.py
@@ -0,0 +1,150 @@
+#!/usr/bin/python3
+
+# given some list of accessions: GCx_012345678.9
+# check to see if they are in GenArk or they at least
+# have GCF/GCA equivalents in GenArk.
+#
+# arguments are: the file list of accessions
+#           and: file to write resulting two column tsv list:
+#           accessionFromInputList <tab> GenArkIdentifier
+# where the GenArkIdentifier will be the full name:
+#           GCx_012345678,9_asmName
+
+import csv
+import sys
+import subprocess
+
+#############################################################################
+### read the genark list: "UCSC_GI.assemblyHubList.txt"
+### obtained via rsync: 
+### rsync -a -P \
+###   rsync://hgdownload.soe.ucsc.edu/hubs/UCSC_GI.assemblyHubList.txt ./
+#############################################################################
+def readGenArk(hubList):
+    genArk = {}
+    noHeaders = ["grep", "-a", "-v", "^#", hubList]
+    cutCommand = ["cut", "-f1,2"]
+    onlyGc = ["grep", "^GC"]
+    sortCommand = ["sort", "-u"]
+    try:
+        noHeaderProc = subprocess.Popen(noHeaders, stdout=subprocess.PIPE)
+        cutProc = subprocess.Popen(cutCommand, stdin=noHeaderProc.stdout, stdout=subprocess.PIPE)
+        matchGc = subprocess.Popen(onlyGc, stdin=cutProc.stdout, stdout=subprocess.PIPE)
+        sortProcess = subprocess.Popen(sortCommand, stdin=matchGc.stdout, stdout=subprocess.PIPE, text=True)
+        
+        for line in sortProcess.stdout:
+            parts = line.strip().split('\t')
+            if len(parts) == 2:
+                key, asmName = parts
+                genArk[key] = asmName
+    except Exception as e:
+        sys.stderr.write(f"Error processing grep pipeline: {e}\n")
+        sys.exit(1)
+    
+    print(f"### len(genArk): {len(genArk)}")
+    return genArk
+
+#############################################################################
+### read in the identifier list to check
+#############################################################################
+def readIdentifiers(filePath):
+    identifierDict = {}
+    try:
+        with open(filePath, 'r') as file:
+            for line in file:
+                identifier = line.strip()
+                if identifier:
+                    identifierDict[identifier] = True
+    except Exception as e:
+        sys.stderr.write(f"Error reading file {filePath}: {e}\n")
+        sys.exit(1)
+    return identifierDict
+
+#############################################################################
+### read the NCBI assembly_summary files for all RefSeq, current and historical
+### return an equivalent GCF->GCA dictionary with identical/different status
+#############################################################################
+def readGrepData():
+    file1 = "/hive/data/outside/ncbi/genomes/reports/assembly_summary_refseq.txt"
+    file2 = "/hive/data/outside/ncbi/genomes/reports/assembly_summary_refseq_historical.txt"
+    noHeaders = ["grep", "-h", "-v", "^#", file1, file2]
+    onlyGcf = ["grep", "GCF_"]
+    andGca = ["grep", "GCA_"]
+    cutCommand = ["cut", "-f1,18,19"]
+    sortCommand = ["sort", "-u"]
+    
+    dataDict = {}
+    try:
+        noHeaderProc = subprocess.Popen(noHeaders, stdout=subprocess.PIPE)
+        gcfProc = subprocess.Popen(onlyGcf, stdin=noHeaderProc.stdout, stdout=subprocess.PIPE)
+        andGcaProc = subprocess.Popen(andGca, stdin=gcfProc.stdout, stdout=subprocess.PIPE)
+        cutProcess = subprocess.Popen(cutCommand, stdin=andGcaProc.stdout, stdout=subprocess.PIPE)
+        sortProcess = subprocess.Popen(sortCommand, stdin=cutProcess.stdout, stdout=subprocess.PIPE, text=True)
+        
+        for line in sortProcess.stdout:
+            parts = line.strip().split('\t')
+            if len(parts) == 3:
+                key, col18, col19 = parts
+                dataDict[key] = (col18, col19)
+                dataDict[col18] = (key, col19)
+    except Exception as e:
+        sys.stderr.write(f"Error processing grep pipeline: {e}\n")
+        sys.exit(1)
+    
+    print(f"### len(gcaGcf): {len(dataDict)}")
+    return dataDict
+
+#############################################################################
+### now with all lists in dictionaries, check the equivalence
+### return an identity equivalent dictionary with the key as the
+### original identifier, and the value the GenArk identifier
+#############################################################################
+def relateData(identifierDict, genArk, dataDict):
+    identEquiv = {}
+
+    initialIdentCount = len(identifierDict)
+    print(f"### checking {initialIdentCount} identifiers")
+
+    genArkMatch = identifierDict.keys() & genArk.keys()
+    for key in genArkMatch:
+        identEquiv[key] = f"{key}_{genArk[key]}"
+
+    genArkCount = len(identEquiv)
+    print(f"### genArk matched {genArkCount}")
+    notMatched = identifierDict.keys() - identEquiv.keys()
+    print(f"### remaining identifiers to  match {len(notMatched)}")
+
+    equivCount = 0
+    # check for equivalent GCA/GCF in genArk
+    for key in notMatched:
+        if key in dataDict:
+            equivKey = dataDict[key][0]
+            if equivKey in genArk:
+                if equivKey not in identEquiv:
+                    identEquiv[key] = f"{equivKey}_{genArk[equivKey]}"
+                    equivCount += 1
+                    print(f"{key}\t{identEquiv[key]}")
+
+    print(f"### found an additional {equivCount} equivalents")
+
+    return identEquiv
+
+#############################################################################
+### main()
+#############################################################################
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        sys.stderr.write("Usage: findEquiv.py <identifierFile> <resultFile>\n")
+        sys.exit(1)
+    
+    identifierFile = sys.argv[1]
+    resultFile = sys.argv[2]
+    identifierDict = readIdentifiers(identifierFile)
+    genArkDict = readGenArk("UCSC_GI.assemblyHubList.txt")
+    dataDict = readGrepData()
+    identEquiv = relateData(identifierDict, genArkDict, dataDict)
+
+    # write results
+    with open(resultFile, "w", newline="\n") as file:
+        writer = csv.writer(file, delimiter="\t", lineterminator="\n")
+        writer.writerows(identEquiv.items())