src/hg/utils/automation/refSeqOnGenBank.py 08300d3c6b6f4211a16bf497c40ae6722242d4d4

08300d3c6b6f4211a16bf497c40ae6722242d4d4
hiram
  Fri May 24 14:05:36 2024 -0700
script to add RefSeq genes to a GenBank assembly no redmine

diff --git src/hg/utils/automation/refSeqOnGenBank.py src/hg/utils/automation/refSeqOnGenBank.py
new file mode 100755
index 0000000..7689753
--- /dev/null
+++ src/hg/utils/automation/refSeqOnGenBank.py
@@ -0,0 +1,194 @@
+#!/usr/bin/env python3
+
+### refSeqOnGenBank.py
+### take RefSeq genes from a GCF refseq assembly and add them as tracks
+### to the equivalent GCA genbank assembly
+
+### also used is a botique module 'icecream' which is not typically
+### available.
+
+import sys
+import os
+import re
+import subprocess
+import csv
+
+#############################################################################
+### helper functions
+
+### additional error handling could be done here to output information
+### in case of script crashing with subprocess.CalledProcessError
+### maybe later
+def subProcess(cmd):
+  if sys.version_info >= (3, 7):
+    rtn = subprocess.run(cmd, stdout=subprocess.PIPE, text=True)
+  else:
+    rtn = subprocess.run(cmd, stdout=subprocess.PIPE, universal_newlines=True)
+  if rtn.returncode != 0:
+    print(f"ERROR: subProcess({cmd})")
+    sys.exit(-1)
+
+  return rtn.stdout.strip(), rtn.stderr
+  # the strip() removes any leading or trailing whitespace
+  # use just rtn.stdout if you want the precise output
+
+def buildPath(asmId):
+  gcX = asmId[0:3]
+  d0 = asmId[4:7]
+  d1 = asmId[7:10]
+  d2 = asmId[10:13]
+  tP = "/hive/data/genomes/asmHubs/allBuild/" + "/".join([gcX,d0,d1,d2,asmId])
+  buildPath, stdErr = subProcess(["realpath", tP])
+  if not os.path.isdir(buildPath):
+    print(f"ERROR: can not find build directory {tP}")
+    sys.exit(-1)
+  return buildPath
+
+#############################################################################
+
+# Check if the Python version is at least 3.6  (due to use of f-strings here)
+if sys.version_info < (3, 6):
+    print("This script requires Python 3.6 or higher.")
+    sys.exit(1)
+
+hostEnv, stdErr = subProcess(["uname", "-n"])
+### to use all the special stuff when hgwdev /usr/bin/python3 is used
+### this allows it to find the icecream module
+if re.match(r'hgwdev*', hostEnv):
+  sys.path.append("/cluster/software/lib/python3.6/site-packages")
+  
+### icecream is a convenient debug printout system using the ic() function
+from icecream import install
+install()
+### ic(someArg) will print out the name of the argument and its contents
+###             for any type of object
+ic(sys.argv)
+
+# Check if the correct number of command line arguments is provided
+if len(sys.argv) != 3:
+    print("Usage: refSeqOnGenBank.py <refSeqAsmId> <genBankAsmId>")
+    print("will add the RefSeq gene track to the GenBank assembly")
+    print("when the refSeq genes exist on that refSeq assembly")
+    print(" e.g.:\nrefSeqOnGenBank.py GCF_029281585.2_NHGRI_mGorGor1-v2.0_pri \\\n\tGCA_029281585.2_NHGRI_mGorGor1-v2.0_pri")
+    sys.exit(-1)
+
+def main():
+    refSeqAsmId = sys.argv[1]
+    genBankAsmId = sys.argv[2]
+
+    if not refSeqAsmId.startswith("GCF_"):
+      print(f"ERROR: refSeqAsmId does not start with GCF_: {refSeqAsmId}")
+      sys.exit(-1)
+
+    if not genBankAsmId.startswith("GCA_"):
+      print(f"ERROR: genBankAsmId does not start with GCA_: {genBankAsmId}")
+      sys.exit(-1)
+
+    # obtain the directories where the assembly build exists
+    rBuildPath = buildPath(refSeqAsmId)
+    ic(rBuildPath)
+    gBuildPath = buildPath(genBankAsmId)
+    ic(gBuildPath)
+
+    # verify refSeq gene track is present on the RefSeq assembly
+    refSeqBb = rBuildPath + "/trackData/ncbiRefSeq/" + refSeqAsmId + ".ncbiRefSeq.bb"
+
+    if not os.path.isfile(refSeqBb):
+      print(f"ERROR: refSeq gene track not present on {refSeqAsmId}")
+      print(f"{refSeqBb}")
+      sys.exit(-1)
+
+    genBankWorkDir = gBuildPath + "/trackData/ncbiRefSeq"
+    if os.path.isdir(genBankWorkDir):
+      print(f"ERROR: genBank ncbiRefSeq directory already exists")
+      print(f"{genBankWorkDir}")
+      sys.exit(-1)
+
+    os.makedirs(genBankWorkDir, exist_ok=True)
+    print(f"working in {genBankWorkDir}", file=sys.stderr)
+    gcfToGcaLift = genBankWorkDir + "/GCF.to.GCA.lift"
+
+    chromAlias = rBuildPath + "/trackData/chromAlias/" + refSeqAsmId + ".chromAlias.txt"
+    ic(chromAlias)
+    aliasBed = rBuildPath + "/trackData/chromAlias/" + refSeqAsmId + ".chromAlias.bed"
+    ic(aliasBed)
+
+    aliasHeaderLine, stdErr = subProcess(["head", "-1", chromAlias])
+    ic(aliasHeaderLine)
+    aliasFieldList = aliasHeaderLine.split('\t')
+    # this genbankIndex is the column in the bedAlias file, hence the + 3
+    try:
+      genbankIndex = aliasFieldList.index("genbank") + 3
+    except ValueError:
+      print(f"ERROR: can not find 'genbank' in alias field list")
+      sys.exit(-1)
+
+    if 3 == genbankIndex:
+      print(f"ERROR: found 'genbank' in alias list field as first one ?")
+      sys.exit(-1)
+
+    ic(genbankIndex)
+
+    aliasLines = 0
+    liftLines = 0
+    with open(gcfToGcaLift, 'w') as gcfToGca:
+      with open(aliasBed, 'r') as bedAlias:
+        lines = csv.reader(bedAlias, delimiter='\t')
+        for line in lines:
+          aliasLines += 1
+          if len(line[genbankIndex]) > 0:
+            gcfToGca.write(f"{line[1]}\t{line[0]}\t{line[2]}\t{line[genbankIndex]}\t{line[2]}\n")
+            liftLines += 1
+    #        print(f"{line[1]}\t{line[0]}\t{line[2]}\t{line[genbankIndex]}\t{line[2]}")
+
+    print(f"written\n{gcfToGcaLift}", file=sys.stderr)
+
+    lostLines = aliasLines - liftLines
+    if lostLines > 2:
+      print(f"ERROR: more than two sequence aliases do not match")
+      print(f"not matched: {lostLines} = {aliasLines} - {liftLines}")
+      sys.exit(-1)
+
+    print(f"# refseq sequence count {aliasLines}, genbank sequence count {liftLines}", file=sys.stderr)
+
+    runSh = genBankWorkDir + "/run.sh"
+    with open(runSh, 'w') as doCmd:
+      outStr = f"""#!/bin/bash
+    set -beEu -o pipefail
+
+    cd {genBankWorkDir}
+    export srcAsm="{refSeqAsmId}"
+    export targetAsm="{genBankAsmId}"
+    export liftFile="`pwd`/GCF.to.GCA.lift"
+
+    doNcbiRefSeq.pl -buildDir=`pwd` -assemblyHub \\
+      -liftFile="${{liftFile}}" \\
+      -target2bit="{gBuildPath}/{genBankAsmId}.2bit" "${{srcAsm}}" "${{targetAsm}}" \\
+         > do.log 2>&1
+
+    cd {gBuildPath}
+    ./doTrackDb.bash
+
+    """
+      doCmd.write(f"{outStr}")
+
+    print(f"running:\n{runSh}")
+    os.chmod(runSh, 0o775)
+
+    stdOut, stdErr = subProcess([runSh])
+    print(f"output from run.sh:\n{stdOut}\n{stdErr}")
+
+    sys.exit(0)
+
+if __name__ == "__main__":
+  main()
+
+# head -3 /hive/data/genomes/asmHubs/refseqBuild/GCF/029/281/585/GCF_029281585.2_NHGRI_mGorGor1-v2.0_pri/trackData/chromAlias/GCF_029281585.2_NHGRI_mGorGor1-v2.0_pri.chromAlias.txt
+# # refseq        assembly        genbank ncbi    ucsc
+# NC_011120.1     MT              MT      chrM
+# NC_073224.2     chr1_pat_hsa1   CM055446.2      1       chr1
+# [hiram@hgwdev /cluster/home/hiram/kent/src/hg/makeDb/doc/asmHubs] head -3 /hive/data/genomes/asmHubs/refseqBuild/GCF/029/281/585/GCF_029281585.2_NHGRI_mGorGor1-v2.0_pri/trackData/chromAlias/GCF_029281585.2_NHGRI_mGorGor1-v2.0_pri.chromAlias.bed
+# NC_011120.1     0       16412   NC_011120.1     MT              MT      chrM
+# NC_073224.2     0       243847345       NC_073224.2     chr1_pat_hsa1   CM055446.2      1       chr1
+# NC_073227.2     0       215373162       NC_073227.2     chr3_pat_hsa4   CM055449.2      3       chr3
+#