d258476e0cc734212538e8ab0a7509acb8b24aaa
hiram
  Fri May 24 14:10:17 2024 -0700
remove cruft no redmine

diff --git src/hg/utils/automation/refSeqOnGenBank.py src/hg/utils/automation/refSeqOnGenBank.py
index 7689753..890e329 100755
--- src/hg/utils/automation/refSeqOnGenBank.py
+++ src/hg/utils/automation/refSeqOnGenBank.py
@@ -1,194 +1,187 @@
 #!/usr/bin/env python3
 
 ### refSeqOnGenBank.py
 ### take RefSeq genes from a GCF refseq assembly and add them as tracks
 ### to the equivalent GCA genbank assembly
 
 ### also used is a botique module 'icecream' which is not typically
 ### available.
 
 import sys
 import os
 import re
 import subprocess
 import csv
 
 #############################################################################
 ### helper functions
 
 ### additional error handling could be done here to output information
 ### in case of script crashing with subprocess.CalledProcessError
 ### maybe later
 def subProcess(cmd):
   if sys.version_info >= (3, 7):
     rtn = subprocess.run(cmd, stdout=subprocess.PIPE, text=True)
   else:
     rtn = subprocess.run(cmd, stdout=subprocess.PIPE, universal_newlines=True)
   if rtn.returncode != 0:
     print(f"ERROR: subProcess({cmd})")
     sys.exit(-1)
 
   return rtn.stdout.strip(), rtn.stderr
   # the strip() removes any leading or trailing whitespace
   # use just rtn.stdout if you want the precise output
 
 def buildPath(asmId):
   gcX = asmId[0:3]
   d0 = asmId[4:7]
   d1 = asmId[7:10]
   d2 = asmId[10:13]
   tP = "/hive/data/genomes/asmHubs/allBuild/" + "/".join([gcX,d0,d1,d2,asmId])
   buildPath, stdErr = subProcess(["realpath", tP])
   if not os.path.isdir(buildPath):
     print(f"ERROR: can not find build directory {tP}")
     sys.exit(-1)
   return buildPath
 
 #############################################################################
 
 # Check if the Python version is at least 3.6  (due to use of f-strings here)
 if sys.version_info < (3, 6):
     print("This script requires Python 3.6 or higher.")
     sys.exit(1)
 
 hostEnv, stdErr = subProcess(["uname", "-n"])
 ### to use all the special stuff when hgwdev /usr/bin/python3 is used
 ### this allows it to find the icecream module
 if re.match(r'hgwdev*', hostEnv):
   sys.path.append("/cluster/software/lib/python3.6/site-packages")
   
 ### icecream is a convenient debug printout system using the ic() function
 from icecream import install
 install()
 ### ic(someArg) will print out the name of the argument and its contents
 ###             for any type of object
 ic(sys.argv)
 
 # Check if the correct number of command line arguments is provided
 if len(sys.argv) != 3:
     print("Usage: refSeqOnGenBank.py <refSeqAsmId> <genBankAsmId>")
     print("will add the RefSeq gene track to the GenBank assembly")
     print("when the refSeq genes exist on that refSeq assembly")
     print(" e.g.:\nrefSeqOnGenBank.py GCF_029281585.2_NHGRI_mGorGor1-v2.0_pri \\\n\tGCA_029281585.2_NHGRI_mGorGor1-v2.0_pri")
     sys.exit(-1)
 
 def main():
     refSeqAsmId = sys.argv[1]
     genBankAsmId = sys.argv[2]
 
     if not refSeqAsmId.startswith("GCF_"):
       print(f"ERROR: refSeqAsmId does not start with GCF_: {refSeqAsmId}")
       sys.exit(-1)
 
     if not genBankAsmId.startswith("GCA_"):
       print(f"ERROR: genBankAsmId does not start with GCA_: {genBankAsmId}")
       sys.exit(-1)
 
     # obtain the directories where the assembly build exists
     rBuildPath = buildPath(refSeqAsmId)
     ic(rBuildPath)
     gBuildPath = buildPath(genBankAsmId)
     ic(gBuildPath)
 
     # verify refSeq gene track is present on the RefSeq assembly
     refSeqBb = rBuildPath + "/trackData/ncbiRefSeq/" + refSeqAsmId + ".ncbiRefSeq.bb"
 
     if not os.path.isfile(refSeqBb):
       print(f"ERROR: refSeq gene track not present on {refSeqAsmId}")
       print(f"{refSeqBb}")
       sys.exit(-1)
 
     genBankWorkDir = gBuildPath + "/trackData/ncbiRefSeq"
     if os.path.isdir(genBankWorkDir):
       print(f"ERROR: genBank ncbiRefSeq directory already exists")
       print(f"{genBankWorkDir}")
       sys.exit(-1)
 
     os.makedirs(genBankWorkDir, exist_ok=True)
     print(f"working in {genBankWorkDir}", file=sys.stderr)
     gcfToGcaLift = genBankWorkDir + "/GCF.to.GCA.lift"
 
     chromAlias = rBuildPath + "/trackData/chromAlias/" + refSeqAsmId + ".chromAlias.txt"
     ic(chromAlias)
     aliasBed = rBuildPath + "/trackData/chromAlias/" + refSeqAsmId + ".chromAlias.bed"
     ic(aliasBed)
 
     aliasHeaderLine, stdErr = subProcess(["head", "-1", chromAlias])
     ic(aliasHeaderLine)
     aliasFieldList = aliasHeaderLine.split('\t')
     # this genbankIndex is the column in the bedAlias file, hence the + 3
     try:
       genbankIndex = aliasFieldList.index("genbank") + 3
     except ValueError:
       print(f"ERROR: can not find 'genbank' in alias field list")
       sys.exit(-1)
 
     if 3 == genbankIndex:
       print(f"ERROR: found 'genbank' in alias list field as first one ?")
       sys.exit(-1)
 
     ic(genbankIndex)
 
     aliasLines = 0
     liftLines = 0
     with open(gcfToGcaLift, 'w') as gcfToGca:
       with open(aliasBed, 'r') as bedAlias:
         lines = csv.reader(bedAlias, delimiter='\t')
         for line in lines:
           aliasLines += 1
           if len(line[genbankIndex]) > 0:
             gcfToGca.write(f"{line[1]}\t{line[0]}\t{line[2]}\t{line[genbankIndex]}\t{line[2]}\n")
             liftLines += 1
     #        print(f"{line[1]}\t{line[0]}\t{line[2]}\t{line[genbankIndex]}\t{line[2]}")
 
     print(f"written\n{gcfToGcaLift}", file=sys.stderr)
 
     lostLines = aliasLines - liftLines
     if lostLines > 2:
       print(f"ERROR: more than two sequence aliases do not match")
       print(f"not matched: {lostLines} = {aliasLines} - {liftLines}")
       sys.exit(-1)
 
     print(f"# refseq sequence count {aliasLines}, genbank sequence count {liftLines}", file=sys.stderr)
 
     runSh = genBankWorkDir + "/run.sh"
     with open(runSh, 'w') as doCmd:
       outStr = f"""#!/bin/bash
     set -beEu -o pipefail
 
     cd {genBankWorkDir}
     export srcAsm="{refSeqAsmId}"
     export targetAsm="{genBankAsmId}"
     export liftFile="`pwd`/GCF.to.GCA.lift"
 
     doNcbiRefSeq.pl -buildDir=`pwd` -assemblyHub \\
       -liftFile="${{liftFile}}" \\
       -target2bit="{gBuildPath}/{genBankAsmId}.2bit" "${{srcAsm}}" "${{targetAsm}}" \\
          > do.log 2>&1
 
     cd {gBuildPath}
     ./doTrackDb.bash
 
     """
       doCmd.write(f"{outStr}")
 
     print(f"running:\n{runSh}")
     os.chmod(runSh, 0o775)
 
     stdOut, stdErr = subProcess([runSh])
     print(f"output from run.sh:\n{stdOut}\n{stdErr}")
 
     sys.exit(0)
 
+### this construct prevents main() from being parsed and syntax checked
+### before the version_info < 3.6 safety check
+
 if __name__ == "__main__":
   main()
-
-# head -3 /hive/data/genomes/asmHubs/refseqBuild/GCF/029/281/585/GCF_029281585.2_NHGRI_mGorGor1-v2.0_pri/trackData/chromAlias/GCF_029281585.2_NHGRI_mGorGor1-v2.0_pri.chromAlias.txt
-# # refseq        assembly        genbank ncbi    ucsc
-# NC_011120.1     MT              MT      chrM
-# NC_073224.2     chr1_pat_hsa1   CM055446.2      1       chr1
-# [hiram@hgwdev /cluster/home/hiram/kent/src/hg/makeDb/doc/asmHubs] head -3 /hive/data/genomes/asmHubs/refseqBuild/GCF/029/281/585/GCF_029281585.2_NHGRI_mGorGor1-v2.0_pri/trackData/chromAlias/GCF_029281585.2_NHGRI_mGorGor1-v2.0_pri.chromAlias.bed
-# NC_011120.1     0       16412   NC_011120.1     MT              MT      chrM
-# NC_073224.2     0       243847345       NC_073224.2     chr1_pat_hsa1   CM055446.2      1       chr1
-# NC_073227.2     0       215373162       NC_073227.2     chr3_pat_hsa4   CM055449.2      3       chr3
-#