8c99a44e8855b4d78a741f91c02abc603c7ca7ca hiram Fri May 24 14:29:24 2024 -0700 the version check stuff does not actually work diff --git src/hg/utils/automation/refSeqOnGenBank.py src/hg/utils/automation/refSeqOnGenBank.py index 890e329..115b233 100755 --- src/hg/utils/automation/refSeqOnGenBank.py +++ src/hg/utils/automation/refSeqOnGenBank.py @@ -1,187 +1,176 @@ #!/usr/bin/env python3 ### refSeqOnGenBank.py ### take RefSeq genes from a GCF refseq assembly and add them as tracks ### to the equivalent GCA genbank assembly ### also used is a botique module 'icecream' which is not typically ### available. import sys import os import re import subprocess import csv ############################################################################# ### helper functions ### additional error handling could be done here to output information ### in case of script crashing with subprocess.CalledProcessError ### maybe later def subProcess(cmd): if sys.version_info >= (3, 7): rtn = subprocess.run(cmd, stdout=subprocess.PIPE, text=True) else: rtn = subprocess.run(cmd, stdout=subprocess.PIPE, universal_newlines=True) if rtn.returncode != 0: print(f"ERROR: subProcess({cmd})") sys.exit(-1) return rtn.stdout.strip(), rtn.stderr # the strip() removes any leading or trailing whitespace # use just rtn.stdout if you want the precise output def buildPath(asmId): gcX = asmId[0:3] d0 = asmId[4:7] d1 = asmId[7:10] d2 = asmId[10:13] tP = "/hive/data/genomes/asmHubs/allBuild/" + "/".join([gcX,d0,d1,d2,asmId]) buildPath, stdErr = subProcess(["realpath", tP]) if not os.path.isdir(buildPath): print(f"ERROR: can not find build directory {tP}") sys.exit(-1) return buildPath ############################################################################# -# Check if the Python version is at least 3.6 (due to use of f-strings here) -if sys.version_info < (3, 6): - print("This script requires Python 3.6 or higher.") - sys.exit(1) - hostEnv, stdErr = subProcess(["uname", "-n"]) ### to use all the special stuff when hgwdev /usr/bin/python3 is used ### this allows it to find the icecream module if re.match(r'hgwdev*', hostEnv): sys.path.append("/cluster/software/lib/python3.6/site-packages") ### icecream is a convenient debug printout system using the ic() function from icecream import install install() ### ic(someArg) will print out the name of the argument and its contents ### for any type of object ic(sys.argv) +ic(sys.version_info) # Check if the correct number of command line arguments is provided if len(sys.argv) != 3: print("Usage: refSeqOnGenBank.py <refSeqAsmId> <genBankAsmId>") print("will add the RefSeq gene track to the GenBank assembly") print("when the refSeq genes exist on that refSeq assembly") print(" e.g.:\nrefSeqOnGenBank.py GCF_029281585.2_NHGRI_mGorGor1-v2.0_pri \\\n\tGCA_029281585.2_NHGRI_mGorGor1-v2.0_pri") sys.exit(-1) -def main(): refSeqAsmId = sys.argv[1] genBankAsmId = sys.argv[2] if not refSeqAsmId.startswith("GCF_"): print(f"ERROR: refSeqAsmId does not start with GCF_: {refSeqAsmId}") sys.exit(-1) if not genBankAsmId.startswith("GCA_"): print(f"ERROR: genBankAsmId does not start with GCA_: {genBankAsmId}") sys.exit(-1) # obtain the directories where the assembly build exists rBuildPath = buildPath(refSeqAsmId) ic(rBuildPath) gBuildPath = buildPath(genBankAsmId) ic(gBuildPath) # verify refSeq gene track is present on the RefSeq assembly refSeqBb = rBuildPath + "/trackData/ncbiRefSeq/" + refSeqAsmId + ".ncbiRefSeq.bb" if not os.path.isfile(refSeqBb): print(f"ERROR: refSeq gene track not present on {refSeqAsmId}") print(f"{refSeqBb}") sys.exit(-1) genBankWorkDir = gBuildPath + "/trackData/ncbiRefSeq" if os.path.isdir(genBankWorkDir): print(f"ERROR: genBank ncbiRefSeq directory already exists") print(f"{genBankWorkDir}") sys.exit(-1) os.makedirs(genBankWorkDir, exist_ok=True) print(f"working in {genBankWorkDir}", file=sys.stderr) gcfToGcaLift = genBankWorkDir + "/GCF.to.GCA.lift" chromAlias = rBuildPath + "/trackData/chromAlias/" + refSeqAsmId + ".chromAlias.txt" ic(chromAlias) aliasBed = rBuildPath + "/trackData/chromAlias/" + refSeqAsmId + ".chromAlias.bed" ic(aliasBed) aliasHeaderLine, stdErr = subProcess(["head", "-1", chromAlias]) ic(aliasHeaderLine) aliasFieldList = aliasHeaderLine.split('\t') # this genbankIndex is the column in the bedAlias file, hence the + 3 try: genbankIndex = aliasFieldList.index("genbank") + 3 except ValueError: print(f"ERROR: can not find 'genbank' in alias field list") sys.exit(-1) if 3 == genbankIndex: print(f"ERROR: found 'genbank' in alias list field as first one ?") sys.exit(-1) ic(genbankIndex) aliasLines = 0 liftLines = 0 with open(gcfToGcaLift, 'w') as gcfToGca: with open(aliasBed, 'r') as bedAlias: lines = csv.reader(bedAlias, delimiter='\t') for line in lines: aliasLines += 1 if len(line[genbankIndex]) > 0: gcfToGca.write(f"{line[1]}\t{line[0]}\t{line[2]}\t{line[genbankIndex]}\t{line[2]}\n") liftLines += 1 # print(f"{line[1]}\t{line[0]}\t{line[2]}\t{line[genbankIndex]}\t{line[2]}") print(f"written\n{gcfToGcaLift}", file=sys.stderr) lostLines = aliasLines - liftLines if lostLines > 2: print(f"ERROR: more than two sequence aliases do not match") print(f"not matched: {lostLines} = {aliasLines} - {liftLines}") sys.exit(-1) print(f"# refseq sequence count {aliasLines}, genbank sequence count {liftLines}", file=sys.stderr) runSh = genBankWorkDir + "/run.sh" with open(runSh, 'w') as doCmd: outStr = f"""#!/bin/bash set -beEu -o pipefail cd {genBankWorkDir} export srcAsm="{refSeqAsmId}" export targetAsm="{genBankAsmId}" export liftFile="`pwd`/GCF.to.GCA.lift" doNcbiRefSeq.pl -buildDir=`pwd` -assemblyHub \\ -liftFile="${{liftFile}}" \\ -target2bit="{gBuildPath}/{genBankAsmId}.2bit" "${{srcAsm}}" "${{targetAsm}}" \\ > do.log 2>&1 cd {gBuildPath} ./doTrackDb.bash """ doCmd.write(f"{outStr}") print(f"running:\n{runSh}") os.chmod(runSh, 0o775) stdOut, stdErr = subProcess([runSh]) print(f"output from run.sh:\n{stdOut}\n{stdErr}") sys.exit(0) - -### this construct prevents main() from being parsed and syntax checked -### before the version_info < 3.6 safety check - -if __name__ == "__main__": - main()