d258476e0cc734212538e8ab0a7509acb8b24aaa hiram Fri May 24 14:10:17 2024 -0700 remove cruft no redmine diff --git src/hg/utils/automation/refSeqOnGenBank.py src/hg/utils/automation/refSeqOnGenBank.py index 7689753..890e329 100755 --- src/hg/utils/automation/refSeqOnGenBank.py +++ src/hg/utils/automation/refSeqOnGenBank.py @@ -1,194 +1,187 @@ #!/usr/bin/env python3 ### refSeqOnGenBank.py ### take RefSeq genes from a GCF refseq assembly and add them as tracks ### to the equivalent GCA genbank assembly ### also used is a botique module 'icecream' which is not typically ### available. import sys import os import re import subprocess import csv ############################################################################# ### helper functions ### additional error handling could be done here to output information ### in case of script crashing with subprocess.CalledProcessError ### maybe later def subProcess(cmd): if sys.version_info >= (3, 7): rtn = subprocess.run(cmd, stdout=subprocess.PIPE, text=True) else: rtn = subprocess.run(cmd, stdout=subprocess.PIPE, universal_newlines=True) if rtn.returncode != 0: print(f"ERROR: subProcess({cmd})") sys.exit(-1) return rtn.stdout.strip(), rtn.stderr # the strip() removes any leading or trailing whitespace # use just rtn.stdout if you want the precise output def buildPath(asmId): gcX = asmId[0:3] d0 = asmId[4:7] d1 = asmId[7:10] d2 = asmId[10:13] tP = "/hive/data/genomes/asmHubs/allBuild/" + "/".join([gcX,d0,d1,d2,asmId]) buildPath, stdErr = subProcess(["realpath", tP]) if not os.path.isdir(buildPath): print(f"ERROR: can not find build directory {tP}") sys.exit(-1) return buildPath ############################################################################# # Check if the Python version is at least 3.6 (due to use of f-strings here) if sys.version_info < (3, 6): print("This script requires Python 3.6 or higher.") sys.exit(1) hostEnv, stdErr = subProcess(["uname", "-n"]) ### to use all the special stuff when hgwdev /usr/bin/python3 is used ### this allows it to find the icecream module if re.match(r'hgwdev*', hostEnv): sys.path.append("/cluster/software/lib/python3.6/site-packages") ### icecream is a convenient debug printout system using the ic() function from icecream import install install() ### ic(someArg) will print out the name of the argument and its contents ### for any type of object ic(sys.argv) # Check if the correct number of command line arguments is provided if len(sys.argv) != 3: print("Usage: refSeqOnGenBank.py <refSeqAsmId> <genBankAsmId>") print("will add the RefSeq gene track to the GenBank assembly") print("when the refSeq genes exist on that refSeq assembly") print(" e.g.:\nrefSeqOnGenBank.py GCF_029281585.2_NHGRI_mGorGor1-v2.0_pri \\\n\tGCA_029281585.2_NHGRI_mGorGor1-v2.0_pri") sys.exit(-1) def main(): refSeqAsmId = sys.argv[1] genBankAsmId = sys.argv[2] if not refSeqAsmId.startswith("GCF_"): print(f"ERROR: refSeqAsmId does not start with GCF_: {refSeqAsmId}") sys.exit(-1) if not genBankAsmId.startswith("GCA_"): print(f"ERROR: genBankAsmId does not start with GCA_: {genBankAsmId}") sys.exit(-1) # obtain the directories where the assembly build exists rBuildPath = buildPath(refSeqAsmId) ic(rBuildPath) gBuildPath = buildPath(genBankAsmId) ic(gBuildPath) # verify refSeq gene track is present on the RefSeq assembly refSeqBb = rBuildPath + "/trackData/ncbiRefSeq/" + refSeqAsmId + ".ncbiRefSeq.bb" if not os.path.isfile(refSeqBb): print(f"ERROR: refSeq gene track not present on {refSeqAsmId}") print(f"{refSeqBb}") sys.exit(-1) genBankWorkDir = gBuildPath + "/trackData/ncbiRefSeq" if os.path.isdir(genBankWorkDir): print(f"ERROR: genBank ncbiRefSeq directory already exists") print(f"{genBankWorkDir}") sys.exit(-1) os.makedirs(genBankWorkDir, exist_ok=True) print(f"working in {genBankWorkDir}", file=sys.stderr) gcfToGcaLift = genBankWorkDir + "/GCF.to.GCA.lift" chromAlias = rBuildPath + "/trackData/chromAlias/" + refSeqAsmId + ".chromAlias.txt" ic(chromAlias) aliasBed = rBuildPath + "/trackData/chromAlias/" + refSeqAsmId + ".chromAlias.bed" ic(aliasBed) aliasHeaderLine, stdErr = subProcess(["head", "-1", chromAlias]) ic(aliasHeaderLine) aliasFieldList = aliasHeaderLine.split('\t') # this genbankIndex is the column in the bedAlias file, hence the + 3 try: genbankIndex = aliasFieldList.index("genbank") + 3 except ValueError: print(f"ERROR: can not find 'genbank' in alias field list") sys.exit(-1) if 3 == genbankIndex: print(f"ERROR: found 'genbank' in alias list field as first one ?") sys.exit(-1) ic(genbankIndex) aliasLines = 0 liftLines = 0 with open(gcfToGcaLift, 'w') as gcfToGca: with open(aliasBed, 'r') as bedAlias: lines = csv.reader(bedAlias, delimiter='\t') for line in lines: aliasLines += 1 if len(line[genbankIndex]) > 0: gcfToGca.write(f"{line[1]}\t{line[0]}\t{line[2]}\t{line[genbankIndex]}\t{line[2]}\n") liftLines += 1 # print(f"{line[1]}\t{line[0]}\t{line[2]}\t{line[genbankIndex]}\t{line[2]}") print(f"written\n{gcfToGcaLift}", file=sys.stderr) lostLines = aliasLines - liftLines if lostLines > 2: print(f"ERROR: more than two sequence aliases do not match") print(f"not matched: {lostLines} = {aliasLines} - {liftLines}") sys.exit(-1) print(f"# refseq sequence count {aliasLines}, genbank sequence count {liftLines}", file=sys.stderr) runSh = genBankWorkDir + "/run.sh" with open(runSh, 'w') as doCmd: outStr = f"""#!/bin/bash set -beEu -o pipefail cd {genBankWorkDir} export srcAsm="{refSeqAsmId}" export targetAsm="{genBankAsmId}" export liftFile="`pwd`/GCF.to.GCA.lift" doNcbiRefSeq.pl -buildDir=`pwd` -assemblyHub \\ -liftFile="${{liftFile}}" \\ -target2bit="{gBuildPath}/{genBankAsmId}.2bit" "${{srcAsm}}" "${{targetAsm}}" \\ > do.log 2>&1 cd {gBuildPath} ./doTrackDb.bash """ doCmd.write(f"{outStr}") print(f"running:\n{runSh}") os.chmod(runSh, 0o775) stdOut, stdErr = subProcess([runSh]) print(f"output from run.sh:\n{stdOut}\n{stdErr}") sys.exit(0) +### this construct prevents main() from being parsed and syntax checked +### before the version_info < 3.6 safety check + if __name__ == "__main__": main() - -# head -3 /hive/data/genomes/asmHubs/refseqBuild/GCF/029/281/585/GCF_029281585.2_NHGRI_mGorGor1-v2.0_pri/trackData/chromAlias/GCF_029281585.2_NHGRI_mGorGor1-v2.0_pri.chromAlias.txt -# # refseq assembly genbank ncbi ucsc -# NC_011120.1 MT MT chrM -# NC_073224.2 chr1_pat_hsa1 CM055446.2 1 chr1 -# [hiram@hgwdev /cluster/home/hiram/kent/src/hg/makeDb/doc/asmHubs] head -3 /hive/data/genomes/asmHubs/refseqBuild/GCF/029/281/585/GCF_029281585.2_NHGRI_mGorGor1-v2.0_pri/trackData/chromAlias/GCF_029281585.2_NHGRI_mGorGor1-v2.0_pri.chromAlias.bed -# NC_011120.1 0 16412 NC_011120.1 MT MT chrM -# NC_073224.2 0 243847345 NC_073224.2 chr1_pat_hsa1 CM055446.2 1 chr1 -# NC_073227.2 0 215373162 NC_073227.2 chr3_pat_hsa4 CM055449.2 3 chr3 -#