742f49971278e8b13c9d6bfef8be97263eae84d1 jnavarr5 Tue Nov 25 16:44:28 2025 -0800 Updating the pairLastzWrapper.py script so it can generate liftOver files for the older assemblies. Previously, you would get a message saying the assembly was suppressed since it could not determine the clade using the file given (all.*.today), refs #36712 #33200 diff --git src/utils/qa/pairLastzWrapper.py src/utils/qa/pairLastzWrapper.py index e6c8e55d2ae..8fd0ec94a9c 100755 --- src/utils/qa/pairLastzWrapper.py +++ src/utils/qa/pairLastzWrapper.py @@ -24,31 +24,31 @@ import argparse from datetime import datetime # Creates an arguement passer parser = argparse.ArgumentParser(description="A program that determines the target, query, clades, and full GC assembly hub name to run the pairLastz script") # Adds arguemets by calling the arguement passer, for assembly one parser.add_argument("-a1", "--assembly_one", help="Specify assembly one. Ex. hg38 or GCF_001704415.1", required=True) # Adds arguemets by calling the arguement passer, for assembly two parser.add_argument("-a2", "--assembly_two", help="Specify assembly two Ex. hg38 or GCF_001704415.1", required=True) # Parses arguemets through using the parse args method. args = parser.parse_args() -validClades = ['primate', 'mammal'] +validClades = {'primate': ['primates','primates(L)'], 'mammal': ['mammals','mammals(L)']} #List of prefix names of the native primate databases primates=['hg','panTro', 'panPan', 'gorGor', 'ponAbe', 'nomLeu', 'chlSab', 'macFas', 'rheMac', 'papAnu', 'papHam', 'nasLar', 'rhiRox', 'calJac', 'saiBol', 'tarSyr'] def bash(cmd): """Input bash cmd and return stdout""" rawOutput = subprocess.run(cmd,check=True, shell=True, stdout=subprocess.PIPE, universal_newlines=True) return(rawOutput.stdout.split('\n')[0:-1]) def bash_v2(cmd): """Input bash cmd and return stdout plus stderr""" rawOutput = subprocess.run(cmd,check=True, shell=True, stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT) return(rawOutput.stdout) def goto(gc): @@ -65,39 +65,44 @@ cdDir = bash("ls -d "+destDir+"/"+gc+"*") except subprocess.CalledProcessError: sys.stdout = sys.__stdout__ print('# can not find '+destDir) quit() cdDir = str(cdDir)[1:-1] gcName = cdDir.split('//')[1][:-1] return gcName def getClade(assembly): """Input assembly and return clade of assembly""" if assembly[0:3]=='GCA' or assembly[0:3]=='GCF': chromSizes_1="/hive/data/genomes/asmHubs/"+assembly[0:3]+"/"+assembly[4:7]+"/"+assembly[7:10]+"/"+assembly[10:13]+"/"+assembly+"/"+assembly+".chrom.sizes.txt" assembly=goto(assembly) - find_gcNum=bash("grep "+assembly+" /hive/data/outside/ncbi/genomes/reports/newAsm/all.*.today | head -1") - find_gcNum=str(find_gcNum)[1:-1] + assembly_version = "_".join(assembly.split('_', 2)[:2]) # Get the assembly ID, e.g. GCF_016772045.1 + find_gcNum=bash(f"grep {assembly_version} /hive/data/genomes/asmHubs/UCSC_GI.assemblyHubList.txt")[0] #Bash returns a list, so get the first item + try: - clade=find_gcNum.split('.')[2][:-1] + clade=find_gcNum.split('\t')[5] # Value is tab-separated, the clade is the last item except IndexError: print('# can not find '+assembly+', the assembly might be suppressed') quit() if clade in validClades: return clade + for singular, plural in validClades.items(): + if clade in plural: + return singular # check to see if 'mammals' or 'mammal(L)' is given. If so return 'mammal' + else: clade='other' else: chromSizes_1="/hive/data/genomes/"+assembly+"/chrom.sizes" try: bash('ls '+chromSizes_1) #print(bash('ls '+chromSizes_1)) except subprocess.CalledProcessError: sys.stdout = sys.__stdout__ print('ls: cannot access '+chromSizes_1+': No such file or directory') quit() digit_pos= re.search(r"\d", assembly) db_edit = assembly[0:digit_pos.start()] if db_edit in primates: clade='primate' else: