src/utils/qa/pairLastzWrapper.py 742f49971278e8b13c9d6bfef8be97263eae84d1

742f49971278e8b13c9d6bfef8be97263eae84d1
jnavarr5
  Tue Nov 25 16:44:28 2025 -0800
Updating the pairLastzWrapper.py script so it can generate liftOver files for the older assemblies. Previously, you would get a message saying the assembly was suppressed since it could not determine the clade using the file given (all.*.today), refs #36712 #33200

diff --git src/utils/qa/pairLastzWrapper.py src/utils/qa/pairLastzWrapper.py
index e6c8e55d2ae..8fd0ec94a9c 100755
--- src/utils/qa/pairLastzWrapper.py
+++ src/utils/qa/pairLastzWrapper.py
@@ -24,31 +24,31 @@
 import argparse
 from datetime import datetime
 
 # Creates an arguement passer
 parser = argparse.ArgumentParser(description="A program that determines the target, query, clades, and full GC assembly hub name to run the pairLastz script")
 
 # Adds arguemets by calling the arguement passer, for assembly one
 parser.add_argument("-a1", "--assembly_one", help="Specify assembly one. Ex. hg38 or GCF_001704415.1", required=True)
 
 # Adds arguemets by calling the arguement passer, for assembly two
 parser.add_argument("-a2", "--assembly_two", help="Specify assembly two Ex. hg38 or GCF_001704415.1", required=True)
 
 # Parses arguemets through using the parse args method.
 args = parser.parse_args()
 
-validClades = ['primate', 'mammal']
+validClades = {'primate': ['primates','primates(L)'], 'mammal': ['mammals','mammals(L)']}
 
 #List of prefix names of the native primate databases
 primates=['hg','panTro', 'panPan', 'gorGor', 'ponAbe', 'nomLeu', 'chlSab', 'macFas', 'rheMac', 'papAnu', 'papHam', 'nasLar', 'rhiRox', 'calJac', 'saiBol', 'tarSyr']
 
 def bash(cmd):
     """Input bash cmd and return stdout"""
     rawOutput = subprocess.run(cmd,check=True, shell=True, stdout=subprocess.PIPE, universal_newlines=True)
     return(rawOutput.stdout.split('\n')[0:-1])
 
 def bash_v2(cmd):
     """Input bash cmd and return stdout plus stderr"""
     rawOutput = subprocess.run(cmd,check=True, shell=True, stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
     return(rawOutput.stdout)
 
 def goto(gc):
@@ -65,39 +65,44 @@
       cdDir = bash("ls -d "+destDir+"/"+gc+"*")
     except subprocess.CalledProcessError:
       sys.stdout = sys.__stdout__
       print('# can not find '+destDir)
       quit()
     cdDir = str(cdDir)[1:-1]
     gcName = cdDir.split('//')[1][:-1]
     return gcName
 
 
 def getClade(assembly):
     """Input assembly and return clade of assembly"""
     if assembly[0:3]=='GCA' or assembly[0:3]=='GCF':
         chromSizes_1="/hive/data/genomes/asmHubs/"+assembly[0:3]+"/"+assembly[4:7]+"/"+assembly[7:10]+"/"+assembly[10:13]+"/"+assembly+"/"+assembly+".chrom.sizes.txt"
         assembly=goto(assembly)
-        find_gcNum=bash("grep "+assembly+" /hive/data/outside/ncbi/genomes/reports/newAsm/all.*.today | head -1")
-        find_gcNum=str(find_gcNum)[1:-1]
+        assembly_version = "_".join(assembly.split('_', 2)[:2]) # Get the assembly ID, e.g. GCF_016772045.1
+        find_gcNum=bash(f"grep {assembly_version} /hive/data/genomes/asmHubs/UCSC_GI.assemblyHubList.txt")[0] #Bash returns a list, so get the first item
+
         try:
-               clade=find_gcNum.split('.')[2][:-1]
+               clade=find_gcNum.split('\t')[5] # Value is tab-separated, the clade is the last item
         except IndexError:
                print('# can not find '+assembly+', the assembly might be suppressed')
                quit()
         if clade in validClades:
             return clade
+        for singular, plural in validClades.items():
+            if clade in plural:
+                return singular # check to see if 'mammals' or 'mammal(L)' is given. If so return 'mammal'
+
         else: clade='other'
     else:
         chromSizes_1="/hive/data/genomes/"+assembly+"/chrom.sizes"
         try:
                 bash('ls '+chromSizes_1)
                 #print(bash('ls '+chromSizes_1))
         except subprocess.CalledProcessError:
                 sys.stdout = sys.__stdout__
                 print('ls: cannot access '+chromSizes_1+': No such file or directory')
                 quit()
         digit_pos= re.search(r"\d", assembly)
         db_edit = assembly[0:digit_pos.start()]
         if db_edit in primates:
             clade='primate'
         else: