269136d8d81138891ee5df1ab47e5e6b42676bed
gperez2
  Wed Mar 23 10:46:18 2022 -0700
Adding a wrapper script for pairLastz.sh, refs #23122

diff --git src/utils/qa/pairLastzWrapper.py src/utils/qa/pairLastzWrapper.py
new file mode 100644
index 0000000..2d3636c
--- /dev/null
+++ src/utils/qa/pairLastzWrapper.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python
+
+# Program Header
+# Name:   Gerardo Perez
+# Description: A program that determines the target, query, clades, and full GC assembly hub name to run the pairLastz script
+#
+#
+#
+# pairLastzWrapper
+#
+#
+# Development Environment: VIM - Vi IMproved version 7.4.629
+# Version: Python 3.6.5 
+
+# Imports module
+import os
+import getpass
+import sys
+import re
+import json
+import io
+import requests
+import subprocess
+import argparse
+from datetime import datetime
+
+# Creates an arguement passer
+parser = argparse.ArgumentParser(description="A program that determines the target, query, clades, and full GC assembly hub name to run the pairLastz script")
+
+# Adds arguemets by calling the arguement passer, for assembly one
+parser.add_argument("-a1", "--assembly_one", help="Specify assembly one. Ex. hg38 or GCF_001704415.1", required=True)
+
+# Adds arguemets by calling the arguement passer, for assembly two
+parser.add_argument("-a2", "--assembly_two", help="Specify assembly two Ex. hg38 or GCF_001704415.1", required=True)
+
+# Parses arguemets through using the parse args method.
+args = parser.parse_args()
+
+validClades = ['primate', 'mammal']
+
+#List of prefix names of the native primate databases
+primates=['hg','panTro', 'panPan', 'gorGor', 'ponAbe', 'nomLeu', 'chlSab', 'macFas', 'rheMac', 'papAnu', 'papHam', 'nasLar', 'rhiRox', 'calJac', 'saiBol', 'tarSyr']
+
+def bash(cmd):
+    """Input bash cmd and return stdout"""
+    rawOutput = subprocess.run(cmd,check=True, shell=True, stdout=subprocess.PIPE, universal_newlines=True)
+    return(rawOutput.stdout.split('\n')[0:-1])
+
+def bash_v2(cmd):
+    """Input bash cmd and return stdout plus stderr"""
+    rawOutput = subprocess.run(cmd,check=True, shell=True, stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
+    return(rawOutput.stdout)
+
+def goto(gc):
+    """Input GC identifier and return full GC assembly hub name"""
+    gcX=gc[0:3]
+    d0=gc[4:7]
+    d1=gc[7:10]
+    d2=gc[10:13]
+    if gcX == "GCF":
+      destDir="/hive/data/genomes/asmHubs/refseqBuild/"+gcX+"/"+d0+"/"+d1+"/"+d2+"/"
+    else:
+      destDir="/hive/data/genomes/asmHubs/genbankBuild/"+gcX+"/"+d0+"/"+d1+"/"+d2+"/"
+    try:
+      cdDir = bash("ls -d "+destDir+"/"+gc+"*")
+    except subprocess.CalledProcessError:
+      sys.stdout = sys.__stdout__
+      print('# can not find '+destDir)
+      quit()
+    cdDir = str(cdDir)[1:-1]
+    gcName = cdDir.split('//')[1][:-1]
+    return gcName
+
+
+def getClade(assembly):
+    """Input assembly and return clade of assembly"""
+    if assembly[0:3]=='GCA' or assembly[0:3]=='GCF':
+        chromSizes_1="/hive/data/genomes/asmHubs/"+assembly[0:3]+"/"+assembly[4:7]+"/"+assembly[7:10]+"/"+assembly[10:13]+"/"+assembly+"/"+assembly+".chrom.sizes.txt"
+        assembly=goto(assembly)
+        find_gcNum=bash("grep "+assembly+" /hive/data/outside/ncbi/genomes/reports/newAsm/all.*.today | head -1")
+        find_gcNum=str(find_gcNum)[1:-1]
+        try:
+               clade=find_gcNum.split('.')[2][:-1]
+        except IndexError:
+               print('# can not find '+assembly+', the assembly might be suppressed')
+               quit()
+        if clade in validClades:
+            return clade
+        else: clade='other'
+    else:
+        chromSizes_1="/hive/data/genomes/"+assembly+"/chrom.sizes"
+        try:
+                bash('ls '+chromSizes_1)
+                #print(bash('ls '+chromSizes_1))
+        except subprocess.CalledProcessError:
+                sys.stdout = sys.__stdout__
+                print('ls: cannot access '+chromSizes_1+': No such file or directory')
+                quit()
+        digit_pos= re.search(r"\d", assembly)
+        db_edit = assembly[0:digit_pos.start()]
+        if db_edit in primates:
+            clade='primate'
+        else:
+            db = bash("hgsql -e \"select genome from dbDb where name='"+assembly+"'\G\" hgcentraltest | grep  genome | cut -c 9-")
+            db = str(db)[1:-1]
+            db = bash("hgsql -e \"select clade from genomeClade where genome="+db+"\G\" hgcentraltest | grep  clade | cut -c 8-")
+            db = str(db)[2:-2]
+            if db in validClades:
+                clade=db
+            else: clade='other'
+    return clade
+
+def getAssembly(assembly):
+    """Input assembly and return native assembly or GC assembly name"""
+    if assembly[0:3]=='GCA' or assembly[0:3]=='GCF':
+        gcAssembly=goto(assembly)
+        return gcAssembly
+    else: 
+        return assembly
+
+def scoringScheme(assembly):
+    """Input assembly and return a score using Hiram's scoring scheme"""
+    if assembly[0:3]=='GCA' or assembly[0:3]=='GCF':
+        chromSizes_1="/hive/data/genomes/asmHubs/"+assembly[0:3]+"/"+assembly[4:7]+"/"+assembly[7:10]+"/"+assembly[10:13]+"/"+assembly+"/"+assembly+".chrom.sizes.txt"
+    else:
+        chromSizes_1="/hive/data/genomes/"+assembly+"/chrom.sizes"   
+    try:
+        nFiftyOutput = bash_v2('n50.pl '+chromSizes_1)
+    except subprocess.CalledProcessError:
+        sys.stdout = sys.__stdout__
+        print('ls: cannot access '+chromSizes_1+': No such file or directory')
+        quit()
+    nFiftyOutputLineOne=nFiftyOutput.split('\n')[1]
+    scores= re.findall(r'[0-9]+', nFiftyOutputLineOne)
+    scores.pop(-1)
+    scores.append(nFiftyOutput.split('\n')[3].split()[1])
+    scores.append(nFiftyOutput.split('\n')[3].split()[3])
+    return scores
+
+def compareScores(scoresOne, scoresTwo):
+    """Input scores of two assemblies and return a score comparison count"""
+    count=0
+    if int(scoresOne[0]) < int(scoresTwo[0]):
+       count=count+1
+   
+    if int(scoresOne[1]) > int(scoresTwo[1]):
+       count=count+1
+   
+    if int(scoresOne[2]) == int(scoresTwo[2]):
+       count=count+0.5
+   
+    elif int(scoresOne[2]) < int(scoresTwo[2]):
+       count=count+1
+       
+    if int(scoresOne[3]) > int(scoresTwo[3]):
+       count=count+1
+    return count
+
+
+
+def main(assembly_one, assembly_two):
+    """Input two assemblies and prints target, query, clades, and full GC assembly hub name to run pairLastz script"""
+    sys.stdout = sys.__stdout__
+    print("screen -S "+datetime.now().strftime("%Y%m%d")+"\n")
+    count=compareScores(scoringScheme(assembly_one), scoringScheme(assembly_two))
+    if count==2:
+        if int(scoringScheme(assembly_one)[1])>int(scoringScheme(assembly_two)[1]):
+            print("time (~/kent/src/hg/utils/automation/pairLastz.sh "+getAssembly(assembly_one)+" "+getAssembly(assembly_two)+" "+getClade(assembly_one)+" "+getClade(assembly_two)+") >  "+assembly_one+"."+assembly_two+"_"+datetime.now().strftime("%Y%m%d")+".log 2>&1 &")
+            quit()
+        else:
+            print("time (~/kent/src/hg/utils/automation/pairLastz.sh "+getAssembly(assembly_two)+" "+getAssembly(assembly_one)+" "+getClade(assembly_two)+" "+getClade(assembly_one)+") >  "+assembly_two+"."+assembly_one+"_"+datetime.now().strftime("%Y%m%d")+".log 2>&1 &")
+            quit()
+    if count >= 2.5:
+        print("time (~/kent/src/hg/utils/automation/pairLastz.sh "+getAssembly(assembly_one)+" "+getAssembly(assembly_two)+" "+getClade(assembly_one)+" "+getClade(assembly_two)+") >  "+assembly_one+"."+assembly_two+"_"+datetime.now().strftime("%Y%m%d")+".log 2>&1 &")
+        quit()
+    else:
+        print("time (~/kent/src/hg/utils/automation/pairLastz.sh "+getAssembly(assembly_two)+" "+getAssembly(assembly_one)+" "+getClade(assembly_two)+" "+getClade(assembly_one)+") >  "+assembly_two+"."+assembly_one+"_"+datetime.now().strftime("%Y%m%d")+".log 2>&1 &")
+        quit()       
+    return
+
+main(args.assembly_one, args.assembly_two)
+
+# Program Output (Commented out)
+
+#python3 pairLastzWrapper.py -a1 GCF_001704415.1 -a2 hg38 
+#screen -S 20220323
+
+#time (~/kent/src/hg/utils/automation/pairLastz.sh hg38 GCF_001704415.1_ARS1 primate mammal) >  hg38.GCF_001704415.1_20220323.log 2>&1 &