4329d3be64083601aa9e9e4804d2c479bba780be
gperez2
  Fri Mar 25 11:51:07 2022 -0700
Updating the pairLastzWrapper.py shebang line to have python3, refs #23122

diff --git src/utils/qa/pairLastzWrapper.py src/utils/qa/pairLastzWrapper.py
old mode 100644
new mode 100755
index 2d3636c..a8a16cd
--- src/utils/qa/pairLastzWrapper.py
+++ src/utils/qa/pairLastzWrapper.py
@@ -1,188 +1,188 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 # Program Header
 # Name:   Gerardo Perez
 # Description: A program that determines the target, query, clades, and full GC assembly hub name to run the pairLastz script
 #
 #
 #
 # pairLastzWrapper
 #
 #
 # Development Environment: VIM - Vi IMproved version 7.4.629
 # Version: Python 3.6.5 
 
 # Imports module
 import os
 import getpass
 import sys
 import re
 import json
 import io
 import requests
 import subprocess
 import argparse
 from datetime import datetime
 
 # Creates an arguement passer
 parser = argparse.ArgumentParser(description="A program that determines the target, query, clades, and full GC assembly hub name to run the pairLastz script")
 
 # Adds arguemets by calling the arguement passer, for assembly one
 parser.add_argument("-a1", "--assembly_one", help="Specify assembly one. Ex. hg38 or GCF_001704415.1", required=True)
 
 # Adds arguemets by calling the arguement passer, for assembly two
 parser.add_argument("-a2", "--assembly_two", help="Specify assembly two Ex. hg38 or GCF_001704415.1", required=True)
 
 # Parses arguemets through using the parse args method.
 args = parser.parse_args()
 
 validClades = ['primate', 'mammal']
 
 #List of prefix names of the native primate databases
 primates=['hg','panTro', 'panPan', 'gorGor', 'ponAbe', 'nomLeu', 'chlSab', 'macFas', 'rheMac', 'papAnu', 'papHam', 'nasLar', 'rhiRox', 'calJac', 'saiBol', 'tarSyr']
 
 def bash(cmd):
     """Input bash cmd and return stdout"""
     rawOutput = subprocess.run(cmd,check=True, shell=True, stdout=subprocess.PIPE, universal_newlines=True)
     return(rawOutput.stdout.split('\n')[0:-1])
 
 def bash_v2(cmd):
     """Input bash cmd and return stdout plus stderr"""
     rawOutput = subprocess.run(cmd,check=True, shell=True, stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
     return(rawOutput.stdout)
 
 def goto(gc):
     """Input GC identifier and return full GC assembly hub name"""
     gcX=gc[0:3]
     d0=gc[4:7]
     d1=gc[7:10]
     d2=gc[10:13]
     if gcX == "GCF":
       destDir="/hive/data/genomes/asmHubs/refseqBuild/"+gcX+"/"+d0+"/"+d1+"/"+d2+"/"
     else:
       destDir="/hive/data/genomes/asmHubs/genbankBuild/"+gcX+"/"+d0+"/"+d1+"/"+d2+"/"
     try:
       cdDir = bash("ls -d "+destDir+"/"+gc+"*")
     except subprocess.CalledProcessError:
       sys.stdout = sys.__stdout__
       print('# can not find '+destDir)
       quit()
     cdDir = str(cdDir)[1:-1]
     gcName = cdDir.split('//')[1][:-1]
     return gcName
 
 
 def getClade(assembly):
     """Input assembly and return clade of assembly"""
     if assembly[0:3]=='GCA' or assembly[0:3]=='GCF':
         chromSizes_1="/hive/data/genomes/asmHubs/"+assembly[0:3]+"/"+assembly[4:7]+"/"+assembly[7:10]+"/"+assembly[10:13]+"/"+assembly+"/"+assembly+".chrom.sizes.txt"
         assembly=goto(assembly)
         find_gcNum=bash("grep "+assembly+" /hive/data/outside/ncbi/genomes/reports/newAsm/all.*.today | head -1")
         find_gcNum=str(find_gcNum)[1:-1]
         try:
                clade=find_gcNum.split('.')[2][:-1]
         except IndexError:
                print('# can not find '+assembly+', the assembly might be suppressed')
                quit()
         if clade in validClades:
             return clade
         else: clade='other'
     else:
         chromSizes_1="/hive/data/genomes/"+assembly+"/chrom.sizes"
         try:
                 bash('ls '+chromSizes_1)
                 #print(bash('ls '+chromSizes_1))
         except subprocess.CalledProcessError:
                 sys.stdout = sys.__stdout__
                 print('ls: cannot access '+chromSizes_1+': No such file or directory')
                 quit()
         digit_pos= re.search(r"\d", assembly)
         db_edit = assembly[0:digit_pos.start()]
         if db_edit in primates:
             clade='primate'
         else:
             db = bash("hgsql -e \"select genome from dbDb where name='"+assembly+"'\G\" hgcentraltest | grep  genome | cut -c 9-")
             db = str(db)[1:-1]
             db = bash("hgsql -e \"select clade from genomeClade where genome="+db+"\G\" hgcentraltest | grep  clade | cut -c 8-")
             db = str(db)[2:-2]
             if db in validClades:
                 clade=db
             else: clade='other'
     return clade
 
 def getAssembly(assembly):
     """Input assembly and return native assembly or GC assembly name"""
     if assembly[0:3]=='GCA' or assembly[0:3]=='GCF':
         gcAssembly=goto(assembly)
         return gcAssembly
     else: 
         return assembly
 
 def scoringScheme(assembly):
     """Input assembly and return a score using Hiram's scoring scheme"""
     if assembly[0:3]=='GCA' or assembly[0:3]=='GCF':
         chromSizes_1="/hive/data/genomes/asmHubs/"+assembly[0:3]+"/"+assembly[4:7]+"/"+assembly[7:10]+"/"+assembly[10:13]+"/"+assembly+"/"+assembly+".chrom.sizes.txt"
     else:
         chromSizes_1="/hive/data/genomes/"+assembly+"/chrom.sizes"   
     try:
         nFiftyOutput = bash_v2('n50.pl '+chromSizes_1)
     except subprocess.CalledProcessError:
         sys.stdout = sys.__stdout__
         print('ls: cannot access '+chromSizes_1+': No such file or directory')
         quit()
     nFiftyOutputLineOne=nFiftyOutput.split('\n')[1]
     scores= re.findall(r'[0-9]+', nFiftyOutputLineOne)
     scores.pop(-1)
     scores.append(nFiftyOutput.split('\n')[3].split()[1])
     scores.append(nFiftyOutput.split('\n')[3].split()[3])
     return scores
 
 def compareScores(scoresOne, scoresTwo):
     """Input scores of two assemblies and return a score comparison count"""
     count=0
     if int(scoresOne[0]) < int(scoresTwo[0]):
        count=count+1
    
     if int(scoresOne[1]) > int(scoresTwo[1]):
        count=count+1
    
     if int(scoresOne[2]) == int(scoresTwo[2]):
        count=count+0.5
    
     elif int(scoresOne[2]) < int(scoresTwo[2]):
        count=count+1
        
     if int(scoresOne[3]) > int(scoresTwo[3]):
        count=count+1
     return count
 
 
 
 def main(assembly_one, assembly_two):
     """Input two assemblies and prints target, query, clades, and full GC assembly hub name to run pairLastz script"""
     sys.stdout = sys.__stdout__
     print("screen -S "+datetime.now().strftime("%Y%m%d")+"\n")
     count=compareScores(scoringScheme(assembly_one), scoringScheme(assembly_two))
     if count==2:
         if int(scoringScheme(assembly_one)[1])>int(scoringScheme(assembly_two)[1]):
             print("time (~/kent/src/hg/utils/automation/pairLastz.sh "+getAssembly(assembly_one)+" "+getAssembly(assembly_two)+" "+getClade(assembly_one)+" "+getClade(assembly_two)+") >  "+assembly_one+"."+assembly_two+"_"+datetime.now().strftime("%Y%m%d")+".log 2>&1 &")
             quit()
         else:
             print("time (~/kent/src/hg/utils/automation/pairLastz.sh "+getAssembly(assembly_two)+" "+getAssembly(assembly_one)+" "+getClade(assembly_two)+" "+getClade(assembly_one)+") >  "+assembly_two+"."+assembly_one+"_"+datetime.now().strftime("%Y%m%d")+".log 2>&1 &")
             quit()
     if count >= 2.5:
         print("time (~/kent/src/hg/utils/automation/pairLastz.sh "+getAssembly(assembly_one)+" "+getAssembly(assembly_two)+" "+getClade(assembly_one)+" "+getClade(assembly_two)+") >  "+assembly_one+"."+assembly_two+"_"+datetime.now().strftime("%Y%m%d")+".log 2>&1 &")
         quit()
     else:
         print("time (~/kent/src/hg/utils/automation/pairLastz.sh "+getAssembly(assembly_two)+" "+getAssembly(assembly_one)+" "+getClade(assembly_two)+" "+getClade(assembly_one)+") >  "+assembly_two+"."+assembly_one+"_"+datetime.now().strftime("%Y%m%d")+".log 2>&1 &")
         quit()       
     return
 
 main(args.assembly_one, args.assembly_two)
 
 # Program Output (Commented out)
 
 #python3 pairLastzWrapper.py -a1 GCF_001704415.1 -a2 hg38 
 #screen -S 20220323
 
 #time (~/kent/src/hg/utils/automation/pairLastz.sh hg38 GCF_001704415.1_ARS1 primate mammal) >  hg38.GCF_001704415.1_20220323.log 2>&1 &