src/utils/qa/jasparUpdate.py 9b025ae8f9ec2c72f1c7d72e2b3b9c25be099e05

9b025ae8f9ec2c72f1c7d72e2b3b9c25be099e05
jnavarr5
  Tue Jun 11 11:46:05 2024 -0700
Adding a usage statement to the updateJaspar.py script. Now prints the usage statement when no arguments are given and runs the script when using a -e flag to run the script. refs #32537

diff --git src/utils/qa/jasparUpdate.py src/utils/qa/jasparUpdate.py
index fe87711..41ebb28 100755
--- src/utils/qa/jasparUpdate.py
+++ src/utils/qa/jasparUpdate.py
@@ -1,215 +1,248 @@
 #!/usr/bin/python3
+"""
+Program to automate the update of the JASPAR tracks for mm10, mm39, hg19, and hg38.
+Builds shared trackDb stanza based on hg38's trackDb and writes to a file (jaspar.ra).
+Downloads all files to the current directory (bigBeds and HTML).
+Creates a file with the steps to complete the update (updateSteps.txt).
+
+Assumptions:
+   * Shared HTML, PFMs, and bigBeds are all obtained from:
+     https://frigg.uio.no/JASPAR/JASPAR_genome_browser_tracks/current/
+"""
 import argparse
 import sys
 import subprocess
 from io import BytesIO
 from zipfile import ZipFile
 from urllib.request import urlopen
 from ucscGb.qa.tables import trackUtils
 from ucscGb.qa import qaUtils
 
+
+
 def bash(cmd):
     """Run the cmd in bash subprocess"""
     try:
         rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
                                        stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
         bashStdoutt = rawBashOutput.stdout
     except subprocess.CalledProcessError as e:
         raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
     return (bashStdoutt)
 
 def download(URL):
     '''
     Generic command to wget a URL and prints the command to STDOUT.
     '''
     cmd = "wget %s" % URL
     print("    " + cmd)
     bash(cmd)
 
 def analyzePFM(PFMzip):
     '''
     Given a URL to the PFM matrix for the JASPAR hub, read each line and calculate the
     nucleotide percents at each position for the Motif display. The function returns a dicitonary
     with the motif name as the key and a tuple as the value (the percents and column counts 
     '''
     archive = ZipFile(BytesIO(urlopen(PFMzip).read()),'r')
     PFMtable = {}
     for eachFile in archive.namelist(): # Iterate through each of files
         aList = [] 
         cList = []
         gList = []
         tList = []
         numOfColumns = 0 # container for the number of columns
         for line in archive.open(eachFile):
             currentLine = line.decode('utf-8').strip() #convert to utf-8
             if currentLine.startswith('>'):
                 continue  # Skip the header line
             # Matrix line creates a list of seen values
             matrix = currentLine.split("[")[1].strip().split(']')[0].strip().split()
             numOfColumns = len(matrix)
             if currentLine.startswith('A'):
                 aList = [int(x) for x in matrix]
             elif currentLine.startswith('C'):
                 cList = [int(x) for x in matrix]
             elif currentLine.startswith('G'):
                 gList = [int(x) for x in matrix]
             elif currentLine.startswith('T'):
                 tList = [int(x) for x in matrix]
             else:
                 print ("Line not recognized")
         aPercent = []
         cPercent = []
         gPercent = []
         tPercent = []
         for A,C,G,T in zip(aList,cList,gList,tList): # Go through each of the counts in the matrix
             total = A+C+T+G # Calcualte the column total
             aPercent.append(float("%.6g" % (A/total)))
             cPercent.append(float("%.6g" % (C/total)))
             gPercent.append(float("%.6g" % (G/total)))
             tPercent.append(float("%.6g" % (T/total)))
 
         #remove .jaspar from the filename
         matrixName = eachFile[:-7]
         percents = str(aPercent).strip("[]") + "\t"
         percents += str(cPercent).strip("[]") + "\t"
         percents += str(gPercent).strip("[]") + "\t"
         percents += str(tPercent).strip("[]") # Add each percent as a tab-separated column
          
         PFMtable[matrixName] = percents, numOfColumns # Add the matrix to the dictionary
     return PFMtable
 
 def getTrackDb(db):
     '''
     Read through the trackDb file. This generator reads through the first stanza in the trackDb
     file, and returns the key/value pairs. Upon finding the first empty new line, the generator
     stops. (Could lead to a possible bug if the first line is empty.)
     '''
     trackDb = "https://frigg.uio.no/JASPAR/JASPAR_genome_browser_tracks/current/%s/trackDb.txt" % db
     try:
         for line in urlopen(trackDb):
             key,value = line.decode('utf-8').strip().split(" ", 1) # split using first space
             yield key,value
     except ValueError:
         return key,value
 
 def buildTrackDb(db):
     '''
     Given a database, creates a trackDb for the JASPAR track for the UCSC database. Modifies
     settings that were specific to the hub and changes them to fit our architechure.
     '''
     newTrackDb = [] #final container to return
     year = "" #container for this years release
     for setting, value in getTrackDb(db): # Make a unified trackDb using hg38
         if setting == "track":
             newTrackName = value.split("_")[0].lower() #edit the track name to how we have it
             year = newTrackName[6:]
             newTrackDb.append([setting,newTrackName])
             newTrackDb.append(["parent", "jaspar on"]) #Adding settings that aren't in the hub
             newTrackDb.append(["priority","1"])
         elif setting == "shortLabel":
             shortLabel = "JASPAR %s TFBS" % year
             newTrackDb.append([setting, shortLabel])
         elif setting == "longLabel":
             longLabel = "JASPAR CORE %s - Predicted Transcription Factor Binding Sites" % year
             newTrackDb.append([setting, longLabel])
         elif setting == "html": #skip the HTML section for now
             continue
         elif setting == "bigDataUrl": # Make the bigDataUrl point to /gbdb
             #download file
             path = "/gbdb/$D/jaspar/JASPAR%s.bb" % year
             newTrackDb.append([setting, path])
         elif setting == "filterValues.name":
             # test to make sure the filters are in alphebetical order
             sortedFilter = sorted(value.split(","), key=str.casefold)
             newTrackDb.append([setting, ",".join(sortedFilter)])
         else:
             newTrackDb.append([setting,value])
 
     return newTrackDb
 
+def get_options():
+    '''
+    Parses the command line options
+    '''
+    parser = argparse.ArgumentParser(description=__doc__,
+                                     formatter_class=argparse.RawDescriptionHelpFormatter,
+                                     usage="%(prog)s -e")
+    parser.add_argument('-e', '--execute', action='store_true',
+                        help='Flag to run the program')
+    if len(sys.argv)==1:  # if no arguments are given, print the usage statment
+        parser.print_help(sys.stderr)
+        sys.exit(1)
+
+    options =  parser.parse_args()
+
+    if not options.execute: # if you do not specify the execute flag, print the usage statment
+        parser.print_help(sys.stderr)
+
 def main(args):
     '''
     Program to automate the update of the JASPAR tracks for mm10, mm39, hg19, and hg38.
 
     Assumptions:
        * Shared HTML, PFMs, and bigBeds are all obtained from:
          https://frigg.uio.no/JASPAR/JASPAR_genome_browser_tracks/current/
     '''
+    
+    get_options() # Parse the user's command line arguenments. 
+
     # Setting some definitions
     assemblies= ["hg19", "hg38", "mm10", "mm39"]
     # Any broken links in the script can be fixed below.
     currentAddress = "https://frigg.uio.no/JASPAR/JASPAR_genome_browser_tracks/current/"
     PFMs = currentAddress + "PFMs.zip"
     sharedHtml = currentAddress + "JASPAR2024_TFBS_help.html"
     bigBedUrl = currentAddress + "%s/JASPAR2024_%s.bb"
 
     #################################################################
     #                           Build trackDb                       #
     #################################################################
     print ("########################################################")
     print ("#            Creating the new trackDb stanza           #")
     print ("########################################################")
     newTrackDb = buildTrackDb("hg38") #container for all trackDb settings
            
     #################################################################
     #                 Print trackDb to the file                     #
     #################################################################
     trackDb = open("jaspar.ra", "w")
     for setting,value in newTrackDb:
         print("\t",setting,value, file=trackDb)
     trackDb.close()
     print("    Done!")
     
     #################################################################
     #          Read the PFMs via URL and calculate matrix           #
     #################################################################
     print ("########################################################")
     print ("#       Reading PFM.zip file and anlayzing matrix      #")
     print ("########################################################")
     hgFixed = open("jasparMotif.tab", 'w')
     completeMatrix = analyzePFM(PFMs)
     for matrixName, (percents, numOfColumns) in completeMatrix.items():
         print (matrixName, numOfColumns, percents, sep="\t", file=hgFixed)
 
     print("    Done!")
 
     #################################################################
     #           Download any files needed for the update            #
     #################################################################
     print ("########################################################")
     print ("#             Downloading HTML files                   #")
     print ("########################################################")
     print ("Downloading HTML page:")
     download(sharedHtml) # Download the HTML file
     print ("Done!")
     print ("########################################################")
     print ("#             Downloading bigBed files                 #")
     print ("########################################################")
     for db in assemblies:
         print ("Downloading the bigBed for %s" % db)
         bigBed = bigBedUrl % (db,db) # Use the correct URL for the current database
         download(bigBed) # Download the file
         print ("Done!", end="\n\n")
 
     #################################################################
     #         Create a file with notes about how to update          #
     #################################################################
     print ("########################################################")
     print ("#            Next steps to complete update             #")
     print ("########################################################")
     print (" updateSteps.txt - contains instructions on how to")
     print ("                   finish the update.")
     steps = open("updateSteps.txt",'w') # file to write all steps
     print ("########################################################", file=steps)
     print ("#            Next steps to complete update             #", file=steps)
     print ("########################################################", file=steps)
     print ("1. Add trackDb statment to jaspar.ra", file=steps)
     print ("2. Review HTML page and add new changes to the existing HTML page", file=steps)
     print ("3. Move bigBed files to /hive/data/genomes/$DB/bed/jaspar", file=steps)
     print ("4. Create a symlink for the bigBed in /gbdb/$DB/jaspar/", file=steps)
     print ("5. Create the hgFixed table for the PFM image on hgc, i.e.", file=steps)
     print ("   hgLoadSqlTab hgFixed jasparCore2024 ~/kent/src/hg/lib/dnaMotif.sql jasparMotif.tab", file=steps)
 
 
 if __name__ == "__main__":
     sys.exit(main(sys.argv))