src/utils/qa/jasparUpdate.py b5d6d7abaeb13f8b9cf29db22c0827a54e66dcbc

b5d6d7abaeb13f8b9cf29db22c0827a54e66dcbc
jnavarr5
  Fri Jun 7 14:33:09 2024 -0700
Creating a script to automate the wrangling of the JASPAR transcription factor track. refs #32537

diff --git src/utils/qa/jasparUpdate.py src/utils/qa/jasparUpdate.py
new file mode 100755
index 0000000..fe87711
--- /dev/null
+++ src/utils/qa/jasparUpdate.py
@@ -0,0 +1,215 @@
+#!/usr/bin/python3
+import argparse
+import sys
+import subprocess
+from io import BytesIO
+from zipfile import ZipFile
+from urllib.request import urlopen
+from ucscGb.qa.tables import trackUtils
+from ucscGb.qa import qaUtils
+
+def bash(cmd):
+    """Run the cmd in bash subprocess"""
+    try:
+        rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
+                                       stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
+        bashStdoutt = rawBashOutput.stdout
+    except subprocess.CalledProcessError as e:
+        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
+    return (bashStdoutt)
+
+def download(URL):
+    '''
+    Generic command to wget a URL and prints the command to STDOUT.
+    '''
+    cmd = "wget %s" % URL
+    print("    " + cmd)
+    bash(cmd)
+
+def analyzePFM(PFMzip):
+    '''
+    Given a URL to the PFM matrix for the JASPAR hub, read each line and calculate the
+    nucleotide percents at each position for the Motif display. The function returns a dicitonary
+    with the motif name as the key and a tuple as the value (the percents and column counts 
+    '''
+    archive = ZipFile(BytesIO(urlopen(PFMzip).read()),'r')
+    PFMtable = {}
+    for eachFile in archive.namelist(): # Iterate through each of files
+        aList = [] 
+        cList = []
+        gList = []
+        tList = []
+        numOfColumns = 0 # container for the number of columns
+        for line in archive.open(eachFile):
+            currentLine = line.decode('utf-8').strip() #convert to utf-8
+            if currentLine.startswith('>'):
+                continue  # Skip the header line
+            # Matrix line creates a list of seen values
+            matrix = currentLine.split("[")[1].strip().split(']')[0].strip().split()
+            numOfColumns = len(matrix)
+            if currentLine.startswith('A'):
+                aList = [int(x) for x in matrix]
+            elif currentLine.startswith('C'):
+                cList = [int(x) for x in matrix]
+            elif currentLine.startswith('G'):
+                gList = [int(x) for x in matrix]
+            elif currentLine.startswith('T'):
+                tList = [int(x) for x in matrix]
+            else:
+                print ("Line not recognized")
+        aPercent = []
+        cPercent = []
+        gPercent = []
+        tPercent = []
+        for A,C,G,T in zip(aList,cList,gList,tList): # Go through each of the counts in the matrix
+            total = A+C+T+G # Calcualte the column total
+            aPercent.append(float("%.6g" % (A/total)))
+            cPercent.append(float("%.6g" % (C/total)))
+            gPercent.append(float("%.6g" % (G/total)))
+            tPercent.append(float("%.6g" % (T/total)))
+
+        #remove .jaspar from the filename
+        matrixName = eachFile[:-7]
+        percents = str(aPercent).strip("[]") + "\t"
+        percents += str(cPercent).strip("[]") + "\t"
+        percents += str(gPercent).strip("[]") + "\t"
+        percents += str(tPercent).strip("[]") # Add each percent as a tab-separated column
+         
+        PFMtable[matrixName] = percents, numOfColumns # Add the matrix to the dictionary
+    return PFMtable
+
+def getTrackDb(db):
+    '''
+    Read through the trackDb file. This generator reads through the first stanza in the trackDb
+    file, and returns the key/value pairs. Upon finding the first empty new line, the generator
+    stops. (Could lead to a possible bug if the first line is empty.)
+    '''
+    trackDb = "https://frigg.uio.no/JASPAR/JASPAR_genome_browser_tracks/current/%s/trackDb.txt" % db
+    try:
+        for line in urlopen(trackDb):
+            key,value = line.decode('utf-8').strip().split(" ", 1) # split using first space
+            yield key,value
+    except ValueError:
+        return key,value
+
+def buildTrackDb(db):
+    '''
+    Given a database, creates a trackDb for the JASPAR track for the UCSC database. Modifies
+    settings that were specific to the hub and changes them to fit our architechure.
+    '''
+    newTrackDb = [] #final container to return
+    year = "" #container for this years release
+    for setting, value in getTrackDb(db): # Make a unified trackDb using hg38
+        if setting == "track":
+            newTrackName = value.split("_")[0].lower() #edit the track name to how we have it
+            year = newTrackName[6:]
+            newTrackDb.append([setting,newTrackName])
+            newTrackDb.append(["parent", "jaspar on"]) #Adding settings that aren't in the hub
+            newTrackDb.append(["priority","1"])
+        elif setting == "shortLabel":
+            shortLabel = "JASPAR %s TFBS" % year
+            newTrackDb.append([setting, shortLabel])
+        elif setting == "longLabel":
+            longLabel = "JASPAR CORE %s - Predicted Transcription Factor Binding Sites" % year
+            newTrackDb.append([setting, longLabel])
+        elif setting == "html": #skip the HTML section for now
+            continue
+        elif setting == "bigDataUrl": # Make the bigDataUrl point to /gbdb
+            #download file
+            path = "/gbdb/$D/jaspar/JASPAR%s.bb" % year
+            newTrackDb.append([setting, path])
+        elif setting == "filterValues.name":
+            # test to make sure the filters are in alphebetical order
+            sortedFilter = sorted(value.split(","), key=str.casefold)
+            newTrackDb.append([setting, ",".join(sortedFilter)])
+        else:
+            newTrackDb.append([setting,value])
+
+    return newTrackDb
+
+def main(args):
+    '''
+    Program to automate the update of the JASPAR tracks for mm10, mm39, hg19, and hg38.
+
+    Assumptions:
+       * Shared HTML, PFMs, and bigBeds are all obtained from:
+         https://frigg.uio.no/JASPAR/JASPAR_genome_browser_tracks/current/
+    '''
+    # Setting some definitions
+    assemblies= ["hg19", "hg38", "mm10", "mm39"]
+    # Any broken links in the script can be fixed below.
+    currentAddress = "https://frigg.uio.no/JASPAR/JASPAR_genome_browser_tracks/current/"
+    PFMs = currentAddress + "PFMs.zip"
+    sharedHtml = currentAddress + "JASPAR2024_TFBS_help.html"
+    bigBedUrl = currentAddress + "%s/JASPAR2024_%s.bb"
+
+    #################################################################
+    #                           Build trackDb                       #
+    #################################################################
+    print ("########################################################")
+    print ("#            Creating the new trackDb stanza           #")
+    print ("########################################################")
+    newTrackDb = buildTrackDb("hg38") #container for all trackDb settings
+           
+    #################################################################
+    #                 Print trackDb to the file                     #
+    #################################################################
+    trackDb = open("jaspar.ra", "w")
+    for setting,value in newTrackDb:
+        print("\t",setting,value, file=trackDb)
+    trackDb.close()
+    print("    Done!")
+    
+    #################################################################
+    #          Read the PFMs via URL and calculate matrix           #
+    #################################################################
+    print ("########################################################")
+    print ("#       Reading PFM.zip file and anlayzing matrix      #")
+    print ("########################################################")
+    hgFixed = open("jasparMotif.tab", 'w')
+    completeMatrix = analyzePFM(PFMs)
+    for matrixName, (percents, numOfColumns) in completeMatrix.items():
+        print (matrixName, numOfColumns, percents, sep="\t", file=hgFixed)
+
+    print("    Done!")
+
+    #################################################################
+    #           Download any files needed for the update            #
+    #################################################################
+    print ("########################################################")
+    print ("#             Downloading HTML files                   #")
+    print ("########################################################")
+    print ("Downloading HTML page:")
+    download(sharedHtml) # Download the HTML file
+    print ("Done!")
+    print ("########################################################")
+    print ("#             Downloading bigBed files                 #")
+    print ("########################################################")
+    for db in assemblies:
+        print ("Downloading the bigBed for %s" % db)
+        bigBed = bigBedUrl % (db,db) # Use the correct URL for the current database
+        download(bigBed) # Download the file
+        print ("Done!", end="\n\n")
+
+    #################################################################
+    #         Create a file with notes about how to update          #
+    #################################################################
+    print ("########################################################")
+    print ("#            Next steps to complete update             #")
+    print ("########################################################")
+    print (" updateSteps.txt - contains instructions on how to")
+    print ("                   finish the update.")
+    steps = open("updateSteps.txt",'w') # file to write all steps
+    print ("########################################################", file=steps)
+    print ("#            Next steps to complete update             #", file=steps)
+    print ("########################################################", file=steps)
+    print ("1. Add trackDb statment to jaspar.ra", file=steps)
+    print ("2. Review HTML page and add new changes to the existing HTML page", file=steps)
+    print ("3. Move bigBed files to /hive/data/genomes/$DB/bed/jaspar", file=steps)
+    print ("4. Create a symlink for the bigBed in /gbdb/$DB/jaspar/", file=steps)
+    print ("5. Create the hgFixed table for the PFM image on hgc, i.e.", file=steps)
+    print ("   hgLoadSqlTab hgFixed jasparCore2024 ~/kent/src/hg/lib/dnaMotif.sql jasparMotif.tab", file=steps)
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))