src/utils/trackDbIndexBb/trackDbIndexBb d660f59a95e2156f2d2deefd6658b6c54bab1b52

d660f59a95e2156f2d2deefd6658b6c54bab1b52
lrnassar
  Tue Jun 2 14:35:45 2020 -0700
Fixing the finish statement to reflect the new trackDb usage, no RM

diff --git src/utils/trackDbIndexBb/trackDbIndexBb src/utils/trackDbIndexBb/trackDbIndexBb
index 43c8b17..bc73d77 100755
--- src/utils/trackDbIndexBb/trackDbIndexBb
+++ src/utils/trackDbIndexBb/trackDbIndexBb
@@ -1,398 +1,399 @@
 #!/usr/bin/env python
 """
 Given a track name, a trackDb.ra composite of bigBeds, and a chrom.sizes
 file, will create index files needed to optimize hideEmptySubtracks setting.
 Will also build track associations between tracks sharing metadata, which
 will cause them to display together whenever the primary bigBed track is present.
 Depending on size and quantity of files, can take over 60 minutes.
 
 This script has three dependencies: bigBedToBed, bedToBigBed, and bedtools. The first two are
 UCSC Genome Browser utilities which can be downloaded to the current directory with the 
 following commands:
 
 1) wget http://hgdownload.soe.ucsc.edu/admin/exe/<system>.x86_64/<tool>
 
 where: 
 <system> is macOSX or linux
 <tool> is bedToBigBed and bigBedToBed
 
 2) chmod +x <tool>
 
 bedtools can be found here: https://bedtools.readthedocs.io
     
 These dependencies can be in the path, in the local directory the script is run from, or 
 specified using the optional flags.
 
     
 Example run:
     trackDbIndexBb -t mm10HMMdata -r mm10HMMdata.ra -c mm10chrom.sizes
     trackDbIndexBb -t hg19peaks -r hg19peaks.ra -c hg19chrom.sizes -o ./hg19peaks/output -p /user/bin
 """
 import subprocess, os, shutil, tempfile, atexit, argparse
 from distutils.spawn import find_executable
 from collections import OrderedDict 
 
 # ==== functions =====
 def parseArgs():
     """
     Parse the command line arguments and return the resulting argparse namespace.
     The -t, -r and -c flags are registered without argparse's own 'required'
     enforcement; when any of them is missing, a concise usage summary is printed
     and the script exits with status 0.
     """
     parser = argparse.ArgumentParser(description = __doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
     # Take the last action group off the parser so the 'required arguments'
     # group is created ahead of it; it is appended back once all flags are defined.
     optional = parser._action_groups.pop()
     required = parser.add_argument_group('required arguments')
     # Required arguments
     required.add_argument ("-t", "--trackName", dest = "trackName",
         help = "Track name for top level composite which contains the bigBed tracks.")
     required.add_argument ("-r", "--raFile", dest = "trackDbRaFile",
         help = "Relative or absolute path to trackDb.ra file containing the " + \
             "composite track with bigDataUrls.")
     required.add_argument ("-c", "--chromSizes", dest = "chromSizes",
         help = "Chrom.sizes for database which the track belongs to. Needed " + \
             "to build final bigBed file.")
     # Optional arguments
     optional.add_argument ("-o", "--out", dest = "outDir", default = ".",
         help = "Optional: Output directory for files. Default current directory.")
     optional.add_argument ("-p", "--pathTools", dest = "toolsPath", default = ".",
         help = "Optional: Path to directory where bedtools/bedToBigBed/bigBedToBed " + \
             "can be found")
     optional.add_argument ("-n", "--noDelete", dest = "noDelete", default = False, action = "store_true",
         help = "Optional: Do not delete intermediary multibed.bed file. This option will " + \
             "result in both the multibed.bb and multibed.bed files in the output directory.")
     optional.add_argument ("-m", "--metaDataVar", dest = "metaDataVar", default = "subGroups",
         help = "Optional: Used when there are associated tracks to be displayed " + \
             "alongside the primary BB track. Such as peak tracks with related " + \
             "signals. To relate the tracks, trackDbIndexBb expects all except " + \
             "one of the metaData variables to match among associated tracks. " + \
             "By default, trackDbIndexBb attempts to make association between tracks by " + \
             "using the metaData in the 'subGroups' trackDb parameter. Use this " + \
             "flag to change it to a different association, often 'metaData' is " + \
             "also used.") 
     optional.add_argument ("-s", "--subGroupRemove", dest = "subGroupRemove", default = "view",
         help = "Optional: Used when there are associated tracks to be displayed " + \
             "alongside the primary BB track. Such as peak tracks with related " + \
             "signals. To relate the tracks, trackDbIndexBb expects all except one " + \
             "of the metaData variables to match among associated tracks. This " + \
             "metaData often looks like: 'view=Peaks mark=A6_H3K36me3' for the .bb " + \
             "track, and 'view=Signal mark=A6_H3K36me3' for the .bw track. In this " + \
             "case, you would want to exclude the 'view' variable to make histone " + \
             "mark associations (A6_H3K36me3). This flag can be used to pass a " + \
             "different exclusionary variable than the default 'view'") 
     parser._action_groups.append(optional)
     options = parser.parse_args()
     if options.trackName is None or options.trackDbRaFile is None or options.chromSizes is None:
         parser.print_usage()
         # Print concise help message and usage statement when no arguments passed.
         print("\nGiven a track name, a trackDb.ra composite of bigBeds, and a chrom.sizes\n" + \
         "file, will create index files needed to optimize hideEmptySubtracks setting.\n" + \
         "Depending on size and quantity of files, can take over 60 minutes.\n\n" + \
         "For a more verbose help message, run with help flag: trackDbIndexBb -h\n\n" + \
         "Example run:\n" + \
         "    trackDbIndexBb -t mm10HMMdata -r mm10HMMdata.ra -c mm10chrom.sizes\n" + \
         "    trackDbIndexBb -t hg19peaks -r hg19peaks.ra -c hg19chrom.sizes " + \
         "-o ./hg19peaks/output -p /user/bin\n\n" + \
         "required arguments:\n" + \
         "  -t TRACKNAME, --trackName TRACKNAME\n" + \
         "                        Track name for top level composite which contains the\n" + \
         "                        bigBed tracks.\n" + \
         "  -r TRACKDBRAFILE, --raFile TRACKDBRAFILE\n" + \
         "                        Relative or absolute path to trackDb.ra file\n" + \
         "                        containing the composite track with bigDataUrls.\n" + \
         "  -c CHROMSIZES, --chromSizes CHROMSIZES\n" + \
         "                        Chrom.sizes for database which the track belongs to.\n" + \
         "                        Needed to build final bigBed file.\n")
         # Raise SystemExit directly: the exit() builtin is an interactive helper
         # installed by the site module and is not guaranteed to exist.
         raise SystemExit(0)
     return  options
 
 def saveTrackBigDataUrl(bigDataUrls,track,field,raStanza):
     """
     Give the track a new ordered dictionary in bigDataUrls (replacing any
     earlier entry) and store, under field, the first item of the stanza's
     value for that field. Returns the same bigDataUrls object.
     """
     trackEntry = OrderedDict()
     bigDataUrls[track] = trackEntry
     stanzaValue = raStanza[track][field]
     trackEntry[field] = stanzaValue[0]
     return bigDataUrls
 
 def saveParentTrackRecord(raStanza,track,compositeNames,field,bigDataUrls):
     """
     Record the parent of a track that belongs to the target composite, and save
     the track's value for field when the track is a bigBed. The parent name (first
     word of the 'parent' setting) is added to compositeNames once. The bigBed check
     uses the track's own 'type' when present, otherwise the type of its parent
     stanza. Returns the tuple (bigDataUrls, compositeNames).
     """
     stanza = raStanza[track]
     parentName = stanza['parent'][0]
     # Compare the parent's name, not the whole 'parent' value list, so that each
     # parent is listed only once.
     if parentName not in compositeNames:
         compositeNames.append(parentName)
     # Without the requested field there is nothing to save for this track.
     if field not in stanza:
         return(bigDataUrls,compositeNames)
     if 'type' in stanza:
         trackType = stanza['type']
     else:
         # No type on the track itself: fall back to the parent stanza, tolerating
         # a parent that was not parsed or that carries no type.
         trackType = raStanza.get(parentName, {}).get('type', [])
     if 'bigBed' in trackType:
         saveTrackBigDataUrl(bigDataUrls,track,field,raStanza)
     return(bigDataUrls,compositeNames)
 
 def finishCurrentRecord(trackName,track,raStanza,field,bigDataUrls,compositeNames):
     """
     Run once a trackDb stanza has been fully read. When the stanza is the target
     track itself, its field value is saved if it is a bigBed. Otherwise the stanza
     is handed to saveParentTrackRecord() when the target track is its parent or its
     parent's parent. Returns the tuple (bigDataUrls, compositeNames).
     """
     stanza = raStanza[track]
     # The stanza is the requested track itself
     if trackName == track:
         isTargetBigBed = ('type' in stanza and field in stanza
                           and 'bigBed' in stanza['type'] and trackName in stanza['track'])
         if isTargetBigBed:
             saveTrackBigDataUrl(bigDataUrls,track,field,raStanza)
         return(bigDataUrls,compositeNames)
     # A stanza without a parent cannot belong to the requested track
     if 'parent' not in stanza:
         return(bigDataUrls,compositeNames)
     # The requested track is the direct parent
     if trackName in stanza['parent']:
         saveParentTrackRecord(raStanza,track,compositeNames,field,bigDataUrls)
         return(bigDataUrls,compositeNames)
     # The requested track may be the parent's parent
     parentStanza = raStanza[stanza['parent'][0]]
     if 'parent' in parentStanza:
         grandparentStanza = raStanza[parentStanza['parent'][0]]
         if grandparentStanza['track'] == trackName:
             saveParentTrackRecord(raStanza,track,compositeNames,field,bigDataUrls)
     return(bigDataUrls,compositeNames)
 
 def startNewRecord(line,raStanza):
     """
     Invoked when a new trackDb stanza is encountered. Splits the 'track <name>'
     line on any run of whitespace, creates an ordered dictionary for the stanza
     in raStanza and stores the name under its 'track' key. Returns the tuple
     (split line, track name, raStanza).
     """
     # split() without an argument drops leading/trailing whitespace and treats
     # repeated spaces or tabs as one separator, so the name is always word two.
     line = line.split()
     track = line[1] 
     raStanza[track] = OrderedDict()
     raStanza[track]['track'] = track
     return(line,track,raStanza)
 
 def buildTrackAssociations(bigDataUrls,metaDataVar,raStanza,subGroupsRemove,compositeNames):
     """
     For every track saved in bigDataUrls, look for associated tracks through the
     chosen metaDataVar setting. The track's metaDataVar entries are copied without
     those starting with subGroupsRemove; any other stanza whose parent is listed in
     compositeNames and whose own metaDataVar contains all remaining entries is
     appended to the track's 'relatedTracks' list. The parsed stanzas themselves are
     left unmodified. Returns bigDataUrls.
     """
     for track in bigDataUrls:
         if metaDataVar not in raStanza[track]:
             continue
         # Build a filtered copy rather than removing from the list being iterated,
         # which would skip the entry following each removal.
         subGroups = [group for group in raStanza[track][metaDataVar]
                      if not group.startswith(subGroupsRemove)]
         for trackTitle, stanza in raStanza.items():
             if trackTitle == track or 'parent' not in stanza or metaDataVar not in stanza:
                 continue
             if stanza['parent'][0] not in compositeNames:
                 continue
             if all(meta in stanza[metaDataVar] for meta in subGroups):
                 bigDataUrls[track].setdefault('relatedTracks', []).append(trackTitle)
     return(bigDataUrls)
 
 def raInfoExtract(raFile,field,subGroupsRemove,trackName,metaDataVar):
     """
     Primary function that parses the trackDb.ra file. Iterates by line, creating a
     new dictionary key for each track stanza and saving its settings as lists of
     whitespace-separated words. This is done for all stanzas, not just the ones that
     match the desired track name, in order to build parent/child associations and to
     match any additional metaDataVar track associations. A stanza is finished at the
     next blank line or at the end of the file. Returns the ordered dictionary of
     saved tracks produced by finishCurrentRecord() and buildTrackAssociations().
     """
     bigDataUrls = OrderedDict()
     # Names of all parents that are part of the composite, seeded with the
     # requested track itself.
     compositeNames = [trackName]
     raStanza = OrderedDict()
     track = None
     # True while the current stanza holds lines not yet passed to finishCurrentRecord()
     recordOpen = False
     with open(raFile,'r') as raHandle:
         for line in raHandle:
             line = line.strip()
             # Ignore comment lines
             if line.startswith('#'):
                 continue
             # A blank line ends the current record
             if not line:
                 if recordOpen:
                     finishCurrentRecord(trackName,track,raStanza,field,bigDataUrls,compositeNames)
                     recordOpen = False
                 continue
             words = line.split()
             # Only a line whose first word is exactly 'track' starts a record;
             # settings whose names merely begin with those letters are stored below.
             if words[0] == 'track':
                 track = startNewRecord(line,raStanza)[1]
                 recordOpen = True
             # Lines ahead of the first track stanza are skipped (support for useOneFile on)
             elif track is not None:
                 raStanza[track][words[0]] = words[1:]
                 recordOpen = True
     # The file may end without a trailing blank line; finish the last record too.
     if recordOpen:
         finishCurrentRecord(trackName,track,raStanza,field,bigDataUrls,compositeNames)
     buildTrackAssociations(bigDataUrls,metaDataVar,raStanza,subGroupsRemove,compositeNames)
     return(bigDataUrls)
 
 def tryDelete(file):
     """
     Remove the given path when it is an existing regular file; do nothing otherwise.
     """
     if not os.path.isfile(file):
         return
     os.remove(file)
 
 def handle_exit(tempFiles,trackName,outDir):
     """
     Clean up function meant to run when the script exits. Deletes, when present,
     every path listed in tempFiles, then bed3Sources.as in the current directory,
     then the <outDir>/<trackName>.multiBed.bed file.
     """
     leftovers = list(tempFiles)
     leftovers.append('bed3Sources.as')
     leftovers.append('{outDir}/{trackName}.multiBed.bed'.format(trackName=trackName,outDir=outDir))
     for leftover in leftovers:
         tryDelete(leftover)
         
 def createBed3as():
     """
     Write the bed3Sources table definition to the file bed3Sources.as in the
     current directory, overwriting any existing file of that name.
     """
     tableDefinition = '''table bed3Sources
 "BED3+2 with a count and list of sources for combined data"
     (
     string chrom;      "Reference sequence chromosome or scaffold"
     uint   chromStart; "Start position in chromosome"
     uint   chromEnd;   "End position in chromosome"
     uint sourceCount;  "Number of sources"
     uint[sourceCount] sourceIds; "Source ids"
     )'''
     with open('bed3Sources.as','w') as asFile:
         asFile.write(tableDefinition)
         
 def createMultiBedSourcesFile(trackName,outDir,raInfo):
     """
     Write <outDir>/<trackName>.multiBedSources.tab. Each track in raInfo gets one
     line, numbered from 1 in iteration order: the number, the track name and, when
     the entry has a 'relatedTracks' list, those track names, all tab-separated.
     """
     sourcesPath = '{outDir}/{trackName}.multiBedSources.tab'.format(trackName=trackName,outDir=outDir)
     with open(sourcesPath,'w') as sourcesFile:
         for trackNumber, subtrack in enumerate(raInfo.keys(), 1):
             subtrackInfo = raInfo[subtrack]
             row = '{0}\t{1}'.format(trackNumber, subtrack)
             if 'relatedTracks' in subtrackInfo.keys():
                 row += '\t' + '\t'.join(subtrackInfo['relatedTracks'])
             sourcesFile.write(row + '\n')
 
 def checkTools(toolsPath,tool):
     """
     Look for one of the required tools (bedtools, bedToBigBed or bigBedToBed) and
     return the name or path it can be invoked with. Tried in order: the bare tool
     name, toolsPath itself when it resolves to an executable whose path contains the
     tool name, <toolsPath>/<tool>, and ./<tool>. When none is found, a tool-specific
     message is printed and SystemExit is raised with status 1.
     """
     bedToolsErrorMessage = ("bedtools not found in path or local directory. If it is "
         "not installed, it can be found here: https://bedtools.readthedocs.io\nIf it is "
         "installed but not in path, the directory where bedtools can be found can be "
         "specified with the -p /path/to/dir flag.\nE.g. trackDbIndexBb -p /path/to/dir")
     kentUtilErrorMessage = ("bedToBigBed/bigBedToBed not found in path or local "
         "directory.\nThe program help message includes commands to download the "
         "utilities. They can be found here: http://hgdownload.soe.ucsc.edu/admin/exe/")
     if find_executable(tool) is not None:
         return tool
     # toolsPath may point directly at the tool's executable instead of a directory.
     toolsPathExecutable = find_executable(toolsPath)
     if toolsPathExecutable is not None and tool in toolsPathExecutable:
         return toolsPath
     candidates = ("{toolsPath}/{tool}".format(toolsPath=toolsPath,tool=tool),
                   "./{tool}".format(tool=tool))
     for candidate in candidates:
         if find_executable(candidate) is not None:
             return candidate
     if tool == 'bedtools':
         print(bedToolsErrorMessage)
     else:
         print(kentUtilErrorMessage)
     # A missing dependency is a failure, so exit with a non-zero status. SystemExit
     # is raised directly because this file does not import sys.
     raise SystemExit(1)
         
 def trackDbIndexBb(trackName, trackDbRaFile, chromSizes, outDir, toolsPath, subGroupsRemove, metaDataVar, noDelete):
     """
     Create the multiBed bigBed and the sources file to be used with the
     hideEmptySubtracks trackDb setting. Locates the necessary tools (bedtools,
     bigBedToBed, bedToBigBed), parses the trackDb.ra file, and converts every saved
     bigDataUrl to a temporary bed file (retrying relative to the directory of the
     .ra file when the first attempt fails). Then runs bedtools multiinter over the
     temporary beds keeping columns 1-5, writes the .as file, converts the multiBed
     to bigBed and writes the sources file numbering the tracks 1-n with any related
     tracks. Temporary files and bed3Sources.as are always removed; the intermediate
     multiBed.bed is kept only when the run succeeds and noDelete is set. Finally
     prints a completion message. Exits with status 1 when no bigBed track is found.
     """ 
     bedtools = checkTools(toolsPath,'bedtools')
     bigBedToBed = checkTools(toolsPath,'bigBedToBed')
     bedToBigBed = checkTools(toolsPath,'bedToBigBed')
     raInfo = raInfoExtract(trackDbRaFile,'bigDataUrl',subGroupsRemove,trackName,metaDataVar)
     bbFiles = [raInfo[subtrack]['bigDataUrl'] for subtrack in raInfo]
     if not bbFiles:
         # Nothing to index: stop with a clear message before running any tool.
         print("No bigBed tracks with a bigDataUrl were found for track " + \
               "'{trackName}' in {raFile}".format(trackName=trackName,raFile=trackDbRaFile))
         raise SystemExit(1)
     multiBedBed = '{outDir}/{trackName}.multiBed.bed'.format(trackName=trackName,outDir=outDir)
     multiBedBb = '{outDir}/{trackName}.multiBed.bb'.format(trackName=trackName,outDir=outDir)
     raDir = os.path.dirname(trackDbRaFile)
     tempFiles = []
     succeeded = False
     # try/finally takes the place of an exit handler so the clean up can honour
     # noDelete; it still runs on errors, SystemExit and keyboard interrupts.
     try:
         # Convert all BB to bed
         with open(os.devnull, 'w') as FNULL:
             for bb in bbFiles:
                 # mkstemp creates the file up front, avoiding the name race of mktemp
                 tempHandle, tfName = tempfile.mkstemp()
                 os.close(tempHandle)
                 tempFiles.append(tfName)
                 try:
                     subprocess.check_call([bigBedToBed, bb, tfName],
                                           stdout=FNULL, stderr=subprocess.STDOUT)
                 except subprocess.CalledProcessError:
                     # The bigDataUrl may be relative to the .ra file; retry from there,
                     # this time letting the tool's error output through.
                     subprocess.check_call([bigBedToBed, os.path.join(raDir, bb), tfName],
                                           stderr=subprocess.STDOUT)
         # Make multiBed intersection of where all tracks have data. The two commands
         # are piped without a shell so the exit status of each one can be checked.
         with open(multiBedBed, 'w') as multiBedFile:
             multiinter = subprocess.Popen([bedtools, 'multiinter', '-i'] + tempFiles,
                                           stdout=subprocess.PIPE)
             cut = subprocess.Popen(['cut', '-f', '1-5'], stdin=multiinter.stdout,
                                    stdout=multiBedFile)
             # Close our copy of the pipe so multiinter is signalled if cut exits early
             multiinter.stdout.close()
             cutStatus = cut.wait()
             multiinterStatus = multiinter.wait()
         if multiinterStatus != 0:
             raise subprocess.CalledProcessError(multiinterStatus, bedtools + ' multiinter')
         if cutStatus != 0:
             raise subprocess.CalledProcessError(cutStatus, 'cut')
         # Make BB .as file, and convert multiBed to BB
         createBed3as()
         subprocess.check_call([bedToBigBed, '-as=bed3Sources.as', '-type=bed3+2',
                                multiBedBed, chromSizes, multiBedBb])
         # Create index source file associating all tracks to a number
         createMultiBedSourcesFile(trackName,outDir,raInfo)
         succeeded = True
     finally:
         if succeeded and noDelete:
             # Keep the intermediate multiBed.bed; remove everything else
             for tempFile in tempFiles:
                 tryDelete(tempFile)
             tryDelete('bed3Sources.as')
         else:
             handle_exit(tempFiles,trackName,outDir)
     # Print completion. The pieces are joined inside parentheses before .format()
     # is called, so every placeholder is filled in rather than only the last literal.
     print(("Files successfully created:\n{outDir}/{trackName}.multiBed.bb\n{outDir}" + \
           "/{trackName}.multiBedSources.tab\n\nPlace the files in their final " + \
           "directory and add the following lines to the top trackDb stanza" + \
           ":").format(outDir=outDir,trackName=trackName))
     print(("hideEmptySubtracks on\n" + \
           "hideEmptySubtracksMultiBedUrl $myPath/{trackName}.multiBed.bb\n" + \
           "hideEmptySubtracksSourcesUrl $myPath/{trackName}.multiBedSources.tab").format(trackName=trackName))
         
 def main():
     """
     Read the command line options and run trackDbIndexBb() with them.
     """
     options = parseArgs()
     trackDbIndexBb(
         trackName=options.trackName,
         trackDbRaFile=options.trackDbRaFile,
         chromSizes=options.chromSizes,
         outDir=options.outDir,
         toolsPath=options.toolsPath,
         subGroupsRemove=options.subGroupRemove,
         metaDataVar=options.metaDataVar,
         noDelete=options.noDelete)
     
 # Run only when executed as a script, so importing this file has no side effects.
 if __name__ == "__main__":
     main()