b6cbd59bc90b14b3a643ba42783040e769d238bb
chmalee
  Wed Jul 21 10:44:57 2021 -0700
Add gtexGenev8 to list of tables always downloaded by hgMirror and gbib, since it is a default track on hg38, refs #27854

diff --git src/hg/hgMirror/hgMirror src/hg/hgMirror/hgMirror
index de6ebe2..396bd17 100755
--- src/hg/hgMirror/hgMirror
+++ src/hg/hgMirror/hgMirror
@@ -1,1941 +1,1941 @@
 #!/usr/bin/env python2.7
 
 # A little CGI interface to download the tables for a set of tracks via udr 
 # to local machine. This is mostly useful when setting up a mirror or a VM
 # of the browser. It does not run on hgwdev.
 
 # This script does the following:
 # - get trackDb and grp table from hgDownload
 # - get table and gbdb sizes from ucsc rsync server
 # - get list with track<->filename for all bigfile tracks from hgwdev
 # - try to assign table names to gbdb files using this list and some hacky rules
 # - parse hg.conf to find mysql server and hide a few tracks in its trackDb
 # - infer track/subtrack hierarchy by parsing trackDb
 # - generate HTML table with labels/sizes/tablecounts for all tracks and their child tracks
 # - when user clicks submit, start rsync transfer and redirect to page that shows progress
 # - handles non-existing tables and some hgFixed tables
 
 # When run from the command line, this CGI hides some tracks and removes tracks from
 # track search on hg19. This is usually run from a cronjob after trackDb updates.
 
 # This script requires the following setup
 # - mysqldb python module
 # - "rsync" in path
 # - "at" in path
 #   To allow this on ubuntu, add these lines to /etc/sudoers:
 #www-data     ALL = (mysql:mysql) NOPASSWD: /bin/ls,/usr/bin/rsync,/bin/rm
 #www-data     ALL = (root:root) NOPASSWD: /bin/mkdir /gbdb
 #www-data     ALL = (root:root) NOPASSWD: /bin/chown www-data.www-data /gbdb
 # - the apache user has to be able to run 'at' jobs. 
 #   To allow this on ubuntu, need to run this command to remove www-data from /etc/at.deny
 #     sudo sed -i s/www-data//g /etc/at.deny
 
 # This script does not handle:
 # tables joined to other tables are not downloaded. Would have to parse all.joiner for that.
 
 # format python errors in html, as we're a CGI script
 import cgi
 import cgitb; cgitb.enable()
 
 # these are default python modules on python 2.7, no errors expected here
 import urllib, urllib2, zlib, collections, StringIO, gzip, string, sys, os, random, \
     subprocess, re, types, socket, cPickle, copy, glob, tempfile, Cookie
 
 from collections import defaultdict, namedtuple
 from os.path import *
 from distutils import spawn
 
 # import the UCSC-specific library
 sys.path.append(join(dirname(__file__), "pyLib"))
 try:
     from hgLib import getNonce, getCspMetaHeader, jsOnEventByIdF, jsInlineFinish 
 	    # cgiArgs, cgiSetup, cgiString, printContentType, printMenuBar, \
             # sqlConnect, sqlQuery, errAbort, cfgOption, runCmd, cgiGetAll, printHgcHeader, \
             # printHgcSection, 
 	    # webStartGbNoBanner, htmlPageEnd, hConnectCentral, sqlTableExists, \
             # readSmallFile
 except:
     print("Content-type: text/html\n")
     print("Cannot find the directory cgi-bin/pyLib in Apache. This is an installation error.")
     print("All all parts of cgi-bin installed? Did you do 'make' in kent/src/hg/pyLib?")
 
 defaultDb = "hg19"
 
 # the mysqldb module has to be installed with one of these commands:
 # - many common linuxes and OSX: pip install mysqldb
 # - debian: sudo apt-get install python-mysqldb
 # - fedora/centos/redhat: sudo yum install python-mysqldb
 # The script works without the mysqldb module but cannot auto-hide some tracks.
 mysqlDbLoaded = True
 try:
     import MySQLdb
 except:
     mysqlDbLoaded = False
 
 # default mysql data dir on debian-based distros
 MYSQLDIR = "/var/lib/mysql"
 
 # can probably autodetect this, but hardcoded here
 APACHEUSER = "www-data"
 
 # optional file with rows to add to tableList
 TABLELISTADD = "/root/tableListAdd.hg19.tab"
 
 # directory for temporary files, keep trailing slash
 TMPDIR = "/tmp/hgMirror/"
 
 #DEBUG=True
 DEBUG=False
 
 # list of tables to exclude from track search
 REMOVESEARCH = ["wgEncodeGencodeBasicV19", "wgEncodeGencodeCompV17", "wgEncodeGencodeBasicV14", "wgEncodeGencodeBasicV17", "wgEncodeGencodeCompV14", "mgcFullMrna", "wgEncodeGencodeBasicV7", "orfeomeMrna", "wgEncodeGencodePseudoGeneV14", "wgEncodeGencodePseudoGeneV17", "wgEncodeGencodePseudoGeneV19", "wgEncodeGencodeCompV7", "knownGeneOld6", "geneReviews", "transMapAlnSplicedEst", "gbCdnaInfo", "oreganno", "vegaPseudoGene", "transMapAlnMRna", "ucscGenePfam", "qPcrPrimers", "transMapAlnUcscGenes", "transMapAlnRefSeq", "genscan", "bacEndPairs", "fosEndPairs"]
 
 # list of tracks to hide by default
 FORCEHIDE = ["intronEst", "cons100way", "cons46way", "ucscRetroAli5", "mrna",
         "wgEncodeRegDnaseClustered"]
 
 # always copy these (small) tables for the current db, if they exist
 FORCETABLES = ['cytoBand', 'chromInfo', 'cytoBandIdeo', 'kgColor', \
     'knownGene', 'kgXref', 'ensemblLift', 'ucscToEnsembl','wgEncodeRegTfbsCells', \
     'tableList', 'refSeqStatus', 'wgEncodeRegTfbsCellsV3', 'extFile', 'trackDb', 'grp',
     'ucscRetroInfo5', "refLink", "ucscRetroSeq5", "ensemblLift", "knownCanonical",
     'gbExtFile', 'flyBase2004Xref',
     # for gencode/knownGene tracks and hg38 in particular
     "knownToTag", "ncbiRefSeqLink", "ncbiRefSeqCurated", "gtexGeneModel", "gtexGene", "knownAttrs",
-    "seqNcbiRefSeq",
+    "seqNcbiRefSeq", "gtexGeneModelV8", "gtexGeneV8",
     # for faster searches
     'hgFindSpec', 'ensemblToGeneName', "ucscToINSDC",
     # these are almost required for searches, common tracks and not too big
     "ensGene", "xenoRefGene",
     # added in Feb 2021, necessary now for knownGenes display
     "knownCds"
     ]
 
 # always copy these hgFixed tables
 FORCEFIXED = ['trackVersion']
 #FORCEFIXED = ['trackVersion', 'tableList']
 
 # big file table base URL
 # points to a http directory with <db>/bigFiles.tab files that tell us which bigfile goes to which track
 #BIGFILETABLEURL = "http://hgwdev.soe.ucsc.edu/~max/browserbox/%s/bigFiles.tab.gz" # %s == db
 BIGFILETABLEURL = "http://hgdownload.soe.ucsc.edu/goldenpath/%s/database/bigFiles.txt.gz"
 
 # cache of hg.conf dict
 hgConf = None
 
 def parseConf(fname):
     """ parse a hg.conf-style file and return it as a dict key -> value (all
     strings). Lines starting with '#' are skipped; 'include <file>' lines are
     followed recursively, relative to the including file's directory. """
     conf = {}
     for line in open(fname):
         line = line.strip()
         if line.startswith("#"):
             continue
         elif line.startswith("include "):
             inclFname = line.split()[1]
             inclPath = abspath(join(dirname(fname), inclFname))
             if isfile(inclPath):
                 inclDict = parseConf(inclPath)
                 conf.update(inclDict)
         elif "=" in line: # string search for "="
             # str.split replaces the python2-only string.split() helper,
             # which no longer exists in python3; behavior is identical
             key, value = line.split("=", 1)
             conf[key] = value
     return conf
 
 
 def parseHgConf():
     """ parse hg.conf (located in this CGI's directory) once and cache the
     result in the module-global hgConf dict; returns dict key -> value """
     global hgConf
     if hgConf is not None:
         # already parsed during this invocation
         return hgConf

     confPath = join(dirname(__file__), "hg.conf")
     hgConf = parseConf(confPath)
     return hgConf
 
 def sqlConnect(db, name):
     """ open a MySQLdb connection to database 'db'.
     name="public" uses the UCSC public mysql server with the well-known
     read-only account; name="local" reads host/user/password from this
     cgi-bin's hg.conf.
     NOTE(review): any other 'name' value leaves host/user/passwd unbound and
     raises NameError — confirm callers only ever pass these two values.
     """
     if name=="public":
         host, user, passwd = "genome-mysql.soe.ucsc.edu", "genomep", "password"
     elif name=="local":
         cfg = parseHgConf()
         host, user, passwd = cfg["db.host"], cfg["db.user"], cfg["db.password"]
     conn = MySQLdb.connect(host=host, user=user, passwd=passwd, db=db)
     return conn
 
 def debug(msg):
     " write msg to the HTML output (with a line break) when DEBUG is set "
     if not DEBUG:
         return
     print(msg + "<br>")
     sys.stdout.flush()
 
 def runningAtUcsc():
     " true if this script is executing on a UCSC hgwdev host "
     return "hgwdev" in socket.gethostname()
     
 def runCmd(cmd, mustRun=True):
     """ wrapper around os.system that makes sure sudo is not called at UCSC.
     Returns the os.system exit status; if mustRun is True and the command
     fails, prints an error and exits the CGI. """
     # safety net: never execute sudo commands when running on hgwdev
     if runningAtUcsc() and cmd.startswith("sudo"):
         return 0

     ret = os.system(cmd)
     if ret!=0 and mustRun:
         print "Could not run command %s" % cmd
         sys.exit(0)
     return ret
 
 def loadGroups(db):
     """ load grp table via mysql and return as a list of tuples (name, label),
     ordered by the grp.priority column. Falls back to downloading the grp
     table dump from hgdownload when the MySQLdb module is not installed. """
     groups = []
     if mysqlDbLoaded:
         conn = sqlConnect(db, "public")
         cur = conn.cursor()
         cur.execute("SELECT name, label from grp order by priority")
         groups = []
         for row in cur.fetchall():
             groups.append((row[0], row[1]))
         cur.close()
         conn.close()
     else:
         # no mysql client module: parse the table dump from hgdownload instead
         for row in downloadTable(db, "grp"):
             groups.append((row[0], row[1]))
     return groups
 
 def downloadTable(db, table):
     """
     download table from hgdownload by parsing sql file first to get the field
     names, then the tab sep file. Returns a list of objects, with field names
     as attributes and their values from the tab sep file.

     The .txt.gz dump is cached in TMPDIR so repeated CGI invocations do not
     re-download it.
     """
     baseUrl = 'http://hgdownload.soe.ucsc.edu/goldenPath/%s/database/' % db

     # parse the .sql file and create a namedtuple "struct" for it
     sqlUrl = baseUrl+table+".sql"
     sqlLines = urllib2.urlopen(sqlUrl).read().splitlines()
     fieldNames = []
     for l in sqlLines:
         if l.strip().startswith("PRIMARY KEY"):
             continue
         # column definitions in mysqldump output are indented by two spaces
         if l.startswith("  "):
             fieldName = l.split()[0].strip("`")
             fieldNames.append(fieldName)
     Struct = namedtuple("rec", fieldNames)

     # read the tab-sep data
     # can use a cached copy from /tmp
     tmpFname = TMPDIR+db+"."+table+".txt.gz"
     if isfile(tmpFname):
         data = open(tmpFname)
     else:
         dataUrl = baseUrl+table+".txt.gz"
         remoteData = urllib2.urlopen(dataUrl).read()
         data = StringIO.StringIO(remoteData) # gunzipping requires to wrap a pseudo-file around the gzip data
         # write to cache file
         tmpFh = open(tmpFname, "w")
         tmpFh.write(remoteData)
         tmpFh.close()
     data = gzip.GzipFile(fileobj=data).read()
     # protect mysql-escaped newlines/tabs before splitting on them below
     data = data.replace("\\\n", "\a") # translate escaped mysql newline to \a
     data = data.replace("\\\t", "\b") # translate escaped mysql tab to \b
     lines = data.split("\n")

     # convert tab-sep lines to namedtuples (=objects)
     rows = []
     for line in lines:
         if len(line)==0:
             continue
         fields = line.split("\t")
         # restore the escaped newlines/tabs inside field values
         fields = [f.replace("\a", "\n").replace("\b", "\t") for f in fields]
         row = Struct(*fields)
         rows.append(row)

     return rows
 
 def parseRa(text):
     """ parse an ra-style string (one 'key value' pair per line) and return it
     as a dict name:value. Lines without a space are skipped. """
     lines = text.split("\n")
     data = dict()
     for l in lines:
         if len(l)==0:
             continue
         if " " not in l:
             continue
         # str.split replaces the python2-only string.split() helper, which no
         # longer exists in python3; behavior is identical
         key, val = l.split(" ", 1)
         data[key] = val
     return data
 
 def getParent(settingDict):
     """ given a dict key -> value from trackDb, return the name of the track's
     parent, taken from the 'subTrack', 'parent' or 'superTrack' settings,
     or None if the track has no parent.

     Precedence: 'parent' wins over 'subTrack'; 'superTrack' is consulted only
     when 'parent' is absent; 'superTrack on' does not name a parent.
     """
     parent = None

     # "subTrack <trackName> [on]" - keep only the track name
     if "subTrack" in settingDict:
         parent = settingDict["subTrack"].split()[0]

     # "parent <trackName> [on]" - overrides subTrack when both are present
     if "parent" in settingDict:
         parent = settingDict["parent"].split()[0]
     elif "superTrack" in settingDict:
         parent = settingDict["superTrack"].split()[0]
         if parent=="on": # "superTrack on" marks a container, not a parent
             parent = None

     return parent
 
 def getTrackVis(settings):
     """ given a trackDb settings dict, return (visibility, isSuperTrack).

     Visibility can come from three settings, checked in this order so that
     superTrack has priority (see wgEncodeRegMarkH3k27ac):
     - "visibility <vis>" (default is "hide")
     - "subTrack <parent> on" / "parent <parent> on" force "full"
     - "superTrack on [show|hide]" or "superTrack <parent> <vis>";
       a bare "superTrack on" implies "hide"
     """
     vis = settings.get("visibility", "hide")
     isSuper = False

     # a trailing "on" after subTrack/parent switches the subtrack on
     for key in ("subTrack", "parent"):
         if key in settings:
             if settings[key].split()[-1]=="on":
                 vis = "full"

     if "superTrack" in settings:
         words = settings["superTrack"].split()
         if words[0]=="on":
             # this track IS a superTrack container
             isSuper = True
             if len(words)==1 or words[1]=="hide":
                 vis = "hide"
             elif words[1]=="show":
                 vis = "full"
             else:
                 assert(False)
         elif len(words)==2:
             # "superTrack <parent> <vis>": explicit visibility
             vis = words[1]
         elif len(words)==1:
             isSuper = True
             vis = "hide"
         else:
             assert(False)

     return vis, isSuper
 
 def parseTrackDb(db, tableSizes):
     """ download and parse trackDb, returns 8 values
     1) trackLabels = dict trackName -> shortLabel
     2) trackTables = dict trackName -> tableName (usually the same as trackName, but not for encode and genbank)
     3) trackParents = dict trackName -> trackName of parent track
     4) topLevelTracks = dict group -> list of top-level trackNames (not table names)
     5) pseudoTracks = set of names of tracks that have no tables ("views", "composites", "superTracks", "container multiWig")
     6) trackVis = dict trackName -> visibility
     7) superTracks = set of all superTrack names
     8) bigDataFiles = dict trackName -> bigDataUrl value
     """
     # trackToTableNames is defined elsewhere in this file; maps track -> table names
     sqlTrackTables = trackToTableNames(tableSizes)
     rows = downloadTable(db, "trackDb")
     trackLabels = dict()
     trackParents = dict()
     trackTables = dict()
     trackBigDataFiles = dict()
     groups = defaultdict(list)
     pseudos = set()
     trackVis = dict()
     superTracks = set()

     for row in rows:
         track = row.tableName
         shortLabel = row.shortLabel
         settings = parseRa(row.settings)
         # get visibility
         trackVis[track], isSuperTrack = getTrackVis(settings)
         if isSuperTrack:
             superTracks.add(track)
         # a track has no associated table if:
         # - it defines any view with "view xxx"
         # - it sets "compositeTrack on"
         # - it sets "superTrack on"

         if      "view" in settings or \
                 settings.get("compositeTrack","")=="on" or \
                 settings.get("container","")=="multiWig" or \
                 isSuperTrack:
             pseudos.add(track)
             isPseudo = True
         else:
             isPseudo = False

         parent = getParent(settings)
         if parent!=None:
             trackParents[track] = parent
         else:
             # top-level track: file it under its trackDb group
             group = settings.get("group")
             groups[group].append(track)

         trackLabels[track] = shortLabel

         # NOTE(review): tableName is computed here but never used below
         if "table" in settings:
             tableName = settings["table"]
         else:
             tableName = track

         if not isPseudo:
             trackTables[track] = sqlTrackTables[track]

         if "bigDataUrl" in settings:
             trackBigDataFiles[track] = settings["bigDataUrl"]
             # create a pseudo table, so we know we have to download something
             trackTables[track] = [track]

         # same for bigGeneDataUrl, only used on knownGene right now
         if "bigGeneDataUrl" in settings:
             trackBigDataFiles[track] = settings["bigGeneDataUrl"]
             trackTables[track] = [track]

         if track=="bacEndPairs":
             # NOTE(review): assumes trackTables[track] supports .add(), i.e. came
             # from sqlTrackTables; would fail on the [track] list set above — confirm
             trackTables[track].add("all_bacends")

     return trackLabels, trackTables, trackParents, groups, pseudos, \
         trackVis, superTracks, trackBigDataFiles
 
 def htmlHeader():
     """ print start of page: CSP meta header, jquery/css includes and the
     UCSC menu bar read from the htdocs include file """
     print """
 <html>
 <head>
 %s
 <title>UCSC Genome Browser mirror tool</title>
 
 <script type='text/javascript' SRC='../js/jquery.js'></script>
 <script type='text/javascript' SRC='../js/jquery.plugins.js'></script>
 <link rel="stylesheet" href="../style/HGStyle.css" type="text/css" />
 <link rel='stylesheet' href='../style/nice_menu.css' type='text/css' />
 
 </head>
 <body>
     """ % (getCspMetaHeader())
     print open("../htdocs/inc/globalNavBar.inc").read()
     sys.stdout.flush()
 
 def htmlFooter():
     " print end of page: close the body and html tags opened by htmlHeader "
     print """
 </body>
 </html>
     """
 
 def findTopParent(trackParents, trackName):
     """ walk up the parent chain of trackName and return the top-level
     ancestor (the first track that has no entry in trackParents) """
     current = trackName
     while current in trackParents:
         current = trackParents[current]
     return current
 
 def makeTrackHierarchy(trackParents):
     """ given a dict with track->parent return dict with parent->list of child
     tracks """
     debug("hierarchy")
     trackChildren = dict()
     # .items() replaces the python2-only .iteritems(); behavior is identical
     # and keeps this helper portable
     for track, parent in trackParents.items():
         trackChildren.setdefault(parent, []).append(track)
     return trackChildren
 
 def getAllChildren(trackName, trackChildren):
     """ given a track name and the parent -> children dict, return the list of
     all children, grandchildren etc (recursive, parents before their kids) """
     if trackName not in trackChildren:
         return []
     children = trackChildren[trackName]
     # isinstance replaces the python2-only types.ListType check, which was
     # removed in python3
     assert(isinstance(children, list))

     tracks = []
     tracks.extend(children)
     for child in children:
         grandkids = getAllChildren(child, trackChildren)
         tracks.extend(grandkids)
     return tracks
 
 def getTrackTables(trackName, trackTables, trackChildren):
     """ return list of all track or table names required for a track,
     this includes:
     - tables that are split or specified via the trackDb "table" statement
     - track names that do not have a table associated with them
     - the same for all sub tracks of the track
     """
     subTracks = getAllChildren(trackName, trackChildren)
     subTracks.append(trackName)

     names = set()
     if trackName in ["nonTrackTables", "allTables"]:
         # just use the table names as they are, there might not be tracks for them
         names = set(trackChildren[trackName])
         names.discard("history")
     else:
         # resolve track name -> table for every track (e.g. split tables, encode)
         for sub in subTracks:
             if sub in trackTables:
                 names.update(trackTables[sub])

     return list(names)
 
 def humanReadable(totalSize):
     " format a byte count as a string with a TB/GB/MB/kb/bytes suffix "
     kbyte = 1024
     mbyte = kbyte*1024
     gbyte = mbyte*1024
     tbyte = gbyte*1024
     # TB and GB get one decimal; MB/kb/bytes are whole numbers
     if totalSize>tbyte:
         return "%.1f TB" % (float(totalSize)/tbyte)
     if totalSize>gbyte:
         return "%.1f GB" % (float(totalSize)/gbyte)
     if totalSize>mbyte:
         return "%d MB" % (totalSize/mbyte)
     if totalSize>kbyte:
         return "%d kb" % (totalSize/kbyte)
     return "%d bytes" % (totalSize)
 
 #def freespace(p):
     #""" Returns the number of free bytes on the drive that p is on """
     # does not make a lot of sense in virtual box, with a virtual disk that is auto-extending
     #s = os.statvfs(p)
     #return s.f_bsize * s.f_bavail
 
 def trackToFiles(trackName, trackTables, trackChildren, tableSizes, tableToGbdbFiles, noTableTracks, gbdbSizes):
     """ return (mysqlTables, gbdbFiles) needed to mirror the given trackName """
     # pseudo track names that map to fixed file sets, no table resolution needed
     if trackName=="allGbdb":
         # no tables, but every known gbdb file
         return [], gbdbSizes.keys()
     if trackName=="liftOver":
         return [], tableToGbdbFiles.get("liftOver", [])
     if trackName=="nonTrackTables":
         return trackChildren.get("nonTrackTables", []), []

     allTableNames = getTrackTables(trackName, trackTables, trackChildren)

     # this predefined track has tables but no gbdb files
     if trackName=="allTables":
         return allTableNames, []

     tables = []
     gbdbFiles = []
     for tbl in allTableNames:
         if tbl in noTableTracks:
             continue
         tables.append(tbl)

         for fname in tableToGbdbFiles[tbl]:
             # for the default conservation set, skip the big alignment files to save space
             if trackName=="defaultConsTables" and (fname.endswith(".maf") or fname.endswith(".wib")):
                 continue
             gbdbFiles.append(fname)

     return tables, gbdbFiles
 
 def makeTableFileList(jobId, db, trackNames, trackTables, trackChildren, tableToGbdbFiles, tableSizes, forceTables, noTableTracks, gbdbSizes):
     """
     create rsync include files in TMPDIR for a list of tracks and return their
     three names: (mysqlListFname, gbdbListFname, hgFixedListFname)

     special handling for "defaultConsTables"
     """
     # format jobId with %s everywhere: the code used to mix %s and %d, which
     # breaks when jobId is passed as a string (for ints the output is identical)
     outFname = TMPDIR+"%s_mysql_filesToDownload.txt" % jobId
     mysqlListFh = open(outFname, "w")
     gbdbListFname = TMPDIR+"%s_gbdb_filesToDownload.txt" % jobId
     gbdbListFh = open(gbdbListFname, "w")

     # write table and gbdb files names to two different files, one for mysql rsync, one for gbdb rsync
     for trackName in trackNames:
         tableNames, gbdbFiles = trackToFiles(trackName, trackTables, trackChildren, tableSizes, \
                 tableToGbdbFiles, noTableTracks, gbdbSizes)

         for table in tableNames:
             # skip tables that do not exist or are empty on the server
             if table in tableSizes and tableSizes[table]!=0:
                 for ext in [".MYD", ".MYI", ".frm"]:
                     mysqlListFh.write("%s%s\n" % (table,ext))

         for gbdbFname in gbdbFiles:
             gbdbListFh.write(gbdbFname+"\n")

     # add the db twobit file and the trackDb index
     gbdbListFh.write("%s.2bit\n" % db)
     gbdbListFh.write("html/description.html\n")
     gbdbListFh.write("trackDb.ix\n")
     gbdbListFh.write("trackDb.ixx\n")

     # add some special tables that are always good to have
     for table in forceTables:
         for ext in [".MYD", ".MYI", ".frm"]:
             mysqlListFh.write("%s%s\n" % (table,ext))

     mysqlListFh.close()
     gbdbListFh.close()

     # same thing for files in hgFixed
     fixListFname = TMPDIR+"%s_hgFixed_filesToDownload.txt" % jobId
     ofh = open(fixListFname, "w")
     for table in FORCEFIXED:
         for ext in [".MYD", ".MYI", ".frm"]:
             ofh.write("%s%s\n" % (table,ext))
     ofh.close()

     return outFname, gbdbListFname, fixListFname
 
 def printTrackInfo(db, trackName, trackTables, trackLabels, localSizes, trackChildren, tableSizes, \
             trackToGbdbFiles, gbdbSizes, showSubtracks, indent, noTableTracks):
     """ print one track as an html line with a checkbox, label, table count and
     sizes; recurses into child tracks when showSubtracks is set. The checkbox
     is disabled (and the line greyed out) when there is nothing to mirror or
     everything is already downloaded locally. """
     trackTableNames, gbdbFnames = trackToFiles(trackName, trackTables, trackChildren, tableSizes, \
         trackToGbdbFiles, noTableTracks, gbdbSizes)

     tableSize = sum([tableSizes.get(n, 0) for n in trackTableNames])
     gbdbSize = sum([gbdbSizes.get(n, 0) for n in gbdbFnames])

     # add note for omim/decipher/etc
     label = trackLabels[trackName]
     addHtml, addNote = "", ""
     remoteSize = tableSize + gbdbSize

     if remoteSize==0:
         addHtml = 'disabled="disabled"'
         addNote = "<small>no mirroring, not present or source database restricts data distribution</small>"

     sizeStr = humanReadable(tableSize)
     if len(trackTableNames)==1:
         tableStr = "%d table" % len(trackTableNames)
     else:
         tableStr = "%d tables" % len(trackTableNames)

     localTableSize = sum([localSizes.get(track, 0) for track in trackTableNames])
     localGbdbSize = sum([localSizes.get(fname, 0) for fname in gbdbFnames])

     localSize = localTableSize + localGbdbSize
     debug("remoteTableSize %d, remote gbdb size %d, local table size %d, localGbdbSize %d" % \
         (tableSize, gbdbSize, localTableSize, localGbdbSize))

     # grey out tracks that are already fully mirrored locally
     status = ""
     isGrey = False
     if remoteSize!=0 and remoteSize<=localSize:
         addHtml = 'disabled="disabled"'
         isGrey = True
         status = ", downloaded"
     elif localTableSize != 0:
         status = ", partially downloaded"

     indentStr = ""
     if indent!=0:
         indentStr = "".join(indent*["&nbsp;&nbsp;&nbsp;"])

     gbdbSizeStr = humanReadable(gbdbSize)
     gbdbFCount = len(gbdbFnames)
     gbdbStr=""
     if gbdbFCount!=0:
         gbdbStr = " + %(gbdbFCount)d gbdb files, %(gbdbSizeStr)s" % locals()

     debug("tables: "+str(trackTableNames))
     debug("gbdbFiles: "+str(gbdbFnames))

     # the %(name)s placeholders are filled from the local variables above
     outStr = '%(indentStr)s<input type="checkbox" %(addHtml)s name="%(trackName)s" id="%(trackName)s">'\
         '<a href="hgMirror?db=%(db)s&showFiles=%(trackName)s">%(label)s</a> ' \
         '(<span>%(trackName)s</span>): %(tableStr)s, %(sizeStr)s%(gbdbStr)s%(status)s %(addNote)s<br>' % locals()
     if isGrey:
         outStr = '<span style="color:#C0C0C0">'+outStr+'</span>'
     print outStr

     indent += 1
     if showSubtracks:
         for subTrack in trackChildren.get(trackName, []):
             printTrackInfo(db, subTrack, trackTables, trackLabels, localSizes, trackChildren, \
                 tableSizes, trackToGbdbFiles, gbdbSizes, \
                 showSubtracks, indent, noTableTracks)
 
 def getRevokedTracks(db):
     """ return the set of encode tables marked renamed/revoked/replaced in the
     metaDb table; these are ignored. The result is cached as a file in TMPDIR. """
     
     outFname = join(TMPDIR,"%s.revokedTrackes" % db)
     if isfile(outFname):
         tables = set(open(outFname).read().splitlines())
     else:
         query = 'select distinct obj from metaDb where val like "renamed%" or val like "revoked%" or val like "replaced%";'
         conn = sqlConnect(db, "public")
         cur = conn.cursor()
         # make sure that table exists
         try:
             cur.execute("SELECT 1 FROM metaDb LIMIT 1;")
         except:
             # NOTE(review): bare except treats any error as "no metaDb table on
             # this assembly" — confirm that this best-effort behavior is intended
             return set()

         cur.execute(query)
         tables = set()
         for row in cur.fetchall():
             tables.add(row[0])

         ofh = open(outFname, "w")
         ofh.write("\n".join(tables))
         ofh.close()
     
     return tables
 
 def htmlDropBox(selName, elList, selKey):
     """ 
     print an html dropdown box that reloads the page when a selection is made.
     elList is a list of (key, description) pairs and selKey is the key of the
     element selected by default. The select element is named selName; on
     change, the hidden orgForm is submitted with the new value in selName.
     """
     addStr = ""
     # changing the clade resets the organism; changing clade or org resets the db
     if selName=="clade":
         addStr += "document.orgForm.org.value = 0; "
     if selName=="org" or selName=="clade":
         addStr += "document.orgForm.db.value = 0; "

     print '''<SELECT id='%s' NAME="%s" >''' % (selName, selName)

     jsOnEventByIdF("change", selName, "document.orgForm.%s.value = document.mainForm.%s.options[document.mainForm.%s.selectedIndex].value; %s document.orgForm.submit();", selName, selName, selName, addStr)

     for row in elList:
         elKey, desc = row
         selStr = ""
         if elKey==selKey:
             selStr = " SELECTED"
         print '<OPTION%s VALUE="%s">%s</OPTION>' % (selStr, elKey, desc)
     print '</SELECT>'
 
 def printHiddenForm(clade, org, db):
     """ print the hidden orgForm; the javascript emitted by htmlDropBox writes
     the dropdown selections into this form and submits it """
     print """
     <FORM ACTION="hgMirror" METHOD="GET" NAME="orgForm">
     <input type="hidden" name="clade" value="%s">
     <input type="hidden" name="org" value="%s">
     <input type="hidden" name="db" value="%s">
     </FORM>
     """ % (clade, org, db)
 
 def getHgDownloadDbs():
     """ return the set of database names available on the hgdownload rsync
     server; the listing is cached as a file in TMPDIR so rsync is contacted
     only once """
     fname = TMPDIR+"hgdownload.dbs.txt"
     if isfile(fname):
         return set(open(fname).read().splitlines())

     proc = subprocess.Popen(['rsync','hgdownload.soe.ucsc.edu::mysql'],stdout=subprocess.PIPE)
     dbs = set()
     for line in proc.stdout:
         # the db name is the last whitespace-separated field of a listing line
         fields = line.rstrip("\n").split()
         if fields: # guard against blank lines in the rsync output
             dbs.add(fields[-1])
     proc.stdout.close()
     proc.wait() # reap the rsync child so no zombie process is left behind
     # close the cache file explicitly instead of relying on refcounting
     with open(fname, "w") as ofh:
         ofh.write("\n".join(dbs))
     return dbs
 
 def getOrgInfo(validDbs):
     """ collect the clade/genome/assembly lookup tables from the hgcentral
     database via the local mysql server and return them as one dict; the
     result is cached with cPickle in TMPDIR. Only dbs contained in validDbs
     are kept. """
     fname = TMPDIR+"orgInfo.cache"
     if isfile(fname):
          return cPickle.load(open(fname))

     defaultClade = "mammal"
     conn = sqlConnect(hgConf.get("central.db", "hgcentral"), "local")
     cur = conn.cursor()
     # list of all clade descriptions ("mammal", "Mammal"), ...
     cladeList = []
     cur.execute("SELECT name, label FROM clade ORDER BY priority")
     for row in cur.fetchall():
         cladeList.append((row[0], row[1]))

     # dict of clade -> list (orgName, orgName)
     cladeToOrgs = defaultdict(list)
     #cur.execute('SELECT clade, genome FROM genomeClade ORDER BY genome')
     cur.execute('SELECT clade, genomeClade.genome, name FROM genomeClade, dbDb where dbDb.genome=genomeClade.genome AND active=1 ORDER BY orderKey;')
     doneOrgs = set()
     for row in cur.fetchall():
         clade, genome, name = row
         if genome=="Human" and not clade=="model": # model is broken on hgwdev right now
             defaultClade = clade
         if row[2] not in validDbs:
             continue

         # keep only the first occurrence of each genome
         if row[1] in doneOrgs:
             continue
         else:
             cladeToOrgs[row[0]].append((row[1], row[1]))
             doneOrgs.add(row[1])

     # default genome per clade = the one with the lowest priority
     cladeToBestOrg = dict()
     cur.execute('SELECT clade, genome, priority FROM genomeClade;')
     cladePriorityMax = dict()
     for row in cur.fetchall():
             clade, genome, priority = row
             if priority < cladePriorityMax.get(clade, 99999):
                 cladeToBestOrg[clade]=genome
                 cladePriorityMax[clade] = priority


     # dict of orgName -> list of (db, description)
     # SELECT genome, name FROM dbDb order by orderKey;
     # dict of db -> description
     orgToDbs = defaultdict(list)
     cur.execute('SELECT genome, name, description FROM dbDb WHERE active=1 order by orderKey;')
     for row in cur.fetchall():
         rowDb = row[1]
         if rowDb not in validDbs:
             continue
         orgToDbs[row[0]].append((row[1], row[2]))

     # default db per orgName, {"Human":"hg19"}
     orgToBestDb = {}
     cur.execute('SELECT genome, name FROM defaultDb')
     for row in cur.fetchall():
         orgToBestDb[row[0]] = row[1]
     # add every genome that has only one db
     for org, dbs in orgToDbs.iteritems():
         if len(dbs)==1 and org not in orgToBestDb:
             orgToBestDb[org] = dbs[0][0]

     cur.close()
     conn.close()

     orgInfo = {}
     orgInfo["clades"] = cladeList
     #orgInfo["orgs"] = orgList
     orgInfo["cladeToOrgs"] = cladeToOrgs
     orgInfo["orgToDbs"] = orgToDbs
     orgInfo["orgToBestDb"] = orgToBestDb
     orgInfo["cladeToBestOrg"] = cladeToBestOrg
     orgInfo["defaultClade"] = defaultClade
     
     cPickle.dump(orgInfo, open(fname, "w"))
     return orgInfo
 
 def getCladeAssemblyDb(clade, org, db):
     """ return the list of clades, organisms and assemblies and default DBs.
     any of clade, org or db can be None.
     Return values for clade, org, db so all are valid strings
     """
     if not mysqlDbLoaded:
         # no mysql client module: fall back to a minimal hardcoded org tree
         print("<small>info: MySQLDb not installed, cannot retrieve hgcentral.dbDb, using internal defaults</small>")
         orgInfo = {}
         orgInfo["clades"] = [("mammal", "Mammal")]
         orgInfo["cladeToOrgs"] = {"mammal":[("Human", "Human")]}
         orgInfo["cladeToBestOrg"] = {"mammal":"Human"}
         orgInfo["orgToDbs"] = {"Human":["hg19"]}
         orgInfo["dbs"] = [("hg19", "Human (GrCh37)"), ("mm9", "Mouse (NCBI37)")]

         return orgInfo, "mammal", "Human", "hg19"
     
     validDbs = getHgDownloadDbs()
     orgInfo = getOrgInfo(validDbs)

     # only the db was specified: derive org and clade from it
     if db is not None and clade is None and org is None:
         dbToClade = dict()
         # search for the right org for this db
         for o, dbList in orgInfo["orgToDbs"].items():
             for orgDb, desc in dbList:
                 if orgDb==db:
                     org = o
                     break
         # search for the right clade for this org
         for c, orgList in orgInfo["cladeToOrgs"].items():
             for o, _ in orgList:
                 if o==org:
                     clade = c
                     break

     # "0" means "reset by the dropdown javascript", treat it like None
     if clade is None or clade=="0":
         clade = orgInfo["defaultClade"]
     if org is None or org=="0" and clade not in [None, "0"]:
         org = orgInfo["cladeToBestOrg"][clade]

     if (db==None or db=="0"):
         if org not in orgInfo["orgToBestDb"]:
             print "organism is not valid:", org
             sys.exit(0)
         db = orgInfo["orgToBestDb"][org]

     if not db in validDbs:
         # fall back to hg19 if the requested db is not on hgdownload
         print "error: db %s does not exist on hgdownload" % db
         org = "Human"
         clade = "mammal"
         db = "hg19"

     return orgInfo, clade, org, db
 
 def htmlDbSelector(orgInfo, clade, org, db):
     " print dropdown boxes with clade, assembly, DBs and refresh when selected "
     print '<FORM name="mainForm">'
 
     print("Group: ")
     htmlDropBox("clade", orgInfo["clades"], clade)
     print("Genome: ")
     htmlDropBox("org", orgInfo["cladeToOrgs"][clade], org)
     print("Assembly: ")
     htmlDropBox("db", orgInfo["orgToDbs"][org], db)
 
     print '<INPUT type="submit" name="submit" value="Select"></input>'
     print '</FORM>'
 
     printHiddenForm(clade, org, db)
 
def htmlTrackTable(db, trackLabels, trackTables, \
        trackParents, trackChildren, groupList, groupToTopTracks, \
        tableSizes, localSizes, gbdbSizes, trackToGbdbFiles, showSubtracks, noTableTracks):
    " print list of track sizes/tablecount as a html form, sorted by group "

    myUrl = basename(__file__)

    # intro text plus overall remote/local size statistics
    print 'Locally mirrored tracks are faster to browse than tracks that are accessed through the internet.<br>'
    print 'Select any number of tracks from the list below and click "Download" when finished.<br>'
    print 'The data will be downloaded from the UCSC servers with rsync and copied to the local mysql database and %s.<p>' % getGbdbDir()

    htmlStats(localSizes, gbdbSizes, tableSizes)

    print '<form action="%s" method="post">'  % myUrl
    print '<input type="submit" name="submit" value="Download"></input>'

    if showSubtracks:
        # subtrack view: drop the predefined "special" group
        # (NB: this mutates the caller's groupToTopTracks dict in place)
        print '<a href="%s?db=%s">hide subtracks and show predefined groups</a><br>' % (myUrl, db)
        del groupToTopTracks["special"]
    else:
        print '<a href="%s?db=%s&showSubtracks=1">show subtracks</a><br>' % (myUrl, db)

    # table of contents: one anchor link per non-empty group
    print "<h4>Track groups</h4>"
    print "<ul>"
    for groupName, groupLabel in groupList:
        if len(groupToTopTracks[groupName])==0:
            continue
        print '<li><a href="#{}">{}</a></li>'.format(groupName, groupLabel)
    print "</ul>"

    # one section per group, one row per top-level track
    for groupName, groupLabel in groupList:
        # skip empty groups like custom
        if len(groupToTopTracks[groupName])==0:
            continue
        print '<a name="%s"><h4>Group: %s</h4>' % ( groupName, groupLabel)

        for trackName in groupToTopTracks[groupName]:
            printTrackInfo(db, trackName, trackTables, trackLabels, localSizes, \
                trackChildren, tableSizes, trackToGbdbFiles, gbdbSizes, showSubtracks, 0, noTableTracks)

    # second submit button at the bottom, plus the db as a hidden field
    print '<p>'
    print '<input type="hidden" name="db" value="%s"></input>' % db
    print '<input type="submit" name="submit" value="Download"></input>'
    print '</form>'
 
 def downloadCache(url, cacheFname):
     " download file from url or open local cached copy. Return list of lines "
     cachePath = TMPDIR+cacheFname
     if isfile(cachePath):
         return open(cachePath).read().splitlines()
 
     try:
         data = urllib2.urlopen(url).read()
     except urllib2.HTTPError:
         print "<small>info: Could not find %s. bigWig/bigBed/bam files will be skipped.<br></small>" % url
         data = None
 
     if data is not None and url.endswith(".gz"):
         data = StringIO.StringIO(data) # gunzipping requires to wrap a pseudo-file around the gzip data
         data = gzip.GzipFile(fileobj=data).read()
 
     # only create cache file if we got some data
     if data == None:
         data = ""
     else:
         cacheFh = open(cachePath, "wb")
         cacheFh.write(data)
         cacheFh.close()
 
     return data.splitlines()
 
def linkTrackToGbdb(fnames, db, tableNames, bigDataFiles):
    """
    do some educated guessing on the gbdb files<->track links. returns a dict track -> list of files
    Needs a file bigFiles.tab.gz that assigns the bbi link table names to big files in them.

    fnames is a list of gbdb filenames that we try to assign somehow.
    bigDataFiles maps trackName -> bigDataUrl path from trackDb.
    """

    debug("linking trackDb")
    # download a list of trackname -> bigFile name from hgwdev
    # cannot do this on the fly
    lines = downloadCache(BIGFILETABLEURL % db, db+"_bigFiles.tab")
    tableFiles = defaultdict(list)     # result: table -> list of gbdb filenames
    assignedFnames = []                # every filename we managed to assign
    fileTables = dict()                # reverse map: filename -> table
    for line in lines:
        table, fname = line.rstrip("\n").split("\t")
        if not fname.startswith("/gbdb"):
            # cannot get files on external http servers for internal tracks 
            continue
        else:
            # store paths relative to gbdb/<db>/
            fname = fname.replace("/gbdb/%s/" % db, "")
        tableFiles[table].append(fname)
        fileTables[fname] = table
        assignedFnames.append(fname)

    manualRules = [
        # format: regex in filename -> name of track
        # hand-curated patterns for files not covered by bigFiles.tab
        (db+".2bit", "seq"),
        ("description.html", "seq"),
        ("gc5Base", "gc5Base"),
        ("multiz([0-9]+)way" , r'multiz\1way'),
        ("evoFold" , "evofold"),
        ("RNA-img" , "tRNAs"),
        ("Patch([0-9]+)" , r'altSeqComposite\1'),
        ("snp([0-9]+)Common" , r'snp\1Common'),
        ("snp([0-9]+)" , r'snp\1'),
        ("liftOver" , "liftOver"),
        ("cloneend" , "bacCloneEnds"),
        ("sts.11" , "stsMap"),
        ("kgTarget" , "knownGene"),
        ("fosEnds" , "fosEndPairs"),
        ("laminB1" , "laminB1"),
        ("hgmd" , "hgmdVar"),
        ("lrg.bb" , "lrg"),
        ("integrated_phase1" , "tgpPhase1"),
        ("HGDP" , "hgdpGeo")
        ]

    # pre-compile the patterns once, they are applied to every filename
    regexRules = []
    for regex, repl in manualRules:
        regexRules.append((re.compile(regex), repl))

    for fname in fnames:
        # assign trix search index files (.ix/.ixx) to the table with the same base name
        if fname.endswith(".ixx") or fname.endswith(".ix"):
            table = splitext(fname)[0]
            tableFiles[table].append(fname)
            assignedFnames.append(fname)
            continue

        # assign bai files to their bam file table
        if fname.endswith(".bai"):
            bamName = splitext(fname)[0]
            if bamName not in fileTables:
                debug("%s: bam file has index but is not used by any track" % bamName)
                continue

            table = fileTables[bamName] # if this fails, then a .bai file has no bam file
            tableFiles[table].append(fname)
            assignedFnames.append(fname)
            continue

        # check fname for regex and assign to some manually defined track
        for regex, repl in regexRules:
            match = regex.search(fname)
            if match != None:
               # transform matching string using regex
               matchStr = match.group()
               table = regex.sub(repl, matchStr)
               tableFiles[table].append(fname)
               assignedFnames.append(fname)
               #if "liftOver" in fname:
                   #print fname, "<br>"
                   #print "table", table, "<br>"

    # report files we could not link to any track (debug mode only)
    orphanFnames = set(fnames) - set(assignedFnames)
    for fname in sorted(orphanFnames):
        debug("unassigned gbdb file: "+fname)
    #print assignedFnames

    # report assignments to tables that do not actually exist
    misassignedTables = set(tableFiles) - set(tableNames)
    for table in sorted(misassignedTables):
        debug("not existing table: "+table)
        debug("for files: "+",".join(tableFiles[table]))

    # add the bigDataUrl files from trackDb
    for trackName, bigDataFile in bigDataFiles.iteritems():
        fname = bigDataFile.replace("/gbdb/%s/" % db, "")
        tableFiles[trackName].append(fname)

        # also add indexes for vcf or bam files
        tbiFname = fname+".tbi"
        baiFname = fname+".bai"
        if tbiFname in fnames:
            tableFiles[trackName].append(tbiFname)
        if baiFname in fnames:
            tableFiles[trackName].append(baiFname)

    return tableFiles
 
 def getRsyncSizes(dataType, db):
     """
     debug("rsync sizes")
     if dataType is "mysql: return dict with tableName:size for given db (includes indexes, frm + data)
     if dataType is "gbdb": return dict with filaname:size for given db
     """
     # run rsync command
     tmpFname = TMPDIR+"%s_%s.rsync.txt" % (dataType, db)
     if not isfile(tmpFname):
         cmd = "rsync -nav rsync://hgdownload.soe.ucsc.edu/%s/%s/ > %s.new && mv %s.new %s" % (dataType, db, tmpFname, tmpFname, tmpFname)
         ret = runCmd(cmd, mustRun=False)
         if ret!=0:
             print "<p>"
             print "Cannot run %s<br>" % cmd
             print "<hr>"
             print "It seems that the rsync server hgdownload.soe.ucsc.edu is not available<br>"
             print "Please check your network connection. If this problem persists send email to genome@soe.ucsc.edu<br>"
             sys.exit(0)
 
     # parse rsync output file
     tableSizes = collections.defaultdict(int)
 
     for line in open(tmpFname):
         ## rsync output looks like this:
         # receiving incremental file list
         # drwxr-xr-x     1875968 2013/11/23 23:37:59 .
         # -rw-rw-r--     2031084 2011/01/04 14:55:26 HInv.MYD
         fields = line.rstrip("\n").split()
         if len(fields)!=5:
             continue
         if fields[0][0]=="d":
             continue
         fileName = fields[-1]
         # newer rsync versions include commas in the size string
         tableSize = int(fields[1].replace(",",""))
 
         if dataType=="gbdb":
             resName = "%s" % (fileName)
         else:
             resName = fileName.split(".")[0]
         tableSizes[resName] += tableSize
     return dict(tableSizes)
 
 #def getTableRelations(db):
     # a start to parse all.joiner
     # set hg hg16,hg17,hg18,hg19
     # identifier jkgTranscriptId
     # "Known genes 3 trancript identifier"
     #    $hg,$mm.jkgTxCdsRepick.name
     #    $hg,$mm.jkgTxInfo.name
     #    $hg,$mm.jkgTxCdsEvidence.name
 
     # 
     #assert(False) # not finished
     #sets = {}
     #ifh = open("all.joiner")
     #for line in ifh:
         #if line.startswith("set"):
             #fields = line.rstrip("\n").split()
             #var = fields[1]
             #targets = set(fields[-1].split(","))
             #resolvedTargets = []
             #for t in targets:
                 #if t.startswith("$"):
                     #resolvedTargets.extend(sets[t[1:]])
                 #else:
                     #resolvedTargets.append(t)
             #sets[var] = resolvedTargets
 
 def udrIsUsable():
     " return true if we can use udr "
     # http://stackoverflow.com/a/12611523/233871
     # this is doing he same as the unix which command
     if spawn.find_executable("rsync") is None:
         print "ERROR: could not find the rsync executable in the PATH"
         sys.exit(0)
 
     # we currently don't use udr: it often fails as the port is not open
     # it's very hard to detect if it fails or not
     # return False
 
     # check if we can find udr binary
     udrPath = spawn.find_executable("udr")
     if udrPath is None:
         return False
 
     # created by /etc/rc.local
     if isfile(TMPDIR+"useUdr"):
         return True
 
     return False
 
 def buildRsyncCmd(remotePath, localPath, listFname, logFname):
     """ returns an rsync or udr command  to get files from remotePath to localPath"""
     if udrIsUsable():
         cmd =\
             'udr rsync -av --no-progress --files-from=%(listFname)s ' \
             'hgdownload.soe.ucsc.edu::%(remotePath)s %(localPath)s >> %(logFname)s 2>&1\n' % locals()
     else:
         cmd =\
             'rsync -av --no-progress --files-from=%(listFname)s ' \
             'rsync://hgdownload.soe.ucsc.edu/%(remotePath)s %(localPath)s >> %(logFname)s 2>&1\n' % locals()
     return cmd
 
 
def runRsyncJobs(jobId, db, listFname, gbdbListFname, fixedFname):
    """ run rsync, redirect stdout and return the ID of the stdout log file.

    Writes a shell script TMPDIR/<jobId>.sh that downloads the selected
    mysql tables (db + hgFixed) and gbdb files, then submits it to the
    'at' queue so it keeps running after the CGI returns. Progress goes
    to TMPDIR/<jobId>.log; the jobId is recorded in TMPDIR/lastJob.log.
    """
    localDbDir = join(MYSQLDIR, db)
    localFixedDir = join(MYSQLDIR, "hgFixed")
    jobFname = TMPDIR+"%d.sh" % jobId
    logFname = TMPDIR+"%d.log" % jobId

    # write job script
    cmdFh = open(jobFname, "w")

    # record the script's process group id so stopAllJobs can kill it
    cmdFh.write("ps -o pgid= -p $$ > %slastJob.pid\n" % TMPDIR)

    # make sure that mysql is restarted if the job is cancelled
    # by the system for some reason
    cmd = 'trap "trap - EXIT; sudo service mysql start; exit 1;" TERM EXIT\n'
    cmdFh.write(cmd)

    # mysql must be stopped while its data files are replaced
    cmd = "sudo service mysql stop\n"
    cmdFh.write(cmd)

    # download the selected tables of the assembly database, as the mysql user
    cmd = "sudo -u mysql " + buildRsyncCmd("mysql/"+db, localDbDir, listFname, logFname)
    cmdFh.write(cmd)

    # download the required hgFixed tables
    cmd = "sudo -u mysql " + buildRsyncCmd("mysql/hgFixed", localFixedDir, fixedFname, logFname)
    cmdFh.write(cmd)

    # for some reason this table was often in a crashed state.
    cmd = "sudo myisamchk /data/mysql/%s/hgFindSpec.MYI --fast  --recover\n" % db
    cmdFh.write(cmd)

    cmd = "sudo service mysql start\n"
    cmdFh.write(cmd)

    # the tables on hgdownload are sometimes not in a closed state. Fix them after the download.
    cmd = "sudo mysqlcheck --all-databases --auto-repair --quick --fast --silent\n"
    cmdFh.write(cmd)

    # download the selected gbdb files (no sudo needed, apache owns gbdb)
    cmd = buildRsyncCmd("gbdb/"+db, getGbdbDir()+"/"+db, gbdbListFname, logFname)
    cmdFh.write(cmd)

    # post-processing: hide slow tracks, fix track search, extend tableList
    cmd = "/usr/local/apache/cgi-bin/hgMirror postRsyncChanges\n"
    cmdFh.write(cmd)

    cmdFh.close()
    # "at" suggested by http://stackoverflow.com/questions/6024472/start-background-process-daemon-from-cgi-script/6091159#6091159
    # must run job as a script in a subshell, not with -f option as
    # otherwise the trap command in the script won't work
    cmd = "echo bash %s | at now -M" % jobFname

    # write commands to log file
    logFh = open(TMPDIR+"%d.log" % jobId, "w")
    logFh.write(open(jobFname).read())
    logFh.write("\nrsync output:\n")
    logFh.close()

    # write jobId to status file
    idFh = open(TMPDIR+"lastJob.log", "w")
    idFh.write(str(jobId))
    idFh.close()

    # run commands
    print("Starting udr/rsync command script...")
    runCmd(cmd)
 
 def refreshPage(paramStr, delay=5, addNote=False):
     " refresh current CGI page using javascript "
     newUrl = basename(__file__)+"?"+paramStr
     print """
     <script type="text/javascript" nonce='%s'>
     function Redirect()
         { window.location="%s"; }
     setTimeout(Redirect(), %d);
     </script>
     """ % (getNonce(), newUrl, delay)
 
     if addNote:
         print("""Redirecting to <a href="%s">%s</a>"""  % (newUrl, newUrl))
 
 def jobsAreDone():
     " True if no jobs currently in the 'at' queue "
     cmd = "at -l | wc -l > %satWc.txt" % TMPDIR
     runCmd(cmd)
     lineCount = int(open(TMPDIR+"atWc.txt").read().strip())
     os.remove(TMPDIR+"atWc.txt")
     return lineCount==0
 
def printLog(jobId):
    """ print rsync log for given jobId to stdout.

    Shows the last 30 log lines; while the 'at' queue is non-empty the page
    auto-refreshes, otherwise links back to the browser are printed.
    """
    jobFname = TMPDIR+"%d.log" % int(jobId)

    print "rsync download commands:<p>"
    print "<pre>"
    if not isfile(jobFname):
        # log file was cleaned up, the job is long gone
        print "This download job is not active anymore.<p>"
        print '<a href="hgMirror">Start a new download</a>'
        sys.exit(0)

    # only show the tail of the log, it can get long
    lines = open(jobFname).read().splitlines()
    for line in lines[-30:]:
        print line

    print "</pre>"
    if jobsAreDone():
        print '<a href="hgTracks?hgt.reset=on">Back to Genome Browser</a><p>'
        print '<a href="hgMirror">Download more tracks</a>'
    else:
        # still running: poll again (delay is in milliseconds)
        print '<i>Downloading files... Page will reload every 4 seconds until download is complete.</i>'
        print '<i><a href="hgMirror?stopAllJobs=1">Cancel download now</a></i>'
        refreshPage("jobId=%d" % jobId, delay=4000, addNote=False)
 
 def removeSomeTracksFromSearch(conn):
     cur = conn.cursor()
     for table in REMOVESEARCH:
         query = "DELETE FROM hgFindSpec WHERE searchTable='%s';" % table
         try:
             cur.execute(query)
         except MySQLdb.ProgrammingError:
             pass
     cur.close()
 
 def hideSomeTracks(db, conn, trackNames):
     """
     hide some notoriously slow tracks by default
     """
     if not mysqlDbLoaded:
         print "warning: cannot hide some tracks, module mysqldb not installed"
         return
 
     # do we need this?
     # if "trackDb" not in localTables:
         #return
 
     # find all conservation tracks     
     hideTracks = []
     for t in trackNames:
         if (t.startswith("multiz") or t.startswith("cons")) and t.endswith("way"):
             hideTracks.append(t)
         elif t in FORCEHIDE:
             hideTracks.append(t)
 
     hideList = ["'"+s+"'" for s in hideTracks]
     hideStr = ", ".join(hideList)
     hideStr = "(%s)" % hideStr
 
     cur = conn.cursor()
     try:
         cur.execute("SELECT 1 FROM trackDb LIMIT 1;")
     except:
         return 
 
     query = "UPDATE trackDb SET visibility=0 WHERE tableName in %s" % hideStr
     try:
         cur.execute(query)
     except:
         print "could not execute query %s in database %s" % (query, db) 
     cur.close()
 
 def getGbdbDir():
     " return local gbdb dir without trailing slash "
     gbdbLoc = parseHgConf().get("gbdbLoc1", "/gbdb")
     gbdbLoc = gbdbLoc.rstrip("/")
     return gbdbLoc
 
 def getLocalSizes(db):
     """
     return a dict with table name -> total size of a mysql directory and filename -> size for the gbdb dir
     (gbdb filenames are relative to the gbdb/db directory)
     """
 
     path = join(MYSQLDIR, db)
     sizes = defaultdict(int)
     if runningAtUcsc():
         return sizes
 
     sqlFh = tempfile.NamedTemporaryFile()
     sqlFname = sqlFh.name
     cmd = 'sudo -u mysql /bin/ls -l %s > %s' % (path, sqlFname)
     ret = runCmd(cmd, mustRun=False)
     if ret!=0:
         if runningAtUcsc():
             print "<small>info: cannot read file sizes from %s, local file sizes unknown</small><br>" % path
         return sizes
 
     for line in open(sqlFname):
         fields = line.strip().split()
         if len(fields)<4:
             continue
         size = int(fields[4])
         fname = fields[-1]
         fileNoExt = splitext(basename(fname))[0]
         sizes[fileNoExt] += size
     if len(sizes)==0:
         print "(warning: local directory %s seems to be empty)" % path
 
     gbdbDir = join(getGbdbDir(), db)
     if isdir(gbdbDir):
         # get the size of all gbdb files
         gbdbFh = tempfile.NamedTemporaryFile()
         gbdbFname = gbdbFh.name
         # trailing slash is important
         cmd = 'find %s/ -type f > %s ' % (gbdbDir, gbdbFname)
         runCmd(cmd)
         fnames = open(gbdbFname).read().splitlines()
         for fname in fnames:
             relName = fname.replace(gbdbDir+"/", "")
             sizes[relName] = getsize(fname)
     return sizes
 
 def makeGbdb():
     """ create the gbdb directory and assign it to the apache user """
     if not isdir(getGbdbDir()):
         cmd = "sudo mkdir %s" % getGbdbDir()
         runCmd(cmd)
         cmd = "sudo chown %s.%s %s" % (APACHEUSER, APACHEUSER, getGbdbDir())
         runCmd(cmd)
 
 def checkGbdbMysqlAccess():
     """ check if we have write access to gbdb and mysql dir and can run the at command """
 
     msg = """<pre>        www-data     ALL = (mysql:mysql) NOPASSWD: /usr/local/bin/udr,/bin/ls,/usr/bin/rsync,/bin/rm
         www-data     ALL = (root:root) NOPASSWD: /bin/mkdir /data/gbdb
         www-data     ALL = (root:root) NOPASSWD: /bin/chown www-data.www-data /data/gbdb</pre>
         """
 
     # check if we can write to gbdb
     tmpFname = "%s/test.tmp" % getGbdbDir()
     try:
         open(tmpFname, "w")
     except IOError:
         print "This program cannot write to the %s directory. Please make sure that the apache user has permission to write to %s<br>" % (getGbdbDir(), getGbdbDir())
         print 'Use "sudo visudo" to add these lines to /etc/sudoers:<br>'
         print msg
         print 'Or check the directory permissions if the directory already exists.<br>'
         sys.exit(0)
 
     # check if we can rsync to mysql
     tmpFname2= "%s/test.tmp" % MYSQLDIR
     cmd = "sudo -u mysql rsync %s %s" % (tmpFname, tmpFname2)
     ret = runCmd(cmd, mustRun=False)
     if ret!=0:
         print "Could not run %s<br>" % cmd
         print """Cannot run rsync as the mysql user. Please make sure that you have these lines in /etc/sudoers:<br>"""
         print msg
         sys.exit(0)
 
     # cleanup the two tmp files
     os.remove(tmpFname)
 
     cmd = "sudo -u mysql rm %s" % tmpFname2
     ret = runCmd(cmd, mustRun=False)
     if ret!=0:
         os.remove(tmpFname2)
 
     # check if we can run "at"
     cmd = "echo echo hi | at -M now"
     ret = runCmd(cmd, mustRun=False)
     if ret != 0:
         print "Could not run %s<br>" % cmd
         print "It looks like we cannot run the 'at' command<br>"
         print "You might have to remove %s from /etc/at.deny" % APACHEUSER
         sys.exit(0)
 
 def trackToTableNames(tableNames):
     " handle split tables and some spec cases by creating a dictionary trackName -> tableNames "
     # as a heuristic to detect split tables, first get the counts of all suffixes 
     # after the underscore
     suffixCounts = defaultdict(int)
     for t in tableNames:
         if "_" in t and not t.startswith("uniGene"):
             prefix, track = string.split(t, "_", maxsplit=1)
             suffixCounts[track] += 1
 
     suffixDict = defaultdict(set)
     for t in tableNames:
         if "_" in t and not t.startswith("uniGene"):
             prefix, track = string.split(t, "_", maxsplit=1)
             # assume that any table a_b is not a split table
             # if there are less than 5 tables with the suffix b
             # the code in hgTracks is different, but we do not have a 
             # list of chromosomes in here
             if suffixCounts[track] < 5 and prefix!="all":
                 # the tracks all_mrna and all_est should be treated like
                 # split tables, as their track names are indeed mrna
                 # and est. sigh.
                 track = t
         else:
             track = t
 
         # assign chain..Link tables to chain table
         if track.startswith("chain") and track.endswith("Link"):
             track = track.replace("Link", "")
         if track.startswith("multiz") and track.endswith("Summary"):
             track = track.replace("Summary", "")
         if track.startswith("multiz") and track.endswith("Frames"):
             track = track.replace("Frames", "")
 
         suffixDict[track].add(t)
 
     
     return suffixDict
 
 def findImportantTables(trackSizes):
     """
     some tables are important for some assemblies, we always add them
     """
     tables = set()
     for t in FORCETABLES:
         if t in trackSizes:
             tables.add(t)
     return tables
 
 def htmlStats(localSizes, gbdbSizes, tableSizes):
     locTotal   = humanReadable(sum(localSizes.values()))
     gbdbTotal  = humanReadable(sum(gbdbSizes.values()))
     tableTotal = humanReadable(sum(tableSizes.values()))
     print("<b>Total size at UCSC</b>: mysql tables %(tableTotal)s, gbdb files %(gbdbTotal)s<br>" % locals())
     print("Total size of tables and gbdb on local disk: %(locTotal)s<p>" % locals())
 
def addPredefined(trackVis, superTracks, groupList, groupToTopTracks, trackLabels, \
        trackChildren, tableToGbdbFiles, trackTables, tableSizes, revokedTracks):
    """ add special predefined track groups in place, modifies the last five parameters 
        These groups are special "tracks" that don't exist, their children are the real tracks
    """
    # get all track that are not hidden
    defaultTracks = []
    for track, vis in trackVis.iteritems():
        if vis!="hide":
            defaultTracks.append(track)

    # go down the hierarchy and let "hide" on higher levels override the lower ones
    # remove all tracks that are somehow hidden by higher tracks from the first list
    defaultTracks = set(defaultTracks)
    for topTracks in groupToTopTracks.values():
        for topTrack in topTracks:
            if trackVis.get(topTrack, None)=="hide":
                for child in getAllChildren(topTrack, trackChildren):
                    if child in defaultTracks:
                        defaultTracks.remove(child)

    # also remove all superTracks from this list, otherwise will pull in all of encode again
    # somehow visibility/inheritance works this way
    defaultTracks = defaultTracks-set(superTracks)
    # need lists, not sets
    defaultTracks = list(defaultTracks)

    debug(",".join(defaultTracks))

    # create the two other sets of tracks, based on defaultTracks:
    # 1) default tracks minus the (very large) conservation tracks
    defaultNoCons = []
    for t in defaultTracks:
        if "multiz" not in t and not t.startswith("cons"):
            defaultNoCons.append(t)

    # 2) default tracks plus everything that is not an ENCODE track
    nonEncode = []
    for track in trackVis:
        if track in defaultTracks or not track.startswith("wgEncode"):
            nonEncode.append(track)

    #nonEncAllTables = [t for t in tableSizes.keys() if not t.startswith("wgEncode")]

    # create a set of table names that are not linked to any trackDb entry
    # e.g. gbExtFile or extFile tables
    allTrackTables = set()
    for tableList in trackTables.values(): # flatten list
        allTrackTables.update(tableList)
    #print "<hr>", revokedTracks
    nonTrackTables = list(set(tableSizes) - set(allTrackTables) - revokedTracks)
    # hack to remove the ATTIC removed encode files, they're not in metaDb
    nonTrackTables = [t for t in nonTrackTables if not t.startswith("wgEncode") and not "Gencode" in t]
    #print "<hr>", nonTrackTables # XX

    # register the pseudo-group "special" at the top of the group list
    groupList.insert(0, ("special", "Predefined tracks sets"))
    groupToTopTracks["special"] = ["defaultNoCons", "defaultConsTables", \
        "default", "nonEncode", "liftOver", "nonTrackTables", "allTables", "allGbdb"]

    trackLabels["defaultNoCons"] = "Default tracks without conservation"
    trackLabels["defaultConsTables"] = "Default tracks with conservation tables, but no alignments"
    trackLabels["default"] = "Default tracks"
    trackLabels["nonEncode"] = "Default tracks plus all non-Encode tracks"
    trackLabels["liftOver"] = "Liftover files"
    #trackLabels["nonEncAllTables"] = "Non-Encode tracks"
    trackLabels["nonTrackTables"] = "Secondary database tables not assigned to any track"
    trackLabels["allTables"] = "All database tables"
    trackLabels["allGbdb"] = "All non-database binary/text files"

    # NOTE(review): "liftOver" and "allGbdb" get no trackChildren entry here;
    # presumably handled elsewhere or trackChildren tolerates missing keys — verify
    trackChildren["defaultNoCons"] = defaultNoCons
    trackChildren["defaultConsTables"] = defaultTracks
    trackChildren["default"] = defaultTracks
    trackChildren["nonEncode"] = nonEncode
    #trackChildren["liftOver"] = trackToGbdbFiles["liftOver"]
    #trackChildren["nonEncAllTables"] = nonEncAllTables
    trackChildren["nonTrackTables"] = nonTrackTables
    trackChildren["allTables"] = tableSizes.keys()
 
 def stopAllJobs():
     """ stop all waiting at jobs and the currently running download job """
     cmd = "atrm `at -l | cut -f1`"
     runCmd(cmd)
 
     if isfile(TMPDIR+"lastJob.pid"):
         lastPid = open(TMPDIR+"lastJob.pid").read().strip()
         cmd = "sudo kill -- -%s" % lastPid
         runCmd(cmd)
 
         # make sure mysql is started again
         cmd = 'sudo service mysql start'
         runCmd(cmd)
     
 def assertEmptyQueue():
     """ if the at queue is not empty, show link to log file of running job and stop program """
     if jobsAreDone():
         return
 
     jobId = open(TMPDIR+"lastJob.log").read()
     print "There is still a download job running<br>"
     print '<a href="hgMirror?jobId=%s">Show download job</a>' % jobId
     sys.exit(0)
 
 # the list of allowed chars in cgi args: digits, letters and dashes
 legalChars = set(string.digits)
 legalChars.update(set(string.letters))
 legalChars.update("_-.()/: ")
 
 def mustBeClean(str):
     """ make sure a string contains only letters and digits """
     if str==None:
         return str
 
     str = urllib.unquote(str)
     str = str.strip()
 
     for s in str:
         if s not in legalChars:
             print "illegal character in CGI parameter"
             sys.exit(0)
     return str
 
 #def getDb(args, org):
     #""" get Db from either CGI args or hgcentral, makes sure that DB actually exists """
     #db = mustBeClean(args.getvalue("db", default=None))
     #if db=="0":
         #db = None
 #
     #if db!=None:
         #if mysqlDbLoaded:
             #import _mysql_exceptions
             #try:
                 #conn  = sqlConnect(db, "public")
             #except _mysql_exceptions.OperationalError:
                 #print("<small>error: DB %s does not exist on public sql server</small>" % db)
                 #db = None
     ##else:
         #if mysqlDbLoaded:
             #conn = sqlConnect("hgcentral", "public")
             #cur = conn.cursor()
             ## get default db for org
             #cur.execute('SELECT name from defaultDb where genome=%s', (org,))
             #rowList = cur.fetchall()
             #print rowList
             #db = rowList[0][0]
             #if db=="":
                 #print "Invalid org parameter %s" % org
                 #sys.exit(0)
     #else:
         #db = "hg19"
     #return db
 
def getCgiVar(args, name, default=None):
    " return the sanitized CGI parameter 'name'; exits on illegal characters "
    return mustBeClean(args.getvalue(name, default=default))
 
def addTableList(db, conn):
    " add a local file with rows to add to tableList for hg19 "
    # only hg19 has such an extension file; do nothing otherwise.
    # TABLELISTADD is a server-side constant path, not user input
    if db=="hg19" and isfile(TABLELISTADD):
        query = 'LOAD DATA INFILE "%s" INTO TABLE tableList' % (TABLELISTADD)
        cur = conn.cursor()
        cur.execute(query)
        cur.close()
 
 def getAllTrackNames(conn):
     " return list of all tracks in trackDb "
     cur = conn.cursor()
     names = []
     query = "SELECT tableName from trackDb";
     try:
         cur.execute(query)
     except MySQLdb.ProgrammingError:
         return None
     for row in cur.fetchall():
         names.append(row[0])
     return names
 
 def postRsyncChanges(db, localSizes):
     " remove some tracks from trackDb, others from hgFindSpec and add a few row to tableList"
     if not mysqlDbLoaded:
         return
     try:
         conn = sqlConnect(db, "local")
         tracks = getAllTrackNames(conn)
         if tracks is None:
            print("mysql database %s has no trackDb" % db)
         else:
            hideSomeTracks(db, conn, tracks)
            removeSomeTracksFromSearch(conn)
         addTableList(db, conn)
         conn.close()
     except MySQLdb.OperationalError, e:
         print "Could not run postRsyncChanges on db %s" % db
         return
 
 def reloadPage():
     " trigger page reload with javascript "
     print """
     <script type="text/javascript" nonce='%s'>
     location.reload()
     </script>
     </html>""" % (getNonce())
     sys.stdout.flush()
     sys.exit(0)
 
def htmlMiddle(args):
    """ print html middle part: dispatch on the CGI parameters.
    Without parameters, show the track selection table; with submit=Download
    start rsync jobs; with jobId show the rsync log; with showFiles list the
    files of one track; with stopAllJobs kill all running downloads.
    """

    clade = getCgiVar(args, "clade")
    org   = getCgiVar(args, "org")
    db    = getCgiVar(args, "db", defaultDb)
    orgInfo, clade, org, db = getCladeAssemblyDb(clade, org, db)

    # the jobId-branch below does not need trackDb data, so skip the
    # (potentially slow) load in that case
    if not "jobId" in args.keys():
        # load all the trackDb info, track<->file relationships and file sizes
        cacheFname = TMPDIR+"%s.trackDataCache" % db
        # NOTE(review): marshalError is assigned but never read
        marshalError = False
        if isfile(cacheFname):
            # use cached results, if we have them
            try:
                groupList, trackLabels, trackTables, trackParents, groupToTopTracks, noTableTracks,\
                    trackVis, superTracks, tableSizes, trackChildren, gbdbSizes, \
                    tableToGbdbFiles = cPickle.load(open(cacheFname))
            except ValueError:
                # corrupt/stale cache: throw it away and rebuild on reload
                os.remove(cacheFname)
                reloadPage()

        else:
            # parse trackDb: takes up to 10 seconds via transatlantic link so save the result
            debug("parsing")
            print("Creating list of downloadable tracks for database %s ... please wait ..." % db)
            sys.stdout.flush()
            groupList = loadGroups(db)
            tableSizes = getRsyncSizes("mysql", db)
            gbdbSizes = getRsyncSizes("gbdb", db)

            trackLabels, trackTables, trackParents, groupToTopTracks, \
                noTableTracks, trackVis, superTracks, bigDataFiles = parseTrackDb(db, tableSizes)
            trackChildren = makeTrackHierarchy(trackParents)

            tableToGbdbFiles = linkTrackToGbdb(gbdbSizes, db, tableSizes, bigDataFiles)
            allData = groupList, trackLabels, trackTables, trackParents, \
                groupToTopTracks, noTableTracks, trackVis, superTracks, \
                tableSizes, trackChildren, gbdbSizes, tableToGbdbFiles
            cPickle.dump(allData, open(cacheFname, "w"))
            reloadPage()
            # NOTE(review): unreachable -- reloadPage() already calls sys.exit(0)
            sys.exit(0)

    # when user has clicked OK, run rsync and refresh with jobId
    if "submit" in args.keys() and args["submit"].value=="Download":
        if runningAtUcsc():
            print "<br>Cannot do this on hgwdev. Copying things from hgdownload to hgwdev is not a good idea."
            sys.exit(0)

        makeGbdb()
        checkGbdbMysqlAccess()

        # every remaining CGI parameter name is a selected track; sanitize them
        # and strip the two non-track parameters
        trackList = set(args.keys())
        for t in trackList:
            mustBeClean(t)
        trackList.remove("submit")
        trackList.remove("db")

        # tables that must always be downloaded, e.g. gtexGene, regardless of selection
        forceTables = findImportantTables(tableSizes)

        # jobId names the log/list files of this download run
        jobId = int(random.random()*1000000)

        revokedTracks = getRevokedTracks(db)
        addPredefined(trackVis, superTracks, groupList, groupToTopTracks, trackLabels, trackChildren, \
            tableToGbdbFiles, trackTables, tableSizes, revokedTracks)

        listFname, gbdbFname, fixedFname = makeTableFileList(jobId, db, trackList, trackTables, trackChildren, \
                                        tableToGbdbFiles, tableSizes, forceTables, noTableTracks, gbdbSizes)
        runRsyncJobs(jobId, db, listFname, gbdbFname, fixedFname)
        refreshPage("jobId=%d" % jobId, addNote=True)

    # show list of files if told to do so
    elif "showFiles" in args.keys() and "db" in args.keys():
        revokedTracks = getRevokedTracks(db)
        addPredefined(trackVis, superTracks, groupList, groupToTopTracks, trackLabels, trackChildren, \
            tableToGbdbFiles, trackTables, tableSizes, revokedTracks)
        trackName = args["showFiles"].value
        tables, gbdbFiles = trackToFiles(trackName, trackTables, trackChildren, \
                tableSizes, tableToGbdbFiles, noTableTracks, gbdbSizes)

        totalSize = 0

        print "<h4>MySQL tables linked to %s</h4>" % trackName
        for table in tables:
            # skip tables that are not on the rsync server, they have no size
            if table not in tableSizes:
                continue
            size = tableSizes[table]
            sizeStr = humanReadable(size)
            totalSize += size
            print table+" (%s) <br>" % sizeStr
        
        print "<h4>GBDB files linked to %s</h4>" % trackName
        for gbdbFname in gbdbFiles:
            size = gbdbSizes[gbdbFname]
            totalSize += size
            sizeStr = humanReadable(size)
            print gbdbFname+" (%s) <br>" % sizeStr

        totalSizeStr = humanReadable(totalSize)
        print "<h4>Total size: %s</h4>" % totalSizeStr
        sys.exit(0)

    # if we have a jobId in URL, show the rsync log
    elif "jobId" in args.keys():
        if runningAtUcsc():
            print "cannot do this on hgwdev"
            sys.exit(0)

        jobId = int(args["jobId"].value)
        printLog(jobId)

    # stop job if requested to do so
    elif "stopAllJobs" in args.keys():
        stopAllJobs()
        print "All download jobs have been stopped<br>"
        print '<a href="hgMirror">Return to track selection</a>'
        sys.exit(0)

    # show tracklist and change default tracks if no param
    else:
        assertEmptyQueue()

        print '<h4>UCSC genome browser track download tool</h4>'
        htmlDbSelector(orgInfo, clade, org, db)

        showSubtracks = bool(int(args.getfirst("showSubtracks", "0")))

        localSizes = getLocalSizes(db)
        revokedTracks = getRevokedTracks(db)

        addPredefined(trackVis, superTracks, groupList, groupToTopTracks, trackLabels, trackChildren, \
            tableToGbdbFiles, trackTables, tableSizes, revokedTracks)

        htmlTrackTable(db, trackLabels, trackTables, trackParents, trackChildren, \
            groupList, groupToTopTracks, \
            tableSizes, localSizes, gbdbSizes, tableToGbdbFiles, showSubtracks, noTableTracks)

        #postRsyncChanges(db, trackTables)
 
 def getAllLocalDbs():
     " return list of locally present DBs "
     conn = sqlConnect(parseHgConf().get("central.db", "hgcentral"), "local")
     cur = conn.cursor()
     cur.execute("SHOW DATABASES")
     rows =  cur.fetchall()
 
     dbSet = set()
     for row in rows:
         dbSet.add(row[0]) # rows are always a list, even if just one value
     
     conn.close()
     dbSet = dbSet - set(['information_schema', 'customTrash', 'hgFixed', 'hgTemp', 'hgcentral', 'mysql','performance_schema'])
     return dbSet
 
 def getCookies(cookieDb):
     """ return db cookie value. Called usually before headers are printed.
     Default value can be sent as argument.
     """
     cookie_string = os.environ.get('HTTP_COOKIE')
     if cookie_string:
         cookie = Cookie.SimpleCookie()
         cookie.load(cookie_string)
         if "db" in cookie:
             cookieDb = cookie["db"].value
     return cookieDb
 
 def setCookies(db):
     """
     Send a cookie header to set the "db" cookie to the value specified
     """
     cookie = Cookie.SimpleCookie()
     cookie['db'] = db
     cookie['db']['expires'] = 30 * 24 * 60 * 60
     cookie['db']['comment'] = 'holds the last hgMirror database'
     print cookie
 
 def setDefaultDb(genome, db):
     " change the default db "
     query = 'UPDATE hgcentral SET name="%s" where genome="%s"' % (genome, db)
     conn = sqlConnect(hgConf.get("central.db", "hgcentral"), "local")
     cur = conn.cursor()
     cur.execute(query)
     conn.close()
 
def main():
    """ CGI entry point; also usable from the command line for cronjobs. """
    # hide some tracks if any argument specified. makes this tool usable from command line, 
    # from cronjobs that only need to run this function
    if len(sys.argv)>1:
        dbSet = getAllLocalDbs()
        for db in dbSet:
            localSizes = getLocalSizes(db)
            postRsyncChanges(db, localSizes)
        #setDefaultDb("Human", "hg19")
        sys.exit(0)

    # the code above is not creating or using TMPDIR, as from the cronjob, we're
    # root but as a CGI we're the apache user and cannot write to root-owned directories.
    # Only create this directory when we're a CGI.
    if not isdir(TMPDIR):
        os.makedirs(TMPDIR)

    # the "db" cookie remembers the last assembly across visits
    global defaultDb
    defaultDb = getCookies(defaultDb)

    args = cgi.FieldStorage()

    # Set-Cookie must be sent before the blank line ending the headers
    if "db" in args:
        setCookies(args["db"].value)

    print "Content-type: text/html"
    print

    # refuse to run unless the mirror feature is switched on in hg.conf
    parseHgConf()
    if hgConf==None or not hgConf.get("allowHgMirror", "0").lower() in ["1", "yes", "true", "on"]:
        print("hgMirror is not activated on this machine<br>")
        print("Set allowHgMirror=1 in your cgi-bin/hg.conf file.<br>")
        print("In the Genome-Browser-in-a-box VM, use the command gbibMirrorTracksOn.")
        sys.exit(0)
        
    if "debug" in args.keys():
        global DEBUG
        DEBUG = True

    # "reset" wipes all cached trackDb data and temp files
    if "reset" in args.keys():
        for fname in glob.glob(TMPDIR+"*"):
            os.remove(fname)
        os.rmdir(TMPDIR)
        print "temporary files deleted"
        sys.exit(0)

    htmlHeader()
    htmlMiddle(args)
    jsInlineFinish()
    htmlFooter()
 
# script entry point: runs unconditionally, both as CGI and from the command line
main()