UCSC Genome Browser mirror tool

0f3ca3eaf5792df01b7c600a5428d2d0b2809fcd max Fri Sep 20 13:18:01 2024 -0700 Revert "more features to hubtools: search in both parent and subdirs, better docs" This reverts commit 05e67c59a20a5d00b810a981aef3b00c5bef82e1. diff --git src/hg/hgMirror/hgMirror src/hg/hgMirror/hgMirror index 98e25fb..396bd17 100755 --- src/hg/hgMirror/hgMirror +++ src/hg/hgMirror/hgMirror @@ -1,81 +1,86 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python2.7 # A little CGI interface to download the tables for a set of tracks via udr # to local machine. This is mostly useful when setting up a mirror or a VM # of the browser. It does not run on hgwdev. # This script does the following: # - get trackDb and grp table from hgDownload # - get table and gbdb sizes from ucsc rsync server # - get list with track<->filename for all bigfile tracks from hgwdev # - try to assign table names to gbdb files using this list and some hacky rules # - parse hg.conf to find mysql server and hide a few tracks in its trackDb # - infer track/subtrack hierarchy by parsing trackDb # - generate HTML table with labels/sizes/tablecounts for all tracks and their child tracks # - when user clicks submit, start rsync transfer and redirect to page that shows progress # - handles non-existing tables and some hgFixed tables # When run from the command line, this CGI hides some tracks and removes tracks from # track search on hg19. This is usually run from a cronjob after trackDb updates. # This script requires the following setup +# - mysqldb python module # - "rsync" in path # - "at" in path # To allow this on ubuntu, add these lines to /etc/sudoers: #www-data ALL = (mysql:mysql) NOPASSWD: /bin/ls,/usr/bin/rsync,/bin/rm #www-data ALL = (root:root) NOPASSWD: /bin/mkdir /gbdb #www-data ALL = (root:root) NOPASSWD: /bin/chown www-data.www-data /gbdb # - the apache user has to be able to run 'at' jobs. # To allow this on ubuntu, need to run this command to remove www-data from /etc/at.deny # sudo sed -i s/www-data//g /etc/at.deny # This script does not handle: # tables joined to other tables are not downloaded. Would have to parse all.joiner for that. # format python errors in html, as we're a CGI script import cgi import cgitb; cgitb.enable() # these are default python modules on python 2.7, no errors expected here -import urllib.request, urllib.parse, urllib.error, urllib.request, urllib.error, urllib.parse, zlib, collections, io, gzip, string, sys, os, random, \ - subprocess, re, types, socket, pickle, copy, glob, tempfile, http.cookies +import urllib, urllib2, zlib, collections, StringIO, gzip, string, sys, os, random, \ + subprocess, re, types, socket, cPickle, copy, glob, tempfile, Cookie from collections import defaultdict, namedtuple from os.path import * from distutils import spawn # import the UCSC-specific library sys.path.append(join(dirname(__file__), "pyLib")) try: - from hgLib3 import getNonce, getCspMetaHeader, jsOnEventByIdF, jsInlineFinish + from hgLib import getNonce, getCspMetaHeader, jsOnEventByIdF, jsInlineFinish # cgiArgs, cgiSetup, cgiString, printContentType, printMenuBar, \ # sqlConnect, sqlQuery, errAbort, cfgOption, runCmd, cgiGetAll, printHgcHeader, \ # printHgcSection, # webStartGbNoBanner, htmlPageEnd, hConnectCentral, sqlTableExists, \ # readSmallFile except: print("Content-type: text/html\n") print("Cannot find the directory cgi-bin/pyLib in Apache. This is an installation error.") print("All all parts of cgi-bin installed? Did you do 'make' in kent/src/hg/pyLib?") defaultDb = "hg19" -# The script works without mysql module but cannot auto-hide some tracks. +# the mysqldb module has to be installed with one of these commands: +# - many common linuxes and OSX: pip install mysqldb +# - debian: sudo apt-get install python-mysqldb +# - fedora/centos/redhat: sudo yum install python-mysqldb +# The script works without the mysqldb module but cannot auto-hide some tracks. mysqlDbLoaded = True try: - import pymysql.cursors + import MySQLdb except: mysqlDbLoaded = False # default mysql data dir on debian-based distros MYSQLDIR = "/var/lib/mysql" # can probably autodetect this, but hardcoded here APACHEUSER = "www-data" # optional file with rows to add to tableList TABLELISTADD = "/root/tableListAdd.hg19.tab" # directory for temporary files, keep trailing slash TMPDIR = "/tmp/hgMirror/" @@ -151,95 +156,95 @@ return hgConf def sqlConnect(db, name): """ connect to sql """ if name=="public": host, user, passwd = "genome-mysql.soe.ucsc.edu", "genomep", "password" elif name=="local": cfg = parseHgConf() host, user, passwd = cfg["db.host"], cfg["db.user"], cfg["db.password"] conn = MySQLdb.connect(host=host, user=user, passwd=passwd, db=db) return conn def debug(msg): if DEBUG: - print((msg+"
")) + print(msg+"
") sys.stdout.flush() def runningAtUcsc(): if "hgwdev" in socket.gethostname(): return True return False def runCmd(cmd, mustRun=True): " wrapper around os.system that makes sure sudo is not called " if runningAtUcsc() and cmd.startswith("sudo"): return 0 ret = os.system(cmd) if ret!=0 and mustRun: - print("Could not run command %s" % cmd) + print "Could not run command %s" % cmd sys.exit(0) return ret def loadGroups(db): """ load grp table via mysql and return as a list of tuples (name, label)""" groups = [] if mysqlDbLoaded: conn = sqlConnect(db, "public") cur = conn.cursor() cur.execute("SELECT name, label from grp order by priority") groups = [] for row in cur.fetchall(): groups.append((row[0], row[1])) cur.close() conn.close() else: for row in downloadTable(db, "grp"): groups.append((row[0], row[1])) return groups def downloadTable(db, table): """ download table from hgdownload by parsing sql file first to get the field names, then the tab sep file. Returns a list of objects, with field names as attributes and their values from the tab sep file. """ baseUrl = 'http://hgdownload.soe.ucsc.edu/goldenPath/%s/database/' % db # parse the .sql file and create a namedtuple "struct" for it sqlUrl = baseUrl+table+".sql" - sqlLines = urllib.request.urlopen(sqlUrl).read().splitlines() + sqlLines = urllib2.urlopen(sqlUrl).read().splitlines() fieldNames = [] for l in sqlLines: if l.strip().startswith("PRIMARY KEY"): continue if l.startswith(" "): fieldName = l.split()[0].strip("`") fieldNames.append(fieldName) Struct = namedtuple("rec", fieldNames) # read the tab-sep data # can use a cached copy from /tmp tmpFname = TMPDIR+db+"."+table+".txt.gz" if isfile(tmpFname): data = open(tmpFname) else: dataUrl = baseUrl+table+".txt.gz" - remoteData = urllib.request.urlopen(dataUrl).read() - data = io.StringIO(remoteData) # gunzipping requires to wrap a pseudo-file around the gzip data + remoteData = urllib2.urlopen(dataUrl).read() + data = StringIO.StringIO(remoteData) # gunzipping requires to wrap a pseudo-file around the gzip data # write to cache file tmpFh = open(tmpFname, "w") tmpFh.write(remoteData) tmpFh.close() data = gzip.GzipFile(fileobj=data).read() data = data.replace("\\\n", "\a") # translate escaped mysql newline to \a data = data.replace("\\\t", "\b") # translate escaped mysql tab to \b lines = data.split("\n") # convert tab-sep lines to namedtuples (=objects) rows = [] for line in lines: if len(line)==0: continue fields = line.split("\t") @@ -405,79 +410,79 @@ trackTables[track] = [track] # same for bigGeneDataUrl, only used on knownGene right now if "bigGeneDataUrl" in settings: trackBigDataFiles[track] = settings["bigGeneDataUrl"] trackTables[track] = [track] if track=="bacEndPairs": trackTables[track].add("all_bacends") return trackLabels, trackTables, trackParents, groups, pseudos, \ trackVis, superTracks, trackBigDataFiles def htmlHeader(): " print start of page " - print(""" + print """ %s UCSC Genome Browser mirror tool - """ % (getCspMetaHeader())) - print(open("../htdocs/inc/globalNavBar.inc").read()) + """ % (getCspMetaHeader()) + print open("../htdocs/inc/globalNavBar.inc").read() sys.stdout.flush() def htmlFooter(): " print end of page " - print(""" + print """ - """) + """ def findTopParent(trackParents, trackName): " recursively search for the top level parent of a track " if trackName not in trackParents: return trackName return findTopParent(trackParents, trackParents[trackName]) def makeTrackHierarchy(trackParents): """ given a dict with track->parent return dict with parent->list of child tracks. """ debug("hierarchy") trackChildren = dict() - for track, parent in trackParents.items(): + for track, parent in trackParents.iteritems(): if parent not in trackChildren: trackChildren[parent] = [] trackChildren[parent].append(track) return trackChildren def getAllChildren(trackName, trackChildren): """ given track name and hierarchy info, return list of all children (recursive) """ if trackName not in trackChildren: return [] children = trackChildren[trackName] - assert(type(children) is list) + assert(type(children) is types.ListType) tracks = [] tracks.extend(children) for child in children: grandkids = getAllChildren(child, trackChildren) tracks.extend(grandkids) return tracks def getTrackTables(trackName, trackTables, trackChildren): """ return list of all track or table names required for a track, this includes: - tables that are split or specified via the trackDb "table" statement - track names that do not have a table associated with them - the same for all sub tracks of the track """ @@ -516,31 +521,31 @@ return sizeStr #def freespace(p): #""" Returns the number of free bytes on the drive that p is on """ # does not make a lot of sense in virtual box, with a virtual disk that is auto-extending #s = os.statvfs(p) #return s.f_bsize * s.f_bavail def trackToFiles(trackName, trackTables, trackChildren, tableSizes, tableToGbdbFiles, noTableTracks, gbdbSizes): " return list of mysql tables and gbdb files for a given trackName " tableNames = [] gbdbFiles = [] # these tracks don't have any tables, but all gbdb files if trackName=="allGbdb": - return [], list(gbdbSizes.keys()) + return [], gbdbSizes.keys() elif trackName=="liftOver": return [], tableToGbdbFiles.get("liftOver", []) elif trackName=="nonTrackTables": return trackChildren.get("nonTrackTables", []), [] trackTableNames = getTrackTables(trackName, trackTables, trackChildren) # this predef track doesn't have any gbdb files if trackName=="allTables": return trackTableNames, [] for table in trackTableNames: if table in noTableTracks: continue tableNames.append(table) @@ -647,31 +652,31 @@ gbdbSizeStr = humanReadable(gbdbSize) gbdbFCount = len(gbdbFnames) gbdbStr="" if gbdbFCount!=0: gbdbStr = " + %(gbdbFCount)d gbdb files, %(gbdbSizeStr)s" % locals() debug("tables: "+str(trackTableNames)) debug("gbdbFiles: "+str(gbdbFnames)) outStr = '%(indentStr)s'\ '%(label)s ' \ '(%(trackName)s): %(tableStr)s, %(sizeStr)s%(gbdbStr)s%(status)s %(addNote)s
' % locals() if isGrey: outStr = ''+outStr+'' - print(outStr) + print outStr indent += 1 if showSubtracks: for subTrack in trackChildren.get(trackName, []): printTrackInfo(db, subTrack, trackTables, trackLabels, localSizes, trackChildren, \ tableSizes, trackToGbdbFiles, gbdbSizes, \ showSubtracks, indent, noTableTracks) def getRevokedTracks(db): " many encode tracks have been revoked or renamed. We ignore them. " outFname = join(TMPDIR,"%s.revokedTrackes" % db) if isfile(outFname): tables = set(open(outFname).read().splitlines()) else: @@ -696,71 +701,71 @@ return tables def htmlDropBox(selName, elList, selKey): """ print html dropdown box with a selected default element and auto-reload on selection print html dropbox box with (key, desc) elList, selKey is the key of the selected element. the name of the dropbox variable is selName and upon selection, a page refresh will be issued with the new value in selName """ addStr = "" if selName=="clade": addStr += "document.orgForm.org.value = 0; " if selName=="org" or selName=="clade": addStr += "document.orgForm.db.value = 0; " - print('''''' % (selName, selName) jsOnEventByIdF("change", selName, "document.orgForm.%s.value = document.mainForm.%s.options[document.mainForm.%s.selectedIndex].value; %s document.orgForm.submit();", selName, selName, selName, addStr) for row in elList: elKey, desc = row selStr = "" if elKey==selKey: selStr = " SELECTED" - print('%s' % (selStr, elKey, desc)) - print('') + print '%s' % (selStr, elKey, desc) + print '' def printHiddenForm(clade, org, db): " hidden form is needed for javascript " - print(""" + print """ - """ % (clade, org, db)) + """ % (clade, org, db) def getHgDownloadDbs(): " get all dbs available from hgdownload " fname = TMPDIR+"hgdownload.dbs.txt" if not isfile(fname): proc = subprocess.Popen(['rsync','hgdownload.soe.ucsc.edu::mysql'],stdout=subprocess.PIPE) dbs = set() for line in proc.stdout: db = line.rstrip("\n").split()[-1] dbs.add(db) open(fname, "w").write("\n".join(dbs)) else: dbs = set(open(fname).read().splitlines()) return dbs def getOrgInfo(validDbs): " get db info, either from local db or cached version " fname = TMPDIR+"orgInfo.cache" if isfile(fname): - return pickle.load(open(fname)) + return cPickle.load(open(fname)) defaultClade = "mammal" conn = sqlConnect(hgConf.get("central.db", "hgcentral"), "local") cur = conn.cursor() # list of all clade descriptions ("mammal", "Mammal"), ... cladeList = [] cur.execute("SELECT name, label FROM clade ORDER BY priority") for row in cur.fetchall(): cladeList.append((row[0], row[1])) # dict of clade -> list (orgName, orgName) cladeToOrgs = defaultdict(list) #cur.execute('SELECT clade, genome FROM genomeClade ORDER BY genome') cur.execute('SELECT clade, genomeClade.genome, name FROM genomeClade, dbDb where dbDb.genome=genomeClade.genome AND active=1 ORDER BY orderKey;') doneOrgs = set() @@ -793,177 +798,177 @@ # dict of db -> description orgToDbs = defaultdict(list) cur.execute('SELECT genome, name, description FROM dbDb WHERE active=1 order by orderKey;') for row in cur.fetchall(): rowDb = row[1] if rowDb not in validDbs: continue orgToDbs[row[0]].append((row[1], row[2])) # default db per orgName, {"Human":"hg19"} orgToBestDb = {} cur.execute('SELECT genome, name FROM defaultDb') for row in cur.fetchall(): orgToBestDb[row[0]] = row[1] # add every genome that has only one db - for org, dbs in orgToDbs.items(): + for org, dbs in orgToDbs.iteritems(): if len(dbs)==1 and org not in orgToBestDb: orgToBestDb[org] = dbs[0][0] cur.close() conn.close() orgInfo = {} orgInfo["clades"] = cladeList #orgInfo["orgs"] = orgList orgInfo["cladeToOrgs"] = cladeToOrgs orgInfo["orgToDbs"] = orgToDbs orgInfo["orgToBestDb"] = orgToBestDb orgInfo["cladeToBestOrg"] = cladeToBestOrg orgInfo["defaultClade"] = defaultClade - pickle.dump(orgInfo, open(fname, "w")) + cPickle.dump(orgInfo, open(fname, "w")) return orgInfo def getCladeAssemblyDb(clade, org, db): """ return the list of clades, organisms and assemblies and default DBs. any of clade, org or db can be None. Return values for clade, org, db so all are valid strings """ if not mysqlDbLoaded: print("info: MySQLDb not installed, cannot retrieve hgcentral.dbDb, using internal defaults") orgInfo = {} orgInfo["clades"] = [("mammal", "Mammal")] orgInfo["cladeToOrgs"] = {"mammal":[("Human", "Human")]} orgInfo["cladeToBestOrg"] = {"mammal":"Human"} orgInfo["orgToDbs"] = {"Human":["hg19"]} orgInfo["dbs"] = [("hg19", "Human (GrCh37)"), ("mm9", "Mouse (NCBI37)")] return orgInfo, "mammal", "Human", "hg19" validDbs = getHgDownloadDbs() orgInfo = getOrgInfo(validDbs) if db is not None and clade is None and org is None: dbToClade = dict() # search for the right org for this db - for o, dbList in list(orgInfo["orgToDbs"].items()): + for o, dbList in orgInfo["orgToDbs"].items(): for orgDb, desc in dbList: if orgDb==db: org = o break # search for the right clade for this org - for c, orgList in list(orgInfo["cladeToOrgs"].items()): + for c, orgList in orgInfo["cladeToOrgs"].items(): for o, _ in orgList: if o==org: clade = c break if clade is None or clade=="0": clade = orgInfo["defaultClade"] if org is None or org=="0" and clade not in [None, "0"]: org = orgInfo["cladeToBestOrg"][clade] if (db==None or db=="0"): if org not in orgInfo["orgToBestDb"]: - print("organism is not valid:", org) + print "organism is not valid:", org sys.exit(0) db = orgInfo["orgToBestDb"][org] if not db in validDbs: - print("error: db %s does not exist on hgdownload" % db) + print "error: db %s does not exist on hgdownload" % db org = "Human" clade = "mammal" db = "hg19" return orgInfo, clade, org, db def htmlDbSelector(orgInfo, clade, org, db): " print dropdown boxes with clade, assembly, DBs and refresh when selected " - print('') + print '' + print '' printHiddenForm(clade, org, db) def htmlTrackTable(db, trackLabels, trackTables, \ trackParents, trackChildren, groupList, groupToTopTracks, \ tableSizes, localSizes, gbdbSizes, trackToGbdbFiles, showSubtracks, noTableTracks): " print list of track sizes/tablecount as a html form, sorted by group " myUrl = basename(__file__) - print('Locally mirrored tracks are faster to browse than tracks that are accessed through the internet.
') - print('Select any number of tracks from the list below and click "Download" when finished.
') - print('The data will be downloaded from the UCSC servers with rsync and copied to the local mysql database and %s.

' % getGbdbDir()) + print 'Locally mirrored tracks are faster to browse than tracks that are accessed through the internet.
' + print 'Select any number of tracks from the list below and click "Download" when finished.
' + print 'The data will be downloaded from the UCSC servers with rsync and copied to the local mysql database and %s.

' % getGbdbDir() htmlStats(localSizes, gbdbSizes, tableSizes) - print('

' % myUrl) - print('') + print '' % myUrl + print '' if showSubtracks: - print('hide subtracks and show predefined groups
' % (myUrl, db)) + print 'hide subtracks and show predefined groups
' % (myUrl, db) del groupToTopTracks["special"] else: - print('show subtracks
' % (myUrl, db)) + print 'show subtracks
' % (myUrl, db) - print("

Track groups

") - print("

Track groups

{}

{}

" for groupName, groupLabel in groupList: # skip empty groups like custom if len(groupToTopTracks[groupName])==0: continue - print('

Group: %s

' % ( groupName, groupLabel)) + print '

Group: %s

' % ( groupName, groupLabel) for trackName in groupToTopTracks[groupName]: printTrackInfo(db, trackName, trackTables, trackLabels, localSizes, \ trackChildren, tableSizes, trackToGbdbFiles, gbdbSizes, showSubtracks, 0, noTableTracks) - print('

') - print('' % db) - print('') - print('

') + print '

' + print '' % db + print '' + print '' def downloadCache(url, cacheFname): " download file from url or open local cached copy. Return list of lines " cachePath = TMPDIR+cacheFname if isfile(cachePath): return open(cachePath).read().splitlines() try: - data = urllib.request.urlopen(url).read() - except urllib.error.HTTPError: - print("info: Could not find %s. bigWig/bigBed/bam files will be skipped.
" % url) + data = urllib2.urlopen(url).read() + except urllib2.HTTPError: + print "info: Could not find %s. bigWig/bigBed/bam files will be skipped.
" % url data = None if data is not None and url.endswith(".gz"): - data = io.StringIO(data) # gunzipping requires to wrap a pseudo-file around the gzip data + data = StringIO.StringIO(data) # gunzipping requires to wrap a pseudo-file around the gzip data data = gzip.GzipFile(fileobj=data).read() # only create cache file if we got some data if data == None: data = "" else: cacheFh = open(cachePath, "wb") cacheFh.write(data) cacheFh.close() return data.splitlines() def linkTrackToGbdb(fnames, db, tableNames, bigDataFiles): """ do some educated guessing on the gbdb files<->track links. returns a dict track -> list of files @@ -1048,61 +1053,61 @@ #if "liftOver" in fname: #print fname, "
" #print "table", table, "
" orphanFnames = set(fnames) - set(assignedFnames) for fname in sorted(orphanFnames): debug("unassigned gbdb file: "+fname) #print assignedFnames misassignedTables = set(tableFiles) - set(tableNames) for table in sorted(misassignedTables): debug("not existing table: "+table) debug("for files: "+",".join(tableFiles[table])) # add the bigDataUrl files from trackDb - for trackName, bigDataFile in bigDataFiles.items(): + for trackName, bigDataFile in bigDataFiles.iteritems(): fname = bigDataFile.replace("/gbdb/%s/" % db, "") tableFiles[trackName].append(fname) # also add indexes for vcf or bam files tbiFname = fname+".tbi" baiFname = fname+".bai" if tbiFname in fnames: tableFiles[trackName].append(tbiFname) if baiFname in fnames: tableFiles[trackName].append(baiFname) return tableFiles def getRsyncSizes(dataType, db): """ debug("rsync sizes") if dataType is "mysql: return dict with tableName:size for given db (includes indexes, frm + data) if dataType is "gbdb": return dict with filaname:size for given db """ # run rsync command tmpFname = TMPDIR+"%s_%s.rsync.txt" % (dataType, db) if not isfile(tmpFname): cmd = "rsync -nav rsync://hgdownload.soe.ucsc.edu/%s/%s/ > %s.new && mv %s.new %s" % (dataType, db, tmpFname, tmpFname, tmpFname) ret = runCmd(cmd, mustRun=False) if ret!=0: - print("

") - print("Cannot run %s
" % cmd) - print("

") - print("It seems that the rsync server hgdownload.soe.ucsc.edu is not available
") - print("Please check your network connection. If this problem persists send email to genome@soe.ucsc.edu
") + print "

" + print "Cannot run %s
" % cmd + print "

" + print "It seems that the rsync server hgdownload.soe.ucsc.edu is not available
" + print "Please check your network connection. If this problem persists send email to genome@soe.ucsc.edu
" sys.exit(0) # parse rsync output file tableSizes = collections.defaultdict(int) for line in open(tmpFname): ## rsync output looks like this: # receiving incremental file list # drwxr-xr-x 1875968 2013/11/23 23:37:59 . # -rw-rw-r-- 2031084 2011/01/04 14:55:26 HInv.MYD fields = line.rstrip("\n").split() if len(fields)!=5: continue if fields[0][0]=="d": continue @@ -1136,31 +1141,31 @@ #var = fields[1] #targets = set(fields[-1].split(",")) #resolvedTargets = [] #for t in targets: #if t.startswith("$"): #resolvedTargets.extend(sets[t[1:]]) #else: #resolvedTargets.append(t) #sets[var] = resolvedTargets def udrIsUsable(): " return true if we can use udr " # http://stackoverflow.com/a/12611523/233871 # this is doing he same as the unix which command if spawn.find_executable("rsync") is None: - print("ERROR: could not find the rsync executable in the PATH") + print "ERROR: could not find the rsync executable in the PATH" sys.exit(0) # we currently don't use udr: it often fails as the port is not open # it's very hard to detect if it fails or not # return False # check if we can find udr binary udrPath = spawn.find_executable("udr") if udrPath is None: return False # created by /etc/rc.local if isfile(TMPDIR+"useUdr"): return True @@ -1234,156 +1239,156 @@ logFh.write("\nrsync output:\n") logFh.close() # write jobId to status file idFh = open(TMPDIR+"lastJob.log", "w") idFh.write(str(jobId)) idFh.close() # run commands print("Starting udr/rsync command script...") runCmd(cmd) def refreshPage(paramStr, delay=5, addNote=False): " refresh current CGI page using javascript " newUrl = basename(__file__)+"?"+paramStr - print(""" + print """ - """ % (getNonce(), newUrl, delay)) + """ % (getNonce(), newUrl, delay) if addNote: - print(("""Redirecting to

%s""" % (newUrl, newUrl))) + print("""Redirecting to %s""" % (newUrl, newUrl)) def jobsAreDone(): " True if no jobs currently in the 'at' queue " cmd = "at -l | wc -l > %satWc.txt" % TMPDIR runCmd(cmd) lineCount = int(open(TMPDIR+"atWc.txt").read().strip()) os.remove(TMPDIR+"atWc.txt") return lineCount==0 def printLog(jobId): " print rsync log for given jobId to stdout " jobFname = TMPDIR+"%d.log" % int(jobId) - print("rsync download commands:

") - print("

")
+    print "rsync download commands:"
+    print "
"
     if not isfile(jobFname):
-        print("This download job is not active anymore.")
-        print('Start a new download')
+        print "This download job is not active anymore.
"
+        print 'Start a new download'
         sys.exit(0)
 
     lines = open(jobFname).read().splitlines()
     for line in lines[-30:]:
-        print(line)
+        print line
 
-    print("")
+    print "

" if jobsAreDone(): - print('Back to Genome Browser

') - print('Download more tracks') + print 'Back to Genome Browser

' + print 'Download more tracks' else: - print('Downloading files... Page will reload every 4 seconds until download is complete.') - print('Cancel download now') + print 'Downloading files... Page will reload every 4 seconds until download is complete.' + print 'Cancel download now' refreshPage("jobId=%d" % jobId, delay=4000, addNote=False) def removeSomeTracksFromSearch(conn): cur = conn.cursor() for table in REMOVESEARCH: query = "DELETE FROM hgFindSpec WHERE searchTable='%s';" % table try: cur.execute(query) except MySQLdb.ProgrammingError: pass cur.close() def hideSomeTracks(db, conn, trackNames): """ hide some notoriously slow tracks by default """ if not mysqlDbLoaded: - print("warning: cannot hide some tracks, module mysqldb not installed") + print "warning: cannot hide some tracks, module mysqldb not installed" return # do we need this? # if "trackDb" not in localTables: #return # find all conservation tracks hideTracks = [] for t in trackNames: if (t.startswith("multiz") or t.startswith("cons")) and t.endswith("way"): hideTracks.append(t) elif t in FORCEHIDE: hideTracks.append(t) hideList = ["'"+s+"'" for s in hideTracks] hideStr = ", ".join(hideList) hideStr = "(%s)" % hideStr cur = conn.cursor() try: cur.execute("SELECT 1 FROM trackDb LIMIT 1;") except: return query = "UPDATE trackDb SET visibility=0 WHERE tableName in %s" % hideStr try: cur.execute(query) except: - print("could not execute query %s in database %s" % (query, db)) + print "could not execute query %s in database %s" % (query, db) cur.close() def getGbdbDir(): " return local gbdb dir without trailing slash " gbdbLoc = parseHgConf().get("gbdbLoc1", "/gbdb") gbdbLoc = gbdbLoc.rstrip("/") return gbdbLoc def getLocalSizes(db): """ return a dict with table name -> total size of a mysql directory and filename -> size for the gbdb dir (gbdb filenames are relative to the gbdb/db directory) """ path = join(MYSQLDIR, db) sizes = defaultdict(int) if runningAtUcsc(): return sizes sqlFh = tempfile.NamedTemporaryFile() sqlFname = sqlFh.name cmd = 'sudo -u mysql /bin/ls -l %s > %s' % (path, sqlFname) ret = runCmd(cmd, mustRun=False) if ret!=0: if runningAtUcsc(): - print("info: cannot read file sizes from %s, local file sizes unknown
" % path) + print "info: cannot read file sizes from %s, local file sizes unknown
" % path return sizes for line in open(sqlFname): fields = line.strip().split() if len(fields)<4: continue size = int(fields[4]) fname = fields[-1] fileNoExt = splitext(basename(fname))[0] sizes[fileNoExt] += size if len(sizes)==0: - print("(warning: local directory %s seems to be empty)" % path) + print "(warning: local directory %s seems to be empty)" % path gbdbDir = join(getGbdbDir(), db) if isdir(gbdbDir): # get the size of all gbdb files gbdbFh = tempfile.NamedTemporaryFile() gbdbFname = gbdbFh.name # trailing slash is important cmd = 'find %s/ -type f > %s ' % (gbdbDir, gbdbFname) runCmd(cmd) fnames = open(gbdbFname).read().splitlines() for fname in fnames: relName = fname.replace(gbdbDir+"/", "") sizes[relName] = getsize(fname) return sizes @@ -1396,61 +1401,61 @@ runCmd(cmd) def checkGbdbMysqlAccess(): """ check if we have write access to gbdb and mysql dir and can run the at command """ msg = """

        www-data     ALL = (mysql:mysql) NOPASSWD: /usr/local/bin/udr,/bin/ls,/usr/bin/rsync,/bin/rm
         www-data     ALL = (root:root) NOPASSWD: /bin/mkdir /data/gbdb
         www-data     ALL = (root:root) NOPASSWD: /bin/chown www-data.www-data /data/gbdb

""" # check if we can write to gbdb tmpFname = "%s/test.tmp" % getGbdbDir() try: open(tmpFname, "w") except IOError: - print("This program cannot write to the %s directory. Please make sure that the apache user has permission to write to %s
" % (getGbdbDir(), getGbdbDir())) - print('Use "sudo visudo" to add these lines to /etc/sudoers:
') - print(msg) - print('Or check the directory permissions if the directory already exists.
') + print "This program cannot write to the %s directory. Please make sure that the apache user has permission to write to %s
" % (getGbdbDir(), getGbdbDir()) + print 'Use "sudo visudo" to add these lines to /etc/sudoers:
' + print msg + print 'Or check the directory permissions if the directory already exists.
' sys.exit(0) # check if we can rsync to mysql tmpFname2= "%s/test.tmp" % MYSQLDIR cmd = "sudo -u mysql rsync %s %s" % (tmpFname, tmpFname2) ret = runCmd(cmd, mustRun=False) if ret!=0: - print("Could not run %s
" % cmd) - print("""Cannot run rsync as the mysql user. Please make sure that you have these lines in /etc/sudoers:
""") - print(msg) + print "Could not run %s
" % cmd + print """Cannot run rsync as the mysql user. Please make sure that you have these lines in /etc/sudoers:
""" + print msg sys.exit(0) # cleanup the two tmp files os.remove(tmpFname) cmd = "sudo -u mysql rm %s" % tmpFname2 ret = runCmd(cmd, mustRun=False) if ret!=0: os.remove(tmpFname2) # check if we can run "at" cmd = "echo echo hi | at -M now" ret = runCmd(cmd, mustRun=False) if ret != 0: - print("Could not run %s
" % cmd) - print("It looks like we cannot run the 'at' command
") - print("You might have to remove %s from /etc/at.deny" % APACHEUSER) + print "Could not run %s
" % cmd + print "It looks like we cannot run the 'at' command
" + print "You might have to remove %s from /etc/at.deny" % APACHEUSER sys.exit(0) def trackToTableNames(tableNames): " handle split tables and some spec cases by creating a dictionary trackName -> tableNames " # as a heuristic to detect split tables, first get the counts of all suffixes # after the underscore suffixCounts = defaultdict(int) for t in tableNames: if "_" in t and not t.startswith("uniGene"): prefix, track = string.split(t, "_", maxsplit=1) suffixCounts[track] += 1 suffixDict = defaultdict(set) for t in tableNames: if "_" in t and not t.startswith("uniGene"): @@ -1482,149 +1487,149 @@ def findImportantTables(trackSizes): """ some tables are important for some assemblies, we always add them """ tables = set() for t in FORCETABLES: if t in trackSizes: tables.add(t) return tables def htmlStats(localSizes, gbdbSizes, tableSizes): locTotal = humanReadable(sum(localSizes.values())) gbdbTotal = humanReadable(sum(gbdbSizes.values())) tableTotal = humanReadable(sum(tableSizes.values())) - print(("Total size at UCSC: mysql tables %(tableTotal)s, gbdb files %(gbdbTotal)s
" % locals())) - print(("Total size of tables and gbdb on local disk: %(locTotal)s

" % locals())) + print("Total size at UCSC: mysql tables %(tableTotal)s, gbdb files %(gbdbTotal)s
" % locals()) + print("Total size of tables and gbdb on local disk: %(locTotal)s

" % locals()) def addPredefined(trackVis, superTracks, groupList, groupToTopTracks, trackLabels, \ trackChildren, tableToGbdbFiles, trackTables, tableSizes, revokedTracks): """ add special predefined track groups in place, modifies the last five parameters These groups are special "tracks" that don't exist, their children are the real tracks """ # get all track that are not hidden defaultTracks = [] - for track, vis in trackVis.items(): + for track, vis in trackVis.iteritems(): if vis!="hide": defaultTracks.append(track) # go down the hierarchy and let "hide" on higher levels override the lower ones # remove all tracks that are somehow hidden by higher tracks from the first list defaultTracks = set(defaultTracks) - for topTracks in list(groupToTopTracks.values()): + for topTracks in groupToTopTracks.values(): for topTrack in topTracks: if trackVis.get(topTrack, None)=="hide": for child in getAllChildren(topTrack, trackChildren): if child in defaultTracks: defaultTracks.remove(child) # also remove all superTracks from this list, otherwise will pull in all of encode again # somehow visibility/inheritance works this way defaultTracks = defaultTracks-set(superTracks) # need lists, not sets defaultTracks = list(defaultTracks) debug(",".join(defaultTracks)) # create the two other sets of tracks, based on defaultTracks: defaultNoCons = [] for t in defaultTracks: if "multiz" not in t and not t.startswith("cons"): defaultNoCons.append(t) nonEncode = [] for track in trackVis: if track in defaultTracks or not track.startswith("wgEncode"): nonEncode.append(track) #nonEncAllTables = [t for t in tableSizes.keys() if not t.startswith("wgEncode")] # create a set of table names that are not linked to any trackDb entry # e.g. gbExtFile or extFile tables allTrackTables = set() - for tableList in list(trackTables.values()): # flatten list + for tableList in trackTables.values(): # flatten list allTrackTables.update(tableList) #print "

", revokedTracks nonTrackTables = list(set(tableSizes) - set(allTrackTables) - revokedTracks) # hack to remove the ATTIC removed encode files, they're not in metaDb nonTrackTables = [t for t in nonTrackTables if not t.startswith("wgEncode") and not "Gencode" in t] #print "

", nonTrackTables # XX groupList.insert(0, ("special", "Predefined tracks sets")) groupToTopTracks["special"] = ["defaultNoCons", "defaultConsTables", \ "default", "nonEncode", "liftOver", "nonTrackTables", "allTables", "allGbdb"] trackLabels["defaultNoCons"] = "Default tracks without conservation" trackLabels["defaultConsTables"] = "Default tracks with conservation tables, but no alignments" trackLabels["default"] = "Default tracks" trackLabels["nonEncode"] = "Default tracks plus all non-Encode tracks" trackLabels["liftOver"] = "Liftover files" #trackLabels["nonEncAllTables"] = "Non-Encode tracks" trackLabels["nonTrackTables"] = "Secondary database tables not assigned to any track" trackLabels["allTables"] = "All database tables" trackLabels["allGbdb"] = "All non-database binary/text files" trackChildren["defaultNoCons"] = defaultNoCons trackChildren["defaultConsTables"] = defaultTracks trackChildren["default"] = defaultTracks trackChildren["nonEncode"] = nonEncode #trackChildren["liftOver"] = trackToGbdbFiles["liftOver"] #trackChildren["nonEncAllTables"] = nonEncAllTables trackChildren["nonTrackTables"] = nonTrackTables - trackChildren["allTables"] = list(tableSizes.keys()) + trackChildren["allTables"] = tableSizes.keys() def stopAllJobs(): """ stop all waiting at jobs and the currently running download job """ cmd = "atrm `at -l | cut -f1`" runCmd(cmd) if isfile(TMPDIR+"lastJob.pid"): lastPid = open(TMPDIR+"lastJob.pid").read().strip() cmd = "sudo kill -- -%s" % lastPid runCmd(cmd) # make sure mysql is started again cmd = 'sudo service mysql start' runCmd(cmd) def assertEmptyQueue(): """ if the at queue is not empty, show link to log file of running job and stop program """ if jobsAreDone(): return jobId = open(TMPDIR+"lastJob.log").read() - print("There is still a download job running
") - print('Show download job' % jobId) + print "There is still a download job running
" + print 'Show download job' % jobId sys.exit(0) # the list of allowed chars in cgi args: digits, letters and dashes legalChars = set(string.digits) -legalChars.update(set(string.ascii_letters)) +legalChars.update(set(string.letters)) legalChars.update("_-.()/: ") def mustBeClean(str): """ make sure a string contains only letters and digits """ if str==None: return str - str = urllib.parse.unquote(str) + str = urllib.unquote(str) str = str.strip() for s in str: if s not in legalChars: - print("illegal character in CGI parameter") + print "illegal character in CGI parameter" sys.exit(0) return str #def getDb(args, org): #""" get Db from either CGI args or hgcentral, makes sure that DB actually exists """ #db = mustBeClean(args.getvalue("db", default=None)) #if db=="0": #db = None # #if db!=None: #if mysqlDbLoaded: #import _mysql_exceptions #try: #conn = sqlConnect(db, "public") #except _mysql_exceptions.OperationalError: @@ -1666,173 +1671,173 @@ cur.execute(query) except MySQLdb.ProgrammingError: return None for row in cur.fetchall(): names.append(row[0]) return names def postRsyncChanges(db, localSizes): " remove some tracks from trackDb, others from hgFindSpec and add a few row to tableList" if not mysqlDbLoaded: return try: conn = sqlConnect(db, "local") tracks = getAllTrackNames(conn) if tracks is None: - print(("mysql database %s has no trackDb" % db)) + print("mysql database %s has no trackDb" % db) else: hideSomeTracks(db, conn, tracks) removeSomeTracksFromSearch(conn) addTableList(db, conn) conn.close() - except MySQLdb.OperationalError as e: - print("Could not run postRsyncChanges on db %s" % db) + except MySQLdb.OperationalError, e: + print "Could not run postRsyncChanges on db %s" % db return def reloadPage(): " trigger page reload with javascript " - print(""" + print """ - """ % (getNonce())) + """ % (getNonce()) sys.stdout.flush() sys.exit(0) def htmlMiddle(args): " print html middle part " clade = getCgiVar(args, "clade") org = getCgiVar(args, "org") db = getCgiVar(args, "db", defaultDb) orgInfo, clade, org, db = getCladeAssemblyDb(clade, org, db) - if not "jobId" in list(args.keys()): + if not "jobId" in args.keys(): # load all the trackDb info, track<->file relationships and file sizes cacheFname = TMPDIR+"%s.trackDataCache" % db marshalError = False if isfile(cacheFname): # use cached results, if we have them try: groupList, trackLabels, trackTables, trackParents, groupToTopTracks, noTableTracks,\ trackVis, superTracks, tableSizes, trackChildren, gbdbSizes, \ - tableToGbdbFiles = pickle.load(open(cacheFname)) + tableToGbdbFiles = cPickle.load(open(cacheFname)) except ValueError: os.remove(cacheFname) reloadPage() else: # parse trackDb: takes up to 10 seconds via transatlantic link so save the result debug("parsing") - print(("Creating list of downloadable tracks for database %s ... please wait ..." % db)) + print("Creating list of downloadable tracks for database %s ... please wait ..." % db) sys.stdout.flush() groupList = loadGroups(db) tableSizes = getRsyncSizes("mysql", db) gbdbSizes = getRsyncSizes("gbdb", db) trackLabels, trackTables, trackParents, groupToTopTracks, \ noTableTracks, trackVis, superTracks, bigDataFiles = parseTrackDb(db, tableSizes) trackChildren = makeTrackHierarchy(trackParents) tableToGbdbFiles = linkTrackToGbdb(gbdbSizes, db, tableSizes, bigDataFiles) allData = groupList, trackLabels, trackTables, trackParents, \ groupToTopTracks, noTableTracks, trackVis, superTracks, \ tableSizes, trackChildren, gbdbSizes, tableToGbdbFiles - pickle.dump(allData, open(cacheFname, "w")) + cPickle.dump(allData, open(cacheFname, "w")) reloadPage() sys.exit(0) # when user has clicked OK, run rsync and refresh with jobId - if "submit" in list(args.keys()) and args["submit"].value=="Download": + if "submit" in args.keys() and args["submit"].value=="Download": if runningAtUcsc(): - print("
Cannot do this on hgwdev. Copying things from hgdownload to hgwdev is not a good idea.") + print "
Cannot do this on hgwdev. Copying things from hgdownload to hgwdev is not a good idea." sys.exit(0) makeGbdb() checkGbdbMysqlAccess() trackList = set(args.keys()) for t in trackList: mustBeClean(t) trackList.remove("submit") trackList.remove("db") forceTables = findImportantTables(tableSizes) jobId = int(random.random()*1000000) revokedTracks = getRevokedTracks(db) addPredefined(trackVis, superTracks, groupList, groupToTopTracks, trackLabels, trackChildren, \ tableToGbdbFiles, trackTables, tableSizes, revokedTracks) listFname, gbdbFname, fixedFname = makeTableFileList(jobId, db, trackList, trackTables, trackChildren, \ tableToGbdbFiles, tableSizes, forceTables, noTableTracks, gbdbSizes) runRsyncJobs(jobId, db, listFname, gbdbFname, fixedFname) refreshPage("jobId=%d" % jobId, addNote=True) # show list of files if told to do so - elif "showFiles" in list(args.keys()) and "db" in list(args.keys()): + elif "showFiles" in args.keys() and "db" in args.keys(): revokedTracks = getRevokedTracks(db) addPredefined(trackVis, superTracks, groupList, groupToTopTracks, trackLabels, trackChildren, \ tableToGbdbFiles, trackTables, tableSizes, revokedTracks) trackName = args["showFiles"].value tables, gbdbFiles = trackToFiles(trackName, trackTables, trackChildren, \ tableSizes, tableToGbdbFiles, noTableTracks, gbdbSizes) totalSize = 0 - print("

MySQL tables linked to %s

" % trackName) + print "

MySQL tables linked to %s

" % trackName for table in tables: if table not in tableSizes: continue size = tableSizes[table] sizeStr = humanReadable(size) totalSize += size - print(table+" (%s)
" % sizeStr) + print table+" (%s)
" % sizeStr - print("

GBDB files linked to %s

" % trackName) + print "

GBDB files linked to %s

" % trackName for gbdbFname in gbdbFiles: size = gbdbSizes[gbdbFname] totalSize += size sizeStr = humanReadable(size) - print(gbdbFname+" (%s)
" % sizeStr) + print gbdbFname+" (%s)
" % sizeStr totalSizeStr = humanReadable(totalSize) - print("

Total size: %s

" % totalSizeStr) + print "

Total size: %s

" % totalSizeStr sys.exit(0) # if we have a jobId in URL, show the rsync log - elif "jobId" in list(args.keys()): + elif "jobId" in args.keys(): if runningAtUcsc(): - print("cannot do this on hgwdev") + print "cannot do this on hgwdev" sys.exit(0) jobId = int(args["jobId"].value) printLog(jobId) # stop job if requested to do so - elif "stopAllJobs" in list(args.keys()): + elif "stopAllJobs" in args.keys(): stopAllJobs() - print("All download jobs have been stopped
") - print('Return to track selection') + print "All download jobs have been stopped
" + print 'Return to track selection' sys.exit(0) # show tracklist and change default tracks if no param else: assertEmptyQueue() - print('

UCSC genome browser track download tool

') + print '

UCSC genome browser track download tool

' htmlDbSelector(orgInfo, clade, org, db) showSubtracks = bool(int(args.getfirst("showSubtracks", "0"))) localSizes = getLocalSizes(db) revokedTracks = getRevokedTracks(db) addPredefined(trackVis, superTracks, groupList, groupToTopTracks, trackLabels, trackChildren, \ tableToGbdbFiles, trackTables, tableSizes, revokedTracks) htmlTrackTable(db, trackLabels, trackTables, trackParents, trackChildren, \ groupList, groupToTopTracks, \ tableSizes, localSizes, gbdbSizes, tableToGbdbFiles, showSubtracks, noTableTracks) #postRsyncChanges(db, trackTables) @@ -1846,45 +1851,45 @@ dbSet = set() for row in rows: dbSet.add(row[0]) # rows are always a list, even if just one value conn.close() dbSet = dbSet - set(['information_schema', 'customTrash', 'hgFixed', 'hgTemp', 'hgcentral', 'mysql','performance_schema']) return dbSet def getCookies(cookieDb): """ return db cookie value. Called usually before headers are printed. Default value can be sent as argument. """ cookie_string = os.environ.get('HTTP_COOKIE') if cookie_string: - cookie = http.cookies.SimpleCookie() + cookie = Cookie.SimpleCookie() cookie.load(cookie_string) if "db" in cookie: cookieDb = cookie["db"].value return cookieDb def setCookies(db): """ Send a cookie header to set the "db" cookie to the value specified """ - cookie = http.cookies.SimpleCookie() + cookie = Cookie.SimpleCookie() cookie['db'] = db cookie['db']['expires'] = 30 * 24 * 60 * 60 cookie['db']['comment'] = 'holds the last hgMirror database' - print(cookie) + print cookie def setDefaultDb(genome, db): " change the default db " query = 'UPDATE hgcentral SET name="%s" where genome="%s"' % (genome, db) conn = sqlConnect(hgConf.get("central.db", "hgcentral"), "local") cur = conn.cursor() cur.execute(query) conn.close() def main(): # hide some tracks if any argument specified. makes this tool usable from command line, # from cronjobs that only need to run this function if len(sys.argv)>1: dbSet = getAllLocalDbs() for db in dbSet: @@ -1895,42 +1900,42 @@ # the code above is not creating or using TMPDIR, as from the cronjob, we're # root but as a CGI we're the apache user and cannot write to root-owned directories. # Only create this directory when we're a CGI. if not isdir(TMPDIR): os.makedirs(TMPDIR) global defaultDb defaultDb = getCookies(defaultDb) args = cgi.FieldStorage() if "db" in args: setCookies(args["db"].value) - print("Content-type: text/html") - print() + print "Content-type: text/html" + print parseHgConf() if hgConf==None or not hgConf.get("allowHgMirror", "0").lower() in ["1", "yes", "true", "on"]: print("hgMirror is not activated on this machine
") print("Set allowHgMirror=1 in your cgi-bin/hg.conf file.
") print("In the Genome-Browser-in-a-box VM, use the command gbibMirrorTracksOn.") sys.exit(0) - if "debug" in list(args.keys()): + if "debug" in args.keys(): global DEBUG DEBUG = True - if "reset" in list(args.keys()): + if "reset" in args.keys(): for fname in glob.glob(TMPDIR+"*"): os.remove(fname) os.rmdir(TMPDIR) - print("temporary files deleted") + print "temporary files deleted" sys.exit(0) htmlHeader() htmlMiddle(args) jsInlineFinish() htmlFooter() main()