05e67c59a20a5d00b810a981aef3b00c5bef82e1
max
Fri Sep 20 06:03:18 2024 -0700
more features to hubtools: search in both parent and subdirs, better docs
diff --git src/hg/hgMirror/hgMirror src/hg/hgMirror/hgMirror
index 396bd17..98e25fb 100755
--- src/hg/hgMirror/hgMirror
+++ src/hg/hgMirror/hgMirror
@@ -1,86 +1,81 @@
-#!/usr/bin/env python2.7
+#!/usr/bin/env python3
# A little CGI interface to download the tables for a set of tracks via udr
# to local machine. This is mostly useful when setting up a mirror or a VM
# of the browser. It does not run on hgwdev.
# This script does the following:
# - get trackDb and grp table from hgDownload
# - get table and gbdb sizes from ucsc rsync server
# - get list with track<->filename for all bigfile tracks from hgwdev
# - try to assign table names to gbdb files using this list and some hacky rules
# - parse hg.conf to find mysql server and hide a few tracks in its trackDb
# - infer track/subtrack hierarchy by parsing trackDb
# - generate HTML table with labels/sizes/tablecounts for all tracks and their child tracks
# - when user clicks submit, start rsync transfer and redirect to page that shows progress
# - handles non-existing tables and some hgFixed tables
# When run from the command line, this CGI hides some tracks and removes tracks from
# track search on hg19. This is usually run from a cronjob after trackDb updates.
# This script requires the following setup
-# - mysqldb python module
# - "rsync" in path
# - "at" in path
# To allow this on ubuntu, add these lines to /etc/sudoers:
#www-data ALL = (mysql:mysql) NOPASSWD: /bin/ls,/usr/bin/rsync,/bin/rm
#www-data ALL = (root:root) NOPASSWD: /bin/mkdir /gbdb
#www-data ALL = (root:root) NOPASSWD: /bin/chown www-data.www-data /gbdb
# - the apache user has to be able to run 'at' jobs.
# To allow this on ubuntu, need to run this command to remove www-data from /etc/at.deny
# sudo sed -i s/www-data//g /etc/at.deny
# This script does not handle:
# tables joined to other tables are not downloaded. Would have to parse all.joiner for that.
# format python errors in html, as we're a CGI script
import cgi
import cgitb; cgitb.enable()
# these are Python standard-library modules, no errors expected here
-import urllib, urllib2, zlib, collections, StringIO, gzip, string, sys, os, random, \
- subprocess, re, types, socket, cPickle, copy, glob, tempfile, Cookie
+import urllib.request, urllib.parse, urllib.error, urllib.request, urllib.error, urllib.parse, zlib, collections, io, gzip, string, sys, os, random, \
+ subprocess, re, types, socket, pickle, copy, glob, tempfile, http.cookies
from collections import defaultdict, namedtuple
from os.path import *
from distutils import spawn
# import the UCSC-specific library
sys.path.append(join(dirname(__file__), "pyLib"))
try:
- from hgLib import getNonce, getCspMetaHeader, jsOnEventByIdF, jsInlineFinish
+ from hgLib3 import getNonce, getCspMetaHeader, jsOnEventByIdF, jsInlineFinish
# cgiArgs, cgiSetup, cgiString, printContentType, printMenuBar, \
# sqlConnect, sqlQuery, errAbort, cfgOption, runCmd, cgiGetAll, printHgcHeader, \
# printHgcSection,
# webStartGbNoBanner, htmlPageEnd, hConnectCentral, sqlTableExists, \
# readSmallFile
except:
print("Content-type: text/html\n")
print("Cannot find the directory cgi-bin/pyLib in Apache. This is an installation error.")
print("All all parts of cgi-bin installed? Did you do 'make' in kent/src/hg/pyLib?")
defaultDb = "hg19"
-# the mysqldb module has to be installed with one of these commands:
-# - many common linuxes and OSX: pip install mysqldb
-# - debian: sudo apt-get install python-mysqldb
-# - fedora/centos/redhat: sudo yum install python-mysqldb
-# The script works without the mysqldb module but cannot auto-hide some tracks.
+# The script works without mysql module but cannot auto-hide some tracks.
mysqlDbLoaded = True
try:
- import MySQLdb
+ import pymysql.cursors
except:
mysqlDbLoaded = False
# default mysql data dir on debian-based distros
MYSQLDIR = "/var/lib/mysql"
# can probably autodetect this, but hardcoded here
APACHEUSER = "www-data"
# optional file with rows to add to tableList
TABLELISTADD = "/root/tableListAdd.hg19.tab"
# directory for temporary files, keep trailing slash
TMPDIR = "/tmp/hgMirror/"
@@ -156,95 +151,95 @@
return hgConf
def sqlConnect(db, name):
    """
    connect to sql and return the open connection, the caller has to close it.

    db is the name of the database to select.
    name selects the server: "public" is the public UCSC MySQL server,
    "local" is the server described by db.host/db.user/db.password in hg.conf.
    Raises ValueError for any other name.
    """
    if name=="public":
        host, user, passwd = "genome-mysql.soe.ucsc.edu", "genomep", "password"
    elif name=="local":
        cfg = parseHgConf()
        host, user, passwd = cfg["db.host"], cfg["db.user"], cfg["db.password"]
    else:
        # without this, an unknown name fails further down with an unbound 'host'
        raise ValueError("unknown sql server name %r, expected 'public' or 'local'" % name)

    # this file imports pymysql (import pymysql.cursors), the MySQLdb module
    # is not imported anymore, so MySQLdb.connect would be a NameError
    conn = pymysql.connect(host=host, user=user, password=passwd, database=db)
    return conn
def debug(msg):
    " print msg as one line of HTML output and flush stdout, but only if DEBUG is set "
    if DEBUG:
        # NOTE(review): the string literal after msg was cut by a line break in
        # the reviewed copy of this file; "<br>" is assumed, as the output of
        # this CGI is HTML - confirm against the repository version
        print(msg+"<br>")
        sys.stdout.flush()
def runningAtUcsc():
    " return True if the name of this host contains 'hgwdev', False otherwise "
    hostName = socket.gethostname()
    return "hgwdev" in hostName
def runCmd(cmd, mustRun=True):
    """
    wrapper around os.system that makes sure sudo is not called at UCSC.

    Returns the status reported by os.system, or 0 if the command was skipped.
    If the status is not 0 and mustRun is set, prints a message and exits.
    """
    # sudo commands are silently skipped on the UCSC development host
    if runningAtUcsc() and cmd.startswith("sudo"):
        return 0

    status = os.system(cmd)
    if status==0 or not mustRun:
        return status

    print("Could not run command %s" % cmd)
    sys.exit(0)
def loadGroups(db):
    """
    return the rows of the grp table of db as a list of tuples (name, label).
    Queries the public mysql server if a mysql module could be imported,
    otherwise falls back to downloadTable().
    """
    if not mysqlDbLoaded:
        # no mysql module: use the first two fields of every downloaded row
        return [(rec[0], rec[1]) for rec in downloadTable(db, "grp")]

    conn = sqlConnect(db, "public")
    cur = conn.cursor()
    cur.execute("SELECT name, label from grp order by priority")
    groups = [(rec[0], rec[1]) for rec in cur.fetchall()]
    cur.close()
    conn.close()
    return groups
def downloadTable(db, table):
"""
download table from hgdownload by parsing sql file first to get the field
names, then the tab sep file. Returns a list of objects, with field names
as attributes and their values from the tab sep file.
"""
baseUrl = 'http://hgdownload.soe.ucsc.edu/goldenPath/%s/database/' % db
# parse the .sql file and create a namedtuple "struct" for it
sqlUrl = baseUrl+table+".sql"
- sqlLines = urllib2.urlopen(sqlUrl).read().splitlines()
+ sqlLines = urllib.request.urlopen(sqlUrl).read().splitlines()
fieldNames = []
for l in sqlLines:
if l.strip().startswith("PRIMARY KEY"):
continue
if l.startswith(" "):
fieldName = l.split()[0].strip("`")
fieldNames.append(fieldName)
Struct = namedtuple("rec", fieldNames)
# read the tab-sep data
# can use a cached copy from /tmp
tmpFname = TMPDIR+db+"."+table+".txt.gz"
if isfile(tmpFname):
data = open(tmpFname)
else:
dataUrl = baseUrl+table+".txt.gz"
- remoteData = urllib2.urlopen(dataUrl).read()
- data = StringIO.StringIO(remoteData) # gunzipping requires to wrap a pseudo-file around the gzip data
+ remoteData = urllib.request.urlopen(dataUrl).read()
+ data = io.StringIO(remoteData) # gunzipping requires to wrap a pseudo-file around the gzip data
# write to cache file
tmpFh = open(tmpFname, "w")
tmpFh.write(remoteData)
tmpFh.close()
data = gzip.GzipFile(fileobj=data).read()
data = data.replace("\\\n", "\a") # translate escaped mysql newline to \a
data = data.replace("\\\t", "\b") # translate escaped mysql tab to \b
lines = data.split("\n")
# convert tab-sep lines to namedtuples (=objects)
rows = []
for line in lines:
if len(line)==0:
continue
fields = line.split("\t")
@@ -410,79 +405,79 @@
trackTables[track] = [track]
# same for bigGeneDataUrl, only used on knownGene right now
if "bigGeneDataUrl" in settings:
trackBigDataFiles[track] = settings["bigGeneDataUrl"]
trackTables[track] = [track]
if track=="bacEndPairs":
trackTables[track].add("all_bacends")
return trackLabels, trackTables, trackParents, groups, pseudos, \
trackVis, superTracks, trackBigDataFiles
def htmlHeader():
" print start of page "
- print """
+ print("""
' % getGbdbDir()
+ print('Locally mirrored tracks are faster to browse than tracks that are accessed through the internet.
')
+ print('Select any number of tracks from the list below and click "Download" when finished.
')
+ print('The data will be downloaded from the UCSC servers with rsync and copied to the local mysql database and %s.
' % getGbdbDir()) htmlStats(localSizes, gbdbSizes, tableSizes) - print '
' + print('')
+ print('' % db)
+ print('')
+ print('')
def downloadCache(url, cacheFname):
    """
    download file from url or open local cached copy. Return list of lines.

    The cache file is TMPDIR+cacheFname. It is only created if the download
    returned data. URLs ending with .gz are gunzipped before they are cached.
    The lines are always str, no matter if they come from the cache or the
    download. Returns an empty list if the server answers with an HTTP error.
    """
    cachePath = TMPDIR+cacheFname
    if isfile(cachePath):
        with open(cachePath, encoding="utf-8", errors="replace") as cacheFh:
            return cacheFh.read().splitlines()

    try:
        with urllib.request.urlopen(url) as response:
            data = response.read()
    except urllib.error.HTTPError:
        # NOTE(review): the end of this string literal was cut by a line break
        # in the reviewed copy of this file; "<br>" is assumed, as the output
        # of this CGI is HTML - confirm against the repository version
        print("info: Could not find %s. bigWig/bigBed/bam files will be skipped.<br>" % url)
        return []

    # urlopen returns bytes in Python 3, so the gzip data cannot be wrapped
    # into an io.StringIO anymore, decompress the bytes directly
    if url.endswith(".gz"):
        data = gzip.decompress(data)

    with open(cachePath, "wb") as cacheFh:
        cacheFh.write(data)

    # decode the same way as when reading the cache file above
    return data.decode("utf-8", errors="replace").splitlines()
def linkTrackToGbdb(fnames, db, tableNames, bigDataFiles):
"""
do some educated guessing on the gbdb files<->track links. returns a dict track -> list of files
@@ -1053,61 +1048,61 @@
#if "liftOver" in fname:
#print fname, "
"
#print "table", table, "
"
orphanFnames = set(fnames) - set(assignedFnames)
for fname in sorted(orphanFnames):
debug("unassigned gbdb file: "+fname)
#print assignedFnames
misassignedTables = set(tableFiles) - set(tableNames)
for table in sorted(misassignedTables):
debug("not existing table: "+table)
debug("for files: "+",".join(tableFiles[table]))
# add the bigDataUrl files from trackDb
- for trackName, bigDataFile in bigDataFiles.iteritems():
+ for trackName, bigDataFile in bigDataFiles.items():
fname = bigDataFile.replace("/gbdb/%s/" % db, "")
tableFiles[trackName].append(fname)
# also add indexes for vcf or bam files
tbiFname = fname+".tbi"
baiFname = fname+".bai"
if tbiFname in fnames:
tableFiles[trackName].append(tbiFname)
if baiFname in fnames:
tableFiles[trackName].append(baiFname)
return tableFiles
def getRsyncSizes(dataType, db):
"""
debug("rsync sizes")
if dataType is "mysql": return dict with tableName:size for given db (includes indexes, frm + data)
if dataType is "gbdb": return dict with filename:size for given db
"""
# run rsync command
tmpFname = TMPDIR+"%s_%s.rsync.txt" % (dataType, db)
if not isfile(tmpFname):
cmd = "rsync -nav rsync://hgdownload.soe.ucsc.edu/%s/%s/ > %s.new && mv %s.new %s" % (dataType, db, tmpFname, tmpFname, tmpFname)
ret = runCmd(cmd, mustRun=False)
if ret!=0:
- print "
"
- print "Cannot run %s
" % cmd
- print "
")
+ print("Cannot run %s
" % cmd)
+ print("
" - print "
" + print("rsync download commands:") if jobsAreDone(): - print 'Back to Genome Browser") + print("
") if not isfile(jobFname): - print "This download job is not active anymore." + print("" - print 'Start a new download' + print("This download job is not active anymore.
") + print('Start a new download') sys.exit(0) lines = open(jobFname).read().splitlines() for line in lines[-30:]: - print line + print(line) - print "
' - print 'Download more tracks' + print('Back to Genome Browser
')
+ print('Download more tracks')
else:
- print 'Downloading files... Page will reload every 4 seconds until download is complete.'
- print 'Cancel download now'
+ print('Downloading files... Page will reload every 4 seconds until download is complete.')
+ print('Cancel download now')
refreshPage("jobId=%d" % jobId, delay=4000, addNote=False)
def removeSomeTracksFromSearch(conn):
    """
    delete the rows of all tables listed in REMOVESEARCH from the hgFindSpec
    table. conn is an open mysql connection. Queries that fail with a
    ProgrammingError are skipped.
    """
    cur = conn.cursor()
    try:
        for table in REMOVESEARCH:
            try:
                # let the driver quote the table name
                cur.execute("DELETE FROM hgFindSpec WHERE searchTable=%s;", (table,))
            except pymysql.err.ProgrammingError:
                # this file imports pymysql, not MySQLdb, so the exception
                # class has to come from pymysql. Best effort, as before.
                pass
    finally:
        cur.close()
def hideSomeTracks(db, conn, trackNames):
    """
    hide some notoriously slow tracks by default

    Sets visibility=0 in the trackDb table for all tracks in trackNames that
    are either in FORCEHIDE or have a name that starts with "multiz" or "cons"
    and ends with "way". conn is an open mysql connection, db is only used
    for the error message. Does nothing if trackDb cannot be queried.
    """
    if not mysqlDbLoaded:
        print("warning: cannot hide some tracks, module mysqldb not installed")
        return

    # do we need this?
    # if "trackDb" not in localTables:
        #return

    # find all conservation tracks and the tracks that are always hidden
    hideTracks = []
    for t in trackNames:
        isConsTrack = t.startswith(("multiz", "cons")) and t.endswith("way")
        if isConsTrack or t in FORCEHIDE:
            hideTracks.append(t)

    # "tableName in ()" is not valid SQL, so stop here if there is nothing to hide
    if len(hideTracks)==0:
        return

    hideStr = "(%s)" % ", ".join(["'"+s+"'" for s in hideTracks])
    query = "UPDATE trackDb SET visibility=0 WHERE tableName in %s" % hideStr

    cur = conn.cursor()
    try:
        # check that there is a trackDb table that can be queried at all
        try:
            cur.execute("SELECT 1 FROM trackDb LIMIT 1;")
        except Exception:
            return

        try:
            cur.execute(query)
        except Exception:
            print("could not execute query %s in database %s" % (query, db))
    finally:
        # also close the cursor on the early return above
        cur.close()
def getGbdbDir():
    " return the gbdbLoc1 setting of hg.conf, default /gbdb, with trailing slashes removed "
    hgConf = parseHgConf()
    return hgConf.get("gbdbLoc1", "/gbdb").rstrip("/")
def getLocalSizes(db):
    """
    return a dict with table name -> total size of a mysql directory and filename -> size for the gbdb dir
    (gbdb filenames are relative to the gbdb/db directory)

    The dict is a defaultdict(int). It is empty when running at UCSC or when
    the mysql directory of db cannot be listed.
    """
    sizes = defaultdict(int)
    if runningAtUcsc():
        return sizes

    path = join(MYSQLDIR, db)
    # the shell redirects the output of ls into the temp file, it is removed
    # when the with-block is left
    with tempfile.NamedTemporaryFile() as sqlFh:
        sqlFname = sqlFh.name
        cmd = 'sudo -u mysql /bin/ls -l %s > %s' % (path, sqlFname)
        ret = runCmd(cmd, mustRun=False)
        if ret!=0:
            # NOTE(review): there was an "info: cannot read file sizes" message
            # here, but it was only printed if runningAtUcsc() is true, which
            # cannot happen after the early return above. Probably
            # "not runningAtUcsc()" was meant - confirm before adding it back.
            return sizes

        with open(sqlFname) as lsFh:
            for line in lsFh:
                fields = line.strip().split()
                # the size is the fifth field of "ls -l", this also skips the "total" line
                if len(fields)<5:
                    continue
                size = int(fields[4])
                fname = fields[-1]
                # all files of a table (data, index, frm) have the same basename
                fileNoExt = splitext(basename(fname))[0]
                sizes[fileNoExt] += size

    if len(sizes)==0:
        print("(warning: local directory %s seems to be empty)" % path)

    gbdbDir = join(getGbdbDir(), db)
    if isdir(gbdbDir):
        # get the size of all gbdb files
        with tempfile.NamedTemporaryFile() as gbdbFh:
            gbdbFname = gbdbFh.name
            # trailing slash is important
            cmd = 'find %s/ -type f > %s ' % (gbdbDir, gbdbFname)
            runCmd(cmd)
            with open(gbdbFname) as findFh:
                fnames = findFh.read().splitlines()

        for fname in fnames:
            relName = fname.replace(gbdbDir+"/", "")
            sizes[relName] = getsize(fname)
    return sizes
@@ -1401,61 +1396,61 @@
runCmd(cmd)
def checkGbdbMysqlAccess():
""" check if we have write access to gbdb and mysql dir and can run the at command """
msg = """
www-data ALL = (mysql:mysql) NOPASSWD: /usr/local/bin/udr,/bin/ls,/usr/bin/rsync,/bin/rm www-data ALL = (root:root) NOPASSWD: /bin/mkdir /data/gbdb www-data ALL = (root:root) NOPASSWD: /bin/chown www-data.www-data /data/gbdb""" # check if we can write to gbdb tmpFname = "%s/test.tmp" % getGbdbDir() try: open(tmpFname, "w") except IOError: - print "This program cannot write to the %s directory. Please make sure that the apache user has permission to write to %s
" % locals())
+ print(("Total size at UCSC: mysql tables %(tableTotal)s, gbdb files %(gbdbTotal)s
" % locals()))
+ print(("Total size of tables and gbdb on local disk: %(locTotal)s
" % locals())) def addPredefined(trackVis, superTracks, groupList, groupToTopTracks, trackLabels, \ trackChildren, tableToGbdbFiles, trackTables, tableSizes, revokedTracks): """ add special predefined track groups in place, modifies the last five parameters These groups are special "tracks" that don't exist, their children are the real tracks """ # get all track that are not hidden defaultTracks = [] - for track, vis in trackVis.iteritems(): + for track, vis in trackVis.items(): if vis!="hide": defaultTracks.append(track) # go down the hierarchy and let "hide" on higher levels override the lower ones # remove all tracks that are somehow hidden by higher tracks from the first list defaultTracks = set(defaultTracks) - for topTracks in groupToTopTracks.values(): + for topTracks in list(groupToTopTracks.values()): for topTrack in topTracks: if trackVis.get(topTrack, None)=="hide": for child in getAllChildren(topTrack, trackChildren): if child in defaultTracks: defaultTracks.remove(child) # also remove all superTracks from this list, otherwise will pull in all of encode again # somehow visibility/inheritance works this way defaultTracks = defaultTracks-set(superTracks) # need lists, not sets defaultTracks = list(defaultTracks) debug(",".join(defaultTracks)) # create the two other sets of tracks, based on defaultTracks: defaultNoCons = [] for t in defaultTracks: if "multiz" not in t and not t.startswith("cons"): defaultNoCons.append(t) nonEncode = [] for track in trackVis: if track in defaultTracks or not track.startswith("wgEncode"): nonEncode.append(track) #nonEncAllTables = [t for t in tableSizes.keys() if not t.startswith("wgEncode")] # create a set of table names that are not linked to any trackDb entry # e.g. gbExtFile or extFile tables allTrackTables = set() - for tableList in trackTables.values(): # flatten list + for tableList in list(trackTables.values()): # flatten list allTrackTables.update(tableList) #print "