src/hg/encode/encodeMkChangeNotes/encodeMkChangeNotes 1.5
1.5 2010/04/02 19:28:23 krish
added more usage message and README
Index: src/hg/encode/encodeMkChangeNotes/encodeMkChangeNotes
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/encodeMkChangeNotes/encodeMkChangeNotes,v
retrieving revision 1.4
retrieving revision 1.5
diff -b -B -U 1000000 -r1.4 -r1.5
--- src/hg/encode/encodeMkChangeNotes/encodeMkChangeNotes 31 Mar 2010 22:55:51 -0000 1.4
+++ src/hg/encode/encodeMkChangeNotes/encodeMkChangeNotes 2 Apr 2010 19:28:23 -0000 1.5
@@ -1,420 +1,420 @@
#!/bin/env python
import sys
import optparse
import os
import re
import subprocess
import md5
#### Classes ###################################################################
#### Functions #################################################################
def get_file_set(path, regexp):
    """Return the set of directory entries in path whose names match regexp.

    path   -- directory to list (passed to os.listdir)
    regexp -- regular expression source; it is applied with re.match, so it
              is anchored at the start of the entry name only

    Only the bare entry names are returned, not full paths.
    Raises OSError if the directory cannot be listed.
    """
    expression = re.compile(regexp)
    # 'entry' rather than 'file': the latter shadows a Python 2 builtin
    return set(entry for entry in os.listdir(path) if expression.match(entry))
def table_exists(database, table):
    """Check whether a table exists in a database.

    Runs hgsql with the statement "DESC <table>;" against the given database
    and reports whether the command exited with status 0.

    database -- database name handed to hgsql
    table    -- table name to describe

    Returns True when hgsql exits with status 0, False otherwise (including
    when the hgsql executable cannot be started).
    Raises ValueError if the table name contains a double quote.
    """
    if "\"" in table:
        raise ValueError("table name contains a \"")
    # Hand hgsql an argument list rather than a shell command line, so that
    # neither the database nor the table name is ever interpreted by a shell.
    command = ["hgsql", database, "-e", "DESC %s;" % table]
    try:
        proc = subprocess.Popen(command,
                                stdout=subprocess.PIPE,
                                stdin=subprocess.PIPE,
                                stderr=subprocess.PIPE)
    except OSError:
        # hgsql could not be executed at all; when run through a shell this
        # surfaced as a non-zero exit status, i.e. "does not exist".
        return False
    proc.communicate()
    return proc.returncode == 0
def parse_version(name):
    """Split a name such as tableNameV4 into its stem and version number.

    Returns a (stem, version) tuple.  A name that does not end in "V"
    followed by digits is returned unchanged with version 1.
    """
    match = re.match("(.*)V([0-9]+)$", name)
    if match is None:
        # no explicit version suffix: this is the first version
        return name, 1
    return match.group(1), int(match.group(2))
def prev_version(name, version):
    """Return the versioned name preceding a name/version pair.

    name    -- the stem, without any version suffix
    version -- the current version number

    Returns "<name>V<version - 1>".
    Raises ValueError when version is 1, since nothing precedes it.
    """
    if version == 1:
        raise ValueError("there is no previous version of table %s" % name)
    return "%sV%d" % (name, version - 1)
def next_version(name, version):
    """Build the versioned name that follows the given name/version pair."""
    successor = version + 1
    return "%sV%d" % (name, successor)
def get_wib_pathname(database, table_name):
    """Look up the wib pathname recorded in a wiggle table.

    Runs "SELECT file FROM <table_name> LIMIT 1;" through hgsql (with
    --skip-column-names) against the given database.

    Returns hgsql's standard output with trailing whitespace stripped when
    the command exits with status 0, or None when it fails (including when
    the hgsql executable cannot be started).
    """
    query = "SELECT file FROM %s LIMIT 1;" % table_name
    # Hand hgsql an argument list rather than a shell command line, so that
    # neither the database nor the table name is ever interpreted by a shell.
    command = ["hgsql", database, "--skip-column-names", "-e", query]
    try:
        proc = subprocess.Popen(command,
                                stdout=subprocess.PIPE,
                                stdin=subprocess.PIPE,
                                stderr=subprocess.PIPE)
    except OSError:
        # hgsql could not be executed at all; when run through a shell this
        # surfaced as a non-zero exit status, i.e. None.
        return None
    output = proc.communicate()[0]
    if proc.returncode != 0:
        return None
    return output.rstrip()
def _md5_digest(path, chunk_size=1 << 20):
    """Return the raw MD5 digest of the file at path, reading it in chunks."""
    try:
        from hashlib import md5 as new_md5
    except ImportError:
        # Python < 2.5 only ships the old md5 module
        from md5 import new as new_md5
    digest = new_md5()
    # binary mode so the bytes hashed are exactly the bytes on disk
    handle = open(path, "rb")
    try:
        while True:
            chunk = handle.read(chunk_size)
            if not chunk:
                break
            digest.update(chunk)
    finally:
        handle.close()
    return digest.digest()


def same_file(x, y):
    """Check whether two paths hold the same file, by increasingly costly tests.

    x, y -- paths of the files to compare

    Returns True if both paths refer to the same file according to
    os.path.samefile, False if their sizes differ, and otherwise whether
    their MD5 digests are equal.
    Raises OSError (or IOError) if either file cannot be examined or read.
    """
    if os.path.samefile(x, y):
        return True  # same inode
    if os.path.getsize(x) != os.path.getsize(y):
        return False  # different sizes
    # same size: fall back to comparing content digests
    return _md5_digest(x) == _md5_digest(y)
#### Main ######################################################################
# Patterns applied to the <type> component of a <name>.<type>.gz file name.
# types that have both a downloadable file and a database table
_TABLE_AND_FILE_TYPES = re.compile(r"^(narrowPeaks|narrowPeak|broadPeak|gtf|bedGraph\d+|bed\d+)$")
# wiggle: downloadable file, database table and a wib file
_WIG_TYPES = re.compile(r"^(wig)$")
# types that only have a downloadable file
_FILE_ONLY_TYPES = re.compile(r"^(tagAlign|fastq|fasta|rpkm|bowtie|psl)$")

# explanatory text printed after the header of the notes
_CATEGORIES_TEXT = """
Categories of tables and files('):
A) Untouched - are on public browser and should remain
B) Deprecated - are currently on RR but will no longer be needed and should not be referenced by the public site.
NOTE: NO FILES SHOULD BE REMOVED from the downloads directory on hgdownloads (RR).
This list is provided for completeness. Any files marked here as in gbdb may be eliminated.
C) New - are only currently on test but will need to be pushed to the RR.
D) Additional items of note
"""


def _emit(text="", stream=None):
    """Write text followed by a newline to stream (stdout by default)."""
    if stream is None:
        stream = sys.stdout
    stream.write("%s\n" % (text,))


def _emit_sorted(items, stream=None):
    """Sort items in place, write one per line, then write a blank line."""
    items.sort()
    for item in items:
        _emit(item, stream)
    _emit(stream=stream)


def _warn(warnings, message):
    """Record a warning for the notes and echo it to stderr."""
    warnings.append(message)
    _emit(message, sys.stderr)


def _split_release_file(filename):
    """Split a <name>.<type>.gz file name into (name, type) or raise ValueError."""
    parts = filename.split(".")
    if len(parts) != 3 or parts[2] != "gz":
        raise ValueError("unexpected file name %s, expected <name>.<type>.gz" % filename)
    return parts[0], parts[1]


def _classify_file(f, name, file_type, database, path_prefix,
                   tables_list, files_list, wibs_list, warnings):
    """File one release file under the table/file/wib lists for its type.

    Warns (without failing) when a wib file or database table that the file
    type implies cannot be found.  Raises ValueError for an unknown type.
    """
    if _WIG_TYPES.match(file_type):
        wib_path = get_wib_pathname(database, name)
        # get_wib_pathname returns None when the lookup itself failed;
        # os.path.exists must not be handed None
        if wib_path is None or not os.path.exists(wib_path):
            _warn(warnings, "could not find %s wib file for wig %s" % (wib_path, f))
        if not table_exists(database, name):
            _warn(warnings, "table %s does not exist, from filetype %s" % (name, file_type))
        tables_list.append(name)
        files_list.append(path_prefix + f)
        if wib_path is not None:
            # an unknown wib path has already been warned about; listing
            # a literal "None" among the wib files would only mislead
            wibs_list.append(wib_path)
    elif _TABLE_AND_FILE_TYPES.match(file_type):
        if not table_exists(database, name):
            _warn(warnings, "table %s does not exist, from filetype %s" % (name, file_type))
        tables_list.append(name)
        files_list.append(path_prefix + f)
    elif _FILE_ONLY_TYPES.match(file_type):
        files_list.append(path_prefix + f)
    else:
        raise ValueError("unknown type %s of file %s" % (file_type, f))


def main(argv=None):
    """Generate a human readable file describing the changes between two
    releases of an ENCODE track.

    argv -- full argument vector including the program name; defaults to
            sys.argv.  The positional arguments are the database, the
            current release directory and the previous release directory
            ("-" when there is no previous release).

    The notes are written to stdout and every warning is also echoed to
    stderr.  With --files / --tables the new files (plus wibs) and the new
    tables are additionally dumped to the given paths.  Sets the module
    global 'options' to the parsed options.

    Exits with status 10, after printing the help text, unless exactly
    three positional arguments are given.  Raises ValueError for a release
    file whose name or type is not understood.
    """
    global options
    if argv is None:
        argv = sys.argv

    # parse the args
    parser = optparse.OptionParser(usage="%prog [options] database current_release (prev_release|-)",
                                   version="%prog 0.9")
    parser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False)
    parser.add_option("-t", "--composite-name", dest="composite_name",
                      help="the name of the composite track by default this is the name of the current directory",
                      metavar="N", default=None)
    parser.add_option("-n", "--track-name", dest="track_name",
                      help="the English name of track, by default this is \"ENCODE [composite-name]\"", metavar="N", default=None)
    parser.add_option("--files", dest="files_path", help="dump list of new files to F", metavar="F")
    parser.add_option("--tables", dest="tables_path", help="dump list of new tablesto F", metavar="F")
    # honour the argv handed in by the caller instead of always reading sys.argv
    (options, args) = parser.parse_args(argv[1:])

    # output a usage message
    if len(args) != 3:
        parser.print_help()
        sys.exit(10)

    # default composite name is the current directory name
    if options.composite_name is None:
        options.composite_name = os.path.basename(os.getcwd())
    # default track name is "ENCODE composite_name"
    if options.track_name is None:
        options.track_name = "ENCODE %s" % options.composite_name

    # get the positional args
    database, current_release_dir, prev_release_dir = args

    # download locations that the notes refer to
    rr_dir = "/usr/local/apache/htdocs/goldenPath/%s/encodeDCC/%s/" \
             % (database, options.composite_name)
    alpha_dir = "%s%s/" % (rr_dir, current_release_dir)

    # generate the list of files; for a new release, list files by full path
    path_prefix = ""
    current_files = get_file_set(current_release_dir, r".*\.gz$")
    if prev_release_dir == "-":
        path_prefix = alpha_dir
        prev_files = set()
        prev_release_dir = ""
    else:
        prev_files = get_file_set(prev_release_dir, r".*\.gz$")

    # form the three derived sets
    removed_files = prev_files - current_files
    unchanged_files = current_files & prev_files
    new_files = current_files - prev_files

    warnings = []
    # the lists that we'll be printing
    unchanged_tables_list, unchanged_files_list, unchanged_wibs_list = [], [], []
    removed_tables_list, removed_files_list, removed_wibs_list = [], [], []
    new_tables_list, new_files_list, new_wibs_list = [], [], []

    # process the unchanged files (sorted so the warning order is stable)
    for f in sorted(unchanged_files):
        name, file_type = _split_release_file(f)
        # we don't deal with revisions yet
        stem, version = parse_version(name)
        # NOTE(review): new_files holds file names ending in ".gz" while
        # next_version() returns a bare table name, so this test is always
        # false as written -- confirm the intended comparison before
        # relying on it.
        if next_version(stem, version) in new_files:
            raise ValueError("Unimplemented: newer version of %s found, can't deal with this yet" % name)
        # check to make sure the files are really the same
        if not same_file(os.path.join(current_release_dir, f),
                         os.path.join(prev_release_dir, f)):
            _warn(warnings,
                  "file %s in %s and %s don't appear to be the same (type=%s)"
                  % (name, current_release_dir, prev_release_dir, file_type))
        _classify_file(f, name, file_type, database, path_prefix,
                       unchanged_tables_list, unchanged_files_list,
                       unchanged_wibs_list, warnings)

    # process the removed files
    for f in sorted(removed_files):
        name, file_type = _split_release_file(f)
        _classify_file(f, name, file_type, database, path_prefix,
                       removed_tables_list, removed_files_list,
                       removed_wibs_list, warnings)

    # process the new files
    for f in sorted(new_files):
        name, file_type = _split_release_file(f)
        _classify_file(f, name, file_type, database, path_prefix,
                       new_tables_list, new_files_list,
                       new_wibs_list, warnings)

    # output some basic stats
    if options.verbose:
        _emit("Counts:", sys.stderr)
        _emit(" unchanged tables: %d" % len(unchanged_tables_list), sys.stderr)
        _emit(" unchanged files: %d" % len(unchanged_files_list), sys.stderr)
        _emit(" unchanged wibs: %d" % len(unchanged_wibs_list), sys.stderr)
        _emit(" removed tables: %d" % len(removed_tables_list), sys.stderr)
        _emit(" removed files: %d" % len(removed_files_list), sys.stderr)
        _emit(" removed wibs: %d" % len(removed_wibs_list), sys.stderr)
        _emit(" new tables: %d" % len(new_tables_list), sys.stderr)
        _emit(" new files: %d" % len(new_files_list), sys.stderr)
        _emit(" new wibs: %d" % len(new_wibs_list), sys.stderr)

    # if asked, save the list of new files, a blank line, then the new wibs
    if options.files_path:
        new_files_file = open(options.files_path, "w")
        try:
            _emit_sorted(new_files_list, new_files_file)
            new_wibs_list.sort()
            for i in new_wibs_list:
                _emit(i, new_files_file)
        finally:
            new_files_file.close()

    # if asked, generate list of new tables
    if options.tables_path:
        new_tables_file = open(options.tables_path, "w")
        try:
            _emit_sorted(new_tables_list, new_tables_file)
        finally:
            new_tables_file.close()

    # generate the header
    _emit("# generated with %s" % parser.get_version())
    _emit("This is a %s of the \"%s\"" % (current_release_dir, options.track_name))
    _emit("The composite track is %s" % options.composite_name)
    if len(warnings) > 0:
        warn_header = "# WARNINGS "
        _emit("%s %s" % (warn_header, "#" * len(warn_header)))
        for index, w in enumerate(warnings):
            _emit("%0d - %s" % (index + 1, w))
        # NOTE(review): whether this rule closes only the warnings section
        # or is always printed could not be told from the flattened source.
        _emit("#" * 60)
    _emit(_CATEGORIES_TEXT)

    # some summary counts of current files, i.e. new + untouched
    _emit("Summary total counts for %s (new+untouched):" % current_release_dir)
    _emit(" Tables: %d" % (len(unchanged_tables_list) + len(new_tables_list)))
    _emit(" Files: %d" % (len(unchanged_files_list) + len(new_files_list)))
    _emit(" Wibs: %d" % (len(unchanged_wibs_list) + len(new_wibs_list)))
    _emit()

    # untouched list
    _emit("A) Untouched Tables (%d):" % len(unchanged_tables_list))
    _emit_sorted(unchanged_tables_list)
    _emit("A') Untouched Files (%d downloadables, %d wibs):"
          % (len(unchanged_files_list), len(unchanged_wibs_list)))
    if prev_release_dir == "":
        # nothing can be unchanged when there is no previous release
        assert len(unchanged_files_list) == 0
        assert len(unchanged_wibs_list) == 0
    else:
        _emit(" current location on alpha:")
        _emit(" %s" % alpha_dir)
        _emit(" on RR:")
        _emit(" %s" % rr_dir)
        _emit()
    # NOTE(review): the listings are emitted for both branches, mirroring
    # section C; the flattened source does not show their nesting.
    _emit_sorted(unchanged_files_list)
    _emit_sorted(unchanged_wibs_list)

    # eliminated list
    _emit("B) Deprecated tables (%d):" % len(removed_tables_list))
    _emit_sorted(removed_tables_list)
    _emit("B') Deprecated files (%d downloadables, %d wibs):"
          % (len(removed_files_list), len(removed_wibs_list)))
    if prev_release_dir == "":
        # nothing can be removed when there is no previous release
        assert len(removed_files_list) == 0
        assert len(removed_wibs_list) == 0
    else:
        _emit(" NOTE: NO FILES SHOULD BE REMOVED from the downloads directory on hgdownloads (RR).")
        _emit(" This list is provided for completeness. Any files marked here as in gbdb may be eliminated.")
        _emit(" current location on alpha:")
        _emit(" %s%s/" % (rr_dir, prev_release_dir))
        _emit(" on RR:")
        _emit(" %s" % rr_dir)
        _emit()
    _emit_sorted(removed_files_list)
    _emit_sorted(removed_wibs_list)

    # new list
    _emit("C) New tables (%d):" % len(new_tables_list))
    _emit_sorted(new_tables_list)
    _emit("C') New files (%d downloadables, %d wibs):"
          % (len(new_files_list), len(new_wibs_list)))
    _emit(" current location on alpha:")
    _emit(" %s" % alpha_dir)
    _emit(" NOT on RR but must be placed in:")
    _emit(" %s" % rr_dir)
    _emit()
    _emit_sorted(new_files_list)
    _emit_sorted(new_wibs_list)

    # items that always accompany a release
    _emit("D) Additional items:")
    _emit(" current location on alpha:")
    _emit(" %s" % alpha_dir)
    _emit(" should be placed on the RR in (overwritting any existing copy):")
    _emit(" %s" % rr_dir)
    _emit()
    _emit("index.html")
    _emit("files.txt")
    _emit("md5sum.txt")
#### Module ####################################################################
if __name__ == "__main__":
    # run as a script: hand main()'s return value to the shell as exit status
    exit_status = main()
    sys.exit(exit_status)