src/hg/encode/encodeMkChangeNotes/encodeMkChangeNotes 1.1

1.1 2010/03/15 22:58:29 krish
added script to generate changes notes for ENCODE tracks
Index: src/hg/encode/encodeMkChangeNotes/encodeMkChangeNotes
===================================================================
RCS file: src/hg/encode/encodeMkChangeNotes/encodeMkChangeNotes
diff -N src/hg/encode/encodeMkChangeNotes/encodeMkChangeNotes
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/hg/encode/encodeMkChangeNotes/encodeMkChangeNotes	15 Mar 2010 22:58:29 -0000	1.1
@@ -0,0 +1,333 @@
+#!/bin/env python
+
+import sys
+import optparse
+import os
+import re
+import subprocess
+import md5
+
+# NOTES:
+#   fullpath name to ALL files
+#
+
+#### Classes ###################################################################
+
+#### Functions #################################################################
+
+def get_file_set(path, regexp):  # path: directory to list; regexp: pattern string
+    """return a set of the files in the path matching a regexp"""
+    expression = re.compile(regexp)
+    file_set = set()
+    for file in os.listdir(path):  # bare entry names, not joined with path
+        if expression.match(file):  # match() anchors at the start of the name only
+            file_set.add(file)
+    return file_set
+
+def table_exists(database, table):  # True iff the hgsql DESC command exits with status 0
+    """check if a table exists in a database"""
+    if "\"" in table:  # a double quote would end the -e "..." argument built below
+        raise ValueError, "table name contains a \""
+    command = "hgsql %s -e \"DESC %s;\"" % (database, table)  # NOTE(review): run via shell=True; only '"' in table is rejected
+    proc = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stdin=subprocess.PIPE,stderr=subprocess.PIPE)
+    proc.communicate()  # wait for hgsql to finish; its captured output is discarded
+    status_code = proc.returncode;
+    return status_code == 0
+
+def parse_version(name):  # returns (stem, version), e.g. "tableNameV4" -> ("tableName", 4)
+    """parse out a the version in name like tableNameV4"""
+    expression = re.compile("(.*)V([0-9]+)$")  # greedy stem, then "V" and the trailing digits
+    match = expression.match(name)
+    if match == None:
+        return name, 1  # no V<digits> suffix: the whole name is the stem, version 1
+    else:
+        return match.group(1), int(match.group(2))
+
+def prev_version(name, version):  # name: stem without version suffix; version: int
+    """return the prev version of a name/version pair"""
+    if version == 1:  # version 1 is treated as the first; nothing precedes it
+        raise ValueError, "there is no previous version of table %s" % name
+    else:
+        return "%sV%d" % (name, version - 1)  # note: version 2 yields "<name>V1", not the bare name
+
+def next_version(name, version):  # name: stem without version suffix; version: int
+    """return the next version of a name/version pair"""
+    return "%sV%d" % (name, version + 1)  # builds "<name>V<version + 1>"
+
+def get_wib_pathname(database, table_name):  # returns hgsql's stripped stdout, or None on non-zero exit
+    """extract the wib pathaname of a wiggle table"""
+    query = "SELECT file FROM %s LIMIT 1;" % table_name  # only the first row's `file` column
+    command = "hgsql %s --skip-column-names -e \"%s;\"" % (database, query)  # query already ends in ";", so ";;" is sent
+    proc = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE,
+                                                 stdin=subprocess.PIPE,
+                                                 stderr=subprocess.PIPE)
+    name = proc.communicate()[0].rstrip()  # captured stdout with trailing whitespace removed
+    status_code = proc.returncode;
+    if status_code == 0:
+        return name  # NOTE(review): would be "" if hgsql printed nothing; confirm callers cope
+    else:
+        return None  # NOTE(review): main() passes the result to os.path.exists() without a None check
+
+def same_file(x, y):  # x, y: paths of the two files to compare; returns a bool
+    """checks if two files are the same with various definitions of same"""
+    if os.path.samefile(x, y):
+        return True     # same inode
+    else:
+        if os.path.getsize(x) != os.path.getsize(y):
+            return False    # different sizes
+        else:   # now check md5s of the two files
+            md5_x = md5.new()
+            md5_x.update(open(x).read())  # whole file is read into memory; the handle is never explicitly closed
+            md5_y = md5.new()
+            md5_y.update(open(y).read())  # same for the second file
+            return md5_x.digest() == md5_y.digest()
+
+#### Main ######################################################################
+
+def main(argv=None):
+    """ Generate a human readable file describing the changes between two
+        releases of an ENCODE track.
+    """
+    if argv is None: argv = sys.argv  # NOTE(review): argv is not read again below; parse_args() is called without it
+    # parse the args
+    parser = optparse.OptionParser(usage="Usage: %prog [options] database current_release prev_release",
+        version="%prog 0.9")
+    parser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False)
+    parser.add_option("-t", "--track-name", dest="track_name",
+        help="the name of the composite track by default this is the name of the current directory", metavar="N")
+    parser.add_option("-n", "--name", dest="name",
+        help="the English name of track", metavar="N", default="Untitled")
+
+    global options  # the parsed options are published as a module-level global
+    (options, args) = parser.parse_args()
+
+    # output a usage message
+    if len(args) != 3:  # exactly: database, current_release, prev_release
+        parser.print_help()
+        sys.exit(10)  # exit status 10 on a wrong argument count
+
+    if options.track_name == None:
+        options.track_name = os.path.basename(os.getcwd())  # default: name of the current directory
+
+    # get the positional args
+    database = args[0]
+    current_release_dir = args[1]
+    prev_release_dir = args[2]
+
+    # generate the list of files
+    current_files = get_file_set(current_release_dir, ".*\.gz$")  # entry names ending in ".gz"
+    if prev_release_dir == "-":  # "-" means there is no previous release to compare against
+        prev_files = set()
+    else:
+        prev_files = get_file_set(prev_release_dir, ".*\.gz$")
+
+    # form the three derived sets
+    removed_files = prev_files - current_files  # only in the previous release
+    unchanged_files = current_files & prev_files  # same file name in both releases
+    new_files = current_files - prev_files  # only in the current release
+
+    # the list of files that we'll be printing
+    unchanged_tables_list = []
+    unchanged_files_list  = []
+    unchanged_wibs_list  = []
+    removed_tables_list = []
+    removed_files_list  = []
+    removed_wibs_list  = []
+    new_tables_list = []
+    new_files_list  = []
+    new_wibs_list  = []
+
+    # process the list of unchanged files
+    for f in unchanged_files:
+        name, type, extension = f.split(".")  # expects exactly "<name>.<type>.gz"; any other dot count raises ValueError
+        assert extension == "gz"
+
+        # we don't deal with revisions yet
+        stem, version = parse_version(name)
+        if next_version(stem, version) in new_files:  # NOTE(review): new_files holds "*.gz" names, next_version() has no extension; looks like this never matches
+            raise ValueError, "Unimplemented: newer version of %s found, can't deal with this yet" % name
+        # check to make sure the files are really the same
+        if not same_file(os.path.join(current_release_dir, f), os.path.join(prev_release_dir, f)):
+            raise ValueError, "file %s in %s and %s don't appear to be the same" \
+                % (name, current_release_dir, prev_release_dir)
+
+        if type == "wig":  # wig: requires both the wib file and the table
+            wib_path = get_wib_pathname(database, name)  # NOTE(review): may be None if hgsql failed; not checked before os.path.exists()
+            if not os.path.exists(wib_path):
+                raise ValueError, "could not find %s wib file for wig %s" % (wib_path, f)
+            if not table_exists(database, name):
+                raise ValueError, "table %s does not exist, from filetype %s" % (name, type)
+            unchanged_tables_list.append(name)
+            unchanged_files_list.append(f)
+            unchanged_wibs_list.append(wib_path)
+        elif type == "narrowPeak":  # narrowPeak: table only, no wib
+            if not table_exists(database, name):
+                raise ValueError,  "table %s does not exist, from filetype %s" % (name, type)
+            unchanged_tables_list.append(name)
+            unchanged_files_list.append(f)
+        elif type == "tagAlign" or type == "fastq":  # download-only types: no table check
+            unchanged_files_list.append(f)
+        else:
+            raise ValueError, "unknown type %s" % type
+
+    # process the list of removed files
+    for f in removed_files:  # same per-type checks as above, minus the version and same_file checks
+        name, type, extension = f.split(".")
+        assert extension == "gz"
+
+        if type == "wig":
+            wib_path = get_wib_pathname(database, name)  # NOTE(review): may be None, as in the loop above
+            if not os.path.exists(wib_path):
+                raise ValueError, "could not find %s wib file for wig %s" % (wib_path, f)
+            if not table_exists(database, name):
+                raise ValueError, "table %s does not exist, from filetype %s" % (name, type)
+            removed_tables_list.append(name)
+            removed_files_list.append(f)
+            removed_wibs_list.append(wib_path)
+        elif type == "narrowPeak":
+            if not table_exists(database, name):
+                raise ValueError,  "table %s does not exist, from filetype %s" % (name, type)
+            removed_tables_list.append(name)
+            removed_files_list.append(f)
+        elif type == "tagAlign" or type == "fastq":
+            removed_files_list.append(f)
+        else:
+            raise ValueError, "unknown type %s" % type
+
+    # process the list of new files
+    for f in new_files:  # same checks; files are recorded as full alpha paths ("hg18" hard-coded), unlike the bare names above
+        name, type, extension = f.split(".")
+        assert extension == "gz"
+
+        if type == "wig":
+            wib_path = get_wib_pathname(database, name)  # NOTE(review): may be None, as in the loops above
+            if not os.path.exists(wib_path):
+                raise ValueError, "could not find %s wib file for wig %s" % (wib_path, f)
+            if not table_exists(database, name):
+                raise ValueError, "table %s does not exist, from filetype %s" % (name, type)
+            new_tables_list.append(name)
+            new_files_list.append(os.path.join("/usr/local/apache/htdocs/goldenPath/hg18/encodeDCC", options.track_name, current_release_dir, f))
+            new_wibs_list.append(wib_path)
+        elif type == "narrowPeak":
+            if not table_exists(database, name):
+                raise ValueError,  "table %s does not exist, from filetype %s" % (name, type)
+            new_tables_list.append(name)
+            new_files_list.append(os.path.join("/usr/local/apache/htdocs/goldenPath/hg18/encodeDCC", options.track_name, current_release_dir, f))
+        elif type == "tagAlign" or type == "fastq":
+            new_files_list.append(os.path.join("/usr/local/apache/htdocs/goldenPath/hg18/encodeDCC", options.track_name, current_release_dir, f))
+        else:
+            raise ValueError, "unknown type %s" % type
+
+    # output some basic stats
+    if options.verbose:  # counts go to stderr so they stay out of the report on stdout
+        print >>sys.stderr, "Counts:"
+        print >>sys.stderr, "  unchanged tables: %d" % len(unchanged_tables_list)
+        print >>sys.stderr, "  unchanged files: %d" % len(unchanged_files_list)
+        print >>sys.stderr, "  unchanged wibs: %d" % len(unchanged_wibs_list)
+        print >>sys.stderr, "  removed tables: %d" % len(removed_tables_list)
+        print >>sys.stderr, "  removed files: %d" % len(removed_files_list)
+        print >>sys.stderr, "  removed wibs: %d" % len(removed_wibs_list)
+        print >>sys.stderr, "  new tables: %d" % len(new_tables_list)
+        print >>sys.stderr, "  new files: %d" % len(new_files_list)
+        print >>sys.stderr, "  new wibs: %d" % len(new_wibs_list)
+
+    # generate the header
+    print "# Generated with %s %s" % (os.path.basename(sys.argv[0]), options.get_version())  # NOTE(review): get_version() is an OptionParser method; options is the parsed-values object -- confirm
+    print "This is a %s of the \"%s\"" % (current_release_dir, options.name)
+    print "The composite track is %s" % options.track_name
+    print """
+Categories of tables and files('):
+A) Untouched - are on public browser and should remain
+B) Deprecated - are currently on RR but will no longer be needed and should not be referenced by the public site.
+   NOTE: NO FILES SHOULD BE REMOVED from the downloads directory on hgdownloads (RR).  
+   This list is provided for completeness. Any files marked here as in gbdb may be eliminated.
+C) New - are only currently on test but will need to be pushed to the RR.
+D) Additional items of note
+"""
+    # untouched list
+    print "A) Untouched Tables (%d):" % len(unchanged_tables_list)
+    unchanged_tables_list.sort()  # each list is sorted in place just before it is printed
+    for i in unchanged_tables_list:
+        print "    %s" %i
+
+    print "A') Untouched Files (%d downloadables, %d wibs):" % (len(unchanged_files_list),
+                                                           len(unchanged_wibs_list))
+    print "   current location on alpha:"
+    print "     /usr/local/apache/htdocs/goldenPath/hg18/encodeDCC/%s/%s/" % (options.track_name, current_release_dir)
+    print "   on RR:"
+    print "     {...}/goldenPath/encodeDCC/%s/" % options.track_name
+    print
+    unchanged_files_list.sort()
+    for i in unchanged_files_list:
+        print "    %s" %i
+    print
+    unchanged_wibs_list.sort()
+    for i in unchanged_wibs_list:
+        print "    %s" %i
+
+    print
+
+    # eliminated list
+    print "B) Deprecated tables (%d):" % len(removed_tables_list)
+    removed_tables_list.sort()
+    for i in removed_tables_list:
+        print "    %s" %i
+
+    print "B') Deprecated files (%d downloadables, %d wibs):" %(len(removed_files_list),
+                                                                len(removed_wibs_list))
+    print "   NOTE: NO FILES SHOULD BE REMOVED from the downloads directory on hgdownloads (RR)."
+    print "   This list is provided for completeness. Any files marked here as in gbdb may be eliminated."
+    print "   current location on alpha:"
+    print "     /usr/local/apache/htdocs/goldenPath/hg18/encodeDCC/%s/%s/" % (options.track_name, prev_release_dir)  # removed files live in the previous release dir
+    print "   on RR:"
+    print "     {...}/goldenPath/encodeDCC/%s/" % options.track_name
+    print
+
+    removed_files_list.sort()
+    for i in removed_files_list:
+        print "    %s" %i
+    print
+    removed_wibs_list.sort()
+    for i in removed_wibs_list:
+        print "    %s" %i
+
+    print
+
+    # new list
+    print "C) New tables (%d):" % len(new_tables_list)
+    new_tables_list.sort()
+    for i in new_tables_list:
+        print "    %s" %i
+
+    print "C') New files (%d downloadables, %d wibs):" % (len(new_files_list),
+                                                         len(new_wibs_list))
+    print "   current location on alpha:"
+    print "     /usr/local/apache/htdocs/goldenPath/hg18/encodeDCC/%s/%s/" % (options.track_name, current_release_dir)
+    print "   NOT on RR but must be placed in:"
+    print "     {...}/goldenPath/encodeDCC/%s/" % options.track_name
+    print
+
+    new_files_list.sort()
+    for i in new_files_list:  # these entries are the full paths built in the new-files loop
+        print "    %s" %i
+    print
+    new_wibs_list.sort()
+    for i in new_wibs_list:
+        print "    %s" %i
+
+    print
+
+    print "D) Additional items:"  # fixed list of three file names, printed unconditionally
+    print "   current location on alpha:"
+    print "     /usr/local/apache/htdocs/goldenPath/hg18/encodeDCC/%s/%s/" % (options.track_name, current_release_dir)
+    print "   should be placed on the RR in (overwritting any existing copy):"
+    print "     {...}/goldenPath/encodeDCC/%s/" % options.track_name
+    print
+    print "    index.html"
+    print "    files.txt"
+    print "    md5sum.txt"
+
+#### Module ####################################################################
+
+if __name__ == "__main__":  # run only when executed as a script, not when imported
+    sys.exit(main())  # main() has no return statement, so this passes None to sys.exit()