src/hg/encode/encodeMkChangeNotes/encodeMkChangeNotes 1.5

1.5 2010/04/02 19:28:23 krish
added more usage message and README
Index: src/hg/encode/encodeMkChangeNotes/encodeMkChangeNotes
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/encodeMkChangeNotes/encodeMkChangeNotes,v
retrieving revision 1.4
retrieving revision 1.5
diff -b -B -U 1000000 -r1.4 -r1.5
--- src/hg/encode/encodeMkChangeNotes/encodeMkChangeNotes	31 Mar 2010 22:55:51 -0000	1.4
+++ src/hg/encode/encodeMkChangeNotes/encodeMkChangeNotes	2 Apr 2010 19:28:23 -0000	1.5
@@ -1,420 +1,420 @@
 #!/bin/env python
 
 import sys
 import optparse
 import os
 import re
 import subprocess
 import md5
 
 #### Classes ###################################################################
 
 #### Functions #################################################################
 
 def get_file_set(path, regexp):
     """collect the names of the entries in directory path that match regexp
     (matching is anchored at the start of the name, as with re.match);
     the result is a set of bare names without the directory part"""
     pattern = re.compile(regexp)
     return set([entry for entry in os.listdir(path) if pattern.match(entry)])
 
 def table_exists(database, table):
     """report whether hgsql can DESCribe table in database, i.e. whether
     the hgsql command exits with status 0; raises ValueError for a table
     name containing a double quote, because the name is placed inside a
     double-quoted shell argument"""
     if '"' in table:
         raise ValueError("table name contains a \"")
     shell_line = "hgsql %s -e \"DESC %s;\"" % (database, table)
     child = subprocess.Popen(shell_line, shell=True,
                              stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
     # drain the pipes and wait for hgsql to finish; its output is not used
     child.communicate()
     return child.returncode == 0
 
 def parse_version(name):
     """split a name such as tableNameV4 into ("tableName", 4); a name
     without a trailing V<digits> suffix is returned whole as version 1"""
     found = re.match("(.*)V([0-9]+)$", name)
     if found is None:
         return name, 1
     stem, digits = found.groups()
     return stem, int(digits)
 
 def prev_version(name, version):
     """build the versioned name that precedes name/version, e.g. ("t", 3)
     gives "tV2"; raises ValueError for version 1, which has no predecessor"""
     if version != 1:
         return "%sV%d" % (name, version - 1)
     raise ValueError("there is no previous version of table %s" % name)
 
 def next_version(name, version):
     """build the versioned name that follows name/version, e.g. ("t", 3) gives "tV4" """
     successor = version + 1
     return "%sV%d" % (name, successor)
 
 def get_wib_pathname(database, table_name):
     """look up the wib pathname recorded in a wiggle table: runs hgsql to
     select the file column of one row of table_name and returns hgsql's
     stdout with trailing whitespace stripped when hgsql exits with
     status 0, otherwise None"""
     sql = "SELECT file FROM %s LIMIT 1;" % table_name
     shell_line = "hgsql %s --skip-column-names -e \"%s;\"" % (database, sql)
     child = subprocess.Popen(shell_line, shell=True,
                              stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
     captured = child.communicate()[0]
     if child.returncode != 0:
         return None
     return captured.rstrip()
 
 def _file_md5(path, chunk_size=1 << 20):
     """return the binary MD5 digest of the file at path, reading it in
     binary mode chunk_size bytes at a time so large files are never held
     in memory whole; the file is always closed, even on error"""
     try:
         from hashlib import md5 as new_md5     # Python >= 2.5
     except ImportError:
         from md5 import new as new_md5         # older interpreters
     digest = new_md5()
     handle = open(path, "rb")
     try:
         while True:
             chunk = handle.read(chunk_size)
             if not chunk:
                 break
             digest.update(chunk)
     finally:
         handle.close()
     return digest.digest()
 
 def same_file(x, y):
     """checks if two files are the same with various definitions of same
     returns True when x and y are the same file according to
     os.path.samefile, False when their sizes differ, and otherwise
     whether their MD5 digests are equal; os.path.samefile and
     os.path.getsize raise OSError when a path cannot be stat'ed"""
     if os.path.samefile(x, y):
         return True     # same inode
     if os.path.getsize(x) != os.path.getsize(y):
         return False    # different sizes, no need to read the contents
     # same size: fall back to comparing content digests
     return _file_md5(x) == _file_md5(y)
 
 #### Main ######################################################################
 
 def main(argv=None):
     """ Generate a human readable file describing the changes between two
         releases of an ENCODE track.
     """
     if argv is None: argv = sys.argv
     # parse the args
     parser = optparse.OptionParser(usage="%prog [options] database current_release (prev_release|-)",
         version="%prog 0.9")
     parser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False)
     parser.add_option("-t", "--composite-name", dest="composite_name",
         help="the name of the composite track by default this is the name of the current directory",
         metavar="N", default=None)
     parser.add_option("-n", "--track-name", dest="track_name",
-        help="the English name of track", metavar="N", default=None)
+        help="the English name of track, by default this is \"ENCODE [composite-name]\"", metavar="N", default=None)
     parser.add_option("--files", dest="files_path", help="dump list of new files to F", metavar="F")
     parser.add_option("--tables", dest="tables_path", help="dump list of new tablesto F", metavar="F")
 
     global options
     (options, args) = parser.parse_args()
 
     # output a usage message
     if len(args) != 3:
         parser.print_help()
         sys.exit(10)
 
     # default track name is the current direcotry name
     if options.composite_name == None:
         options.composite_name = os.path.basename(os.getcwd())
 
     # default composite name is "ENCODE composite_name"
     if options.track_name == None:
         options.track_name = "ENCODE %s" % options.composite_name
 
     # get the positional args
     database = args[0]
     current_release_dir = args[1]
     prev_release_dir = args[2]
 
     # some re we will be using
     table_and_file = re.compile("^(narrowPeaks|narrowPeak|broadPeak|gtf|bedGraph\d+|bed\d+)$")
     wig = re.compile("^(wig)$")
     file_only = re.compile("^(tagAlign|fastq|fasta|rpkm|bowtie|psl)$")
 
     # if new relase, add the full path to all files
     path_prefix = ""
     if prev_release_dir == "-":
         path_prefix = "/usr/local/apache/htdocs/goldenPath/%s/encodeDCC/%s/%s/" \
             % (database, options.composite_name, current_release_dir)
 
     # generate the list of files
     current_files = get_file_set(current_release_dir, ".*\.gz$")
     if prev_release_dir == "-":
         prev_files = set()
         prev_release_dir = ""
     else:
         prev_files = get_file_set(prev_release_dir, ".*\.gz$")
 
     # form the three derived sets
     removed_files = prev_files - current_files
     unchanged_files = current_files & prev_files
     new_files = current_files - prev_files
 
     # warnings
     warnings = []
 
     # the list of files that we'll be printing
     unchanged_tables_list = []
     unchanged_files_list  = []
     unchanged_wibs_list  = []
     removed_tables_list = []
     removed_files_list  = []
     removed_wibs_list  = []
     new_tables_list = []
     new_files_list  = []
     new_wibs_list  = []
 
     # process the list of unchanged files
     for f in unchanged_files:
         name, type, extension = f.split(".")
         assert extension == "gz"
 
         # we don't deal with revisions yet
         stem, version = parse_version(name)
         if next_version(stem, version) in new_files:
             raise ValueError, "Unimplemented: newer version of %s found, can't deal with this yet" % name
         # check to make sure the files are really the same
         if not same_file(os.path.join(current_release_dir, f), os.path.join(prev_release_dir, f)):
             warn = "file %s in %s and %s don't appear to be the same (type=%s)" % \
                 (name, current_release_dir, prev_release_dir, type)
             warnings.append(warn)
             print >>sys.stderr, warn
         if wig.match(type):
             wib_path = get_wib_pathname(database, name)
             if not os.path.exists(wib_path):
                 warn = "could not find %s wib file for wig %s" % (wib_path, f)
                 warnings.append(warn)
                 print >>sys.stderr, warn
             if not table_exists(database, name):
                 warn = "table %s does not exist, from filetype %s" % (name, type)
                 warnings.append(warn)
                 print >>sys.stderr, warn
             unchanged_tables_list.append(name)
             unchanged_files_list.append(path_prefix + f)
             unchanged_wibs_list.append(wib_path)
         elif table_and_file.match(type):
             if not table_exists(database, name):
                 warn = "table %s does not exist, from filetype %s" % (name, type)
                 warnings.append(warn)
                 print >>sys.stderr, warn
             unchanged_tables_list.append(name)
             unchanged_files_list.append(path_prefix + f)
         elif file_only.match(type):
             unchanged_files_list.append(path_prefix + f)
         else:
             raise ValueError, "unknown type %s of file %s" % (type, f)
 
     # process the list of removed files
     for f in removed_files:
         name, type, extension = f.split(".")
         assert extension == "gz"
 
         if wig.match(type):
             wib_path = get_wib_pathname(database, name)
             if not os.path.exists(wib_path):
                 warn = "could not find %s wib file for wig %s" % (wib_path, f)
                 warnings.append(warn)
                 print >>sys.stderr, warn
             if not table_exists(database, name):
                 warn = "table %s does not exist, from filetype %s" % (name, type)
                 warnings.append(warn)
                 print >>sys.stderr, warn
             removed_tables_list.append(name)
             removed_files_list.append(path_prefix + f)
             removed_wibs_list.append(wib_path)
         elif table_and_file.match(type):
             if not table_exists(database, name):
                 warn = "table %s does not exist, from filetype %s" % (name, type)
                 warnings.append(warn)
                 print >>sys.stderr, warn
             removed_tables_list.append(name)
             removed_files_list.append(f + path_prefix)
         elif file_only.match(type):
             removed_files_list.append(path_prefix + f)
         else:
             raise ValueError, "unknown type %s of file %s" % (type, f)
 
     # process the list of new files
     for f in new_files:
         name, type, extension = f.split(".")
         assert extension == "gz"
 
         if wig.match(type):
             wib_path = get_wib_pathname(database, name)
             if not os.path.exists(wib_path):
                 warn = "could not find %s wib file for wig %s" % (wib_path, f)
                 warnings.append(warn)
                 print >>sys.stderr, warn
             if not table_exists(database, name):
                 warn = "table %s does not exist, from filetype %s" % (name, type)
                 warnings.append(warn)
                 print >>sys.stderr, warn
             new_tables_list.append(name)
             new_files_list.append(path_prefix + f)
             new_wibs_list.append(wib_path)
         elif table_and_file.match(type):
             if not table_exists(database, name):
                 warn = "table %s does not exist, from filetype %s" % (name, type)
                 warnings.append(warn)
                 print >>sys.stderr, warn
             new_tables_list.append(name)
             new_files_list.append(path_prefix + f)
         elif file_only.match(type):
             new_files_list.append(path_prefix + f)
         else:
             raise ValueError, "unknown type %s of file %s" % (type, f)
 
     # output some basic stats
     if options.verbose:
         print >>sys.stderr, "Counts:"
         print >>sys.stderr, "  unchanged tables: %d" % len(unchanged_tables_list)
         print >>sys.stderr, "  unchanged files: %d" % len(unchanged_files_list)
         print >>sys.stderr, "  unchanged wibs: %d" % len(unchanged_wibs_list)
         print >>sys.stderr, "  removed tables: %d" % len(removed_tables_list)
         print >>sys.stderr, "  removed files: %d" % len(removed_files_list)
         print >>sys.stderr, "  removed wibs: %d" % len(removed_wibs_list)
         print >>sys.stderr, "  new tables: %d" % len(new_tables_list)
         print >>sys.stderr, "  new files: %d" % len(new_files_list)
         print >>sys.stderr, "  new wibs: %d" % len(new_wibs_list)
 
     # if asked, save the list of new files
     if options.files_path:
         new_files_file = open(options.files_path, "w")
         new_files_list.sort()
         for i in new_files_list:
             print >>new_files_file, i
         print >>new_files_file
         new_wibs_list.sort()
         for i in new_wibs_list:
             print >>new_files_file, i
         new_files_file.close()
 
     # if asked, generate list of new tables
     if options.tables_path:
         new_tables_file = open(options.tables_path, "w")
         new_tables_list.sort()
         for i in new_tables_list:
             print >>new_tables_file, i
         print >>new_tables_file
         new_tables_file.close()
 
     # generate the header
     print "# generated with %s" % parser.get_version()
     print "This is a %s of the \"%s\"" % (current_release_dir, options.track_name)
     print "The composite track is %s" % options.composite_name
     
     if len(warnings) > 0:
         warn_header = "# WARNINGS "
         print warn_header, "#" * len(warn_header)
         c = 1
         for w in warnings:
             print  "%0d - %s" % (c, w)
             c += 1
         print "#" * 60
 
     print """
 Categories of tables and files('):
 A) Untouched - are on public browser and should remain
 B) Deprecated - are currently on RR but will no longer be needed and should not be referenced by the public site.
    NOTE: NO FILES SHOULD BE REMOVED from the downloads directory on hgdownloads (RR).  
    This list is provided for completeness. Any files marked here as in gbdb may be eliminated.
 C) New - are only currently on test but will need to be pushed to the RR.
 D) Additional items of note
 """
     
     # some summary counts of current files, i.e. new + untouched
     print "Summary total counts for %s (new+untouched):" % current_release_dir
     print "    Tables: %d" % (len(unchanged_tables_list) + len(new_tables_list))
     print "    Files: %d" % (len(unchanged_files_list) + len(new_files_list))
     print "    Wibs: %d" % (len(unchanged_wibs_list) + len(new_wibs_list))
     print
 
     # untouched list
     print "A) Untouched Tables (%d):" % len(unchanged_tables_list)
     unchanged_tables_list.sort()
     for i in unchanged_tables_list:
         print i
     print
 
     print "A') Untouched Files (%d downloadables, %d wibs):" % (len(unchanged_files_list),
                                                            len(unchanged_wibs_list))
     if prev_release_dir == "":
         assert len(unchanged_files_list) == 0
         assert len(unchanged_wibs_list) == 0
     else:
         print "    current location on alpha:"
         print "      /usr/local/apache/htdocs/goldenPath/%s/encodeDCC/%s/%s/" % (database, options.composite_name, current_release_dir)
         print "    on RR:"
         print "      /usr/local/apache/htdocs/goldenPath/%s/encodeDCC/%s/" % (database, options.composite_name)
         print
         unchanged_files_list.sort()
         for i in unchanged_files_list:
             print i
         print
         unchanged_wibs_list.sort()
         for i in unchanged_wibs_list:
             print i
     print
 
     # eliminated list
     print "B) Deprecated tables (%d):" % len(removed_tables_list)
     removed_tables_list.sort()
     for i in removed_tables_list:
         print i
     print
 
     print "B') Deprecated files (%d downloadables, %d wibs):" %(len(removed_files_list),
                                                                 len(removed_wibs_list))
     if prev_release_dir == "":
         assert len(removed_files_list) == 0
         assert len(removed_wibs_list) == 0
     else:
         print "    NOTE: NO FILES SHOULD BE REMOVED from the downloads directory on hgdownloads (RR)."
         print "    This list is provided for completeness. Any files marked here as in gbdb may be eliminated."
         print "    current location on alpha:"
         print "      /usr/local/apache/htdocs/goldenPath/%s/encodeDCC/%s/%s/" % (database, options.composite_name, prev_release_dir)
         print "    on RR:"
         print "      /usr/local/apache/htdocs/goldenPath/%s/encodeDCC/%s/" % (database, options.composite_name)
         print
 
         removed_files_list.sort()
         for i in removed_files_list:
             print i
         print
         removed_wibs_list.sort()
         for i in removed_wibs_list:
             print i
 
     print
 
     # new list
     print "C) New tables (%d):" % len(new_tables_list)
     new_tables_list.sort()
     for i in new_tables_list:
         print i
     print
 
     print "C') New files (%d downloadables, %d wibs):" % (len(new_files_list),
                                                          len(new_wibs_list))
     print "    current location on alpha:"
     print "      /usr/local/apache/htdocs/goldenPath/%s/encodeDCC/%s/%s/" % (database, options.composite_name, current_release_dir)
     print "    NOT on RR but must be placed in:"
     print "      /usr/local/apache/htdocs/goldenPath/%s/encodeDCC/%s/" % (database, options.composite_name)
     print
 
     new_files_list.sort()
     for i in new_files_list:
         print i
     print
     new_wibs_list.sort()
     for i in new_wibs_list:
         print i
 
     print
 
     print "D) Additional items:"
     print "    current location on alpha:"
     print "      /usr/local/apache/htdocs/goldenPath/%s/encodeDCC/%s/%s/" % (database, options.composite_name, current_release_dir)
     print "    should be placed on the RR in (overwritting any existing copy):"
     print "      /usr/local/apache/htdocs/goldenPath/%s/encodeDCC/%s/" % (database, options.composite_name)
     print
     print "index.html"
     print "files.txt"
     print "md5sum.txt"
 
 #### Module ####################################################################
 
 if __name__ == "__main__":
     # run the tool and hand main's return value to the interpreter as the exit status
     exit_status = main()
     sys.exit(exit_status)