src/hg/encode/encodeMkChangeNotes/encodeMkChangeNotes 1.14

1.14 2010/05/25 21:20:21 krish
added flag to skip md5 sums for faster run time
Index: src/hg/encode/encodeMkChangeNotes/encodeMkChangeNotes
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/encodeMkChangeNotes/encodeMkChangeNotes,v
retrieving revision 1.13
retrieving revision 1.14
diff -b -B -U 1000000 -r1.13 -r1.14
--- src/hg/encode/encodeMkChangeNotes/encodeMkChangeNotes	25 May 2010 17:17:29 -0000	1.13
+++ src/hg/encode/encodeMkChangeNotes/encodeMkChangeNotes	25 May 2010 21:20:21 -0000	1.14
@@ -1,495 +1,502 @@
 #!/bin/env python
 
 import sys
 import optparse
 import os
 import re
 import subprocess
 import md5
 
 #### Classes ###################################################################
 
 #### Functions #################################################################
 
 def get_file_set(path, regexp):
     """Collect the names of the entries of `path` that match `regexp`.
     The pattern is applied with match(), so it is anchored at the start of
     each entry name only.  Returns a set of bare names (no directory part).
     """
     pattern = re.compile(regexp)
     return set(entry for entry in os.listdir(path) if pattern.match(entry))
 
 def table_exists(database, table):
     """Report whether `table` can be described in `database` through hgsql.
     Runs hgsql with the statement "DESC <table>;" and returns True only when
     the command exits with status 0.  Every failure, including hgsql not
     being runnable at all, is reported as False.
     Raises ValueError when the table name contains a double quote.
     """
     if "\"" in table:
         raise ValueError("table name contains a \"")
     # Pass an argument list and no shell, so that neither the database nor
     # the table name can be interpreted as shell syntax.
     command = ["hgsql", database, "-e", "DESC %s;" % table]
     try:
         proc = subprocess.Popen(command, stdout=subprocess.PIPE,
                                 stdin=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
     except OSError:
         # Without a shell a missing executable raises instead of producing a
         # non-zero exit status; keep answering False as before.
         return False
     # Output is discarded; only the exit status matters.
     proc.communicate()
     return proc.returncode == 0
 
 def parse_version(name):
     """Split a name such as tableNameV4 into (stem, version).
     A name without a trailing V<digits> suffix is returned unchanged,
     paired with version 1.
     """
     found = re.match("(.*)V([0-9]+)$", name)
     if found is not None:
         stem, digits = found.groups()
         return stem, int(digits)
     return name, 1
 
 def prev_version(name, version):
     """Return the versioned name that precedes a name/version pair.
     `name` is the unversioned stem and `version` its integer version; the
     result has the form <name>V<version - 1>.
     Raises ValueError when `version` is 1 or lower, because formatting a
     predecessor for those would yield a name such as fooV0 or fooV-1.
     """
     if version <= 1:
         raise ValueError("there is no previous version of table %s" % name)
     return "%sV%d" % (name, version - 1)
 
 def next_version(name, version):
     """Build the name of the version that follows a name/version pair."""
     successor = version + 1
     return "%sV%d" % (name, successor)
 
 def get_gbdb_pathname(database, table_name):
     """Look up the gbdb path name stored in a gbdb table.
     Asks hgsql for the first row of the `file` column and, if that query
     fails, of the `fileName` column.  Returns the output of the first query
     that exits with status 0, stripped of trailing whitespace (this is an
     empty string when the query prints nothing), or None when both fail.
     """
     # The two spellings of the path column are tried in this order.
     for column in ("file", "fileName"):
         query = "SELECT %s FROM %s LIMIT 1;" % (column, table_name)
         # Argument list and no shell: the names cannot be read as shell
         # syntax, and the query keeps a single terminating semicolon.
         command = ["hgsql", database, "--skip-column-names", "-e", query]
         try:
             proc = subprocess.Popen(command, stdout=subprocess.PIPE,
                                     stdin=subprocess.PIPE,
                                     stderr=subprocess.PIPE)
         except OSError:
             # hgsql could not be started; treat it like a failed query.
             continue
         name = proc.communicate()[0].rstrip()
         if proc.returncode == 0:
             return name
     return None
 
-def same_file(x, y):
+def same_file(x, y, do_md5=True):
     """Decide whether the paths x and y hold the same file.
     Returns True when os.path.samefile() says they are the same file and
     False when their sizes differ.  Otherwise the MD5 digests of both files
     are compared, unless do_md5 is false, in which case equally sized files
     are reported as the same without being read.
     """
     if os.path.samefile(x, y):
         return True     # same inode
     else:
         if os.path.getsize(x) != os.path.getsize(y):
             return False    # different sizes
         else:   # now check md5s of the two files
+            if do_md5:
             # NOTE(review): this diff was produced with -b (see the diff
             # command in the header), which presumably hides the
             # re-indentation of the digest code under `if do_md5:` -- confirm
             # against the checked-in revision.
             # Each file is read in blocks of 2**24 bytes until read() returns
             # the empty string.
             # NOTE(review): file_x and file_y are never closed explicitly.
             md5_x = md5.new()
             file_x = open(x)
             block = file_x.read(2**24)
             while block != "":
                 md5_x.update(block)
                 block = file_x.read(2**24)
 
             md5_y = md5.new()
             file_y = open(y)
             block = file_y.read(2**24)
             while block != "":
                 md5_y.update(block)
                 block = file_y.read(2**24)
 
             return md5_x.digest() == md5_y.digest()
+            else:
+                return True
 
 #### Main ######################################################################
 
 def main(argv=None):
     """ Generate a human readable file describing the changes between two
         releases of an ENCODE track.
         Expects three positional command-line arguments: database,
         current_release and prev_release ("-" when there is no previous
         release); prints the usage text and exits with status 10 otherwise.
         The report is printed to stdout, most warnings are also echoed to
         stderr as they are found, and the --files / --tables options
         additionally dump the new files and new tables to the given paths.
     """
     if argv is None: argv = sys.argv
     # parse the args
     parser = optparse.OptionParser(usage="%prog [options] database current_release (prev_release|-)",
         version="%prog 0.9")
     parser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False)
     parser.add_option("-t", "--composite-name", dest="composite_name",
         help="the name of the composite track by default this is the name of the current directory",
         metavar="N", default=None)
     parser.add_option("-n", "--track-name", dest="track_name",
         help="the English name of track, by default this is \"ENCODE [composite-name]\"", metavar="N", default=None)
     parser.add_option("--files", dest="files_path", help="dump list of new files to F", metavar="F")
     parser.add_option("--tables", dest="tables_path", help="dump list of new tablesto F", metavar="F")
+    parser.add_option("--disable-md5-checks", dest="do_md5_checks", action="store_false", help="disable MD5 checks on files", default=True)
 
     # NOTE(review): argv is not handed to parse_args(), so the argv parameter
     # of main() has no effect on what gets parsed.
     global options
     (options, args) = parser.parse_args()
 
     # output a usage message
     if len(args) != 3:
         parser.print_help()
         sys.exit(10)
 
     # default composite name is the current directory name
     if options.composite_name == None:
         options.composite_name = os.path.basename(os.getcwd())
 
     # default track name is "ENCODE composite_name"
     if options.track_name == None:
         options.track_name = "ENCODE %s" % options.composite_name
 
     # get the positional args
     database = args[0]
     current_release_dir = args[1]
     prev_release_dir = args[2]
 
     # list of addition files
     possible_addition_files = ["index.html", "files.txt", "md5sum.txt"]
     addition_files = []
 
     # some re we will be using; they are matched against the middle part of
     # a name.type.gz file name and select how that file is handled below:
     #   table_and_file - download file plus a database table
     #   gbdb           - download file, table, and a gbdb file named by the table
     #   file_only      - download file only
     table_and_file = re.compile("^(narrowPeaks|narrowPeak|broadPeak|gtf|bedGraph\d+|bed\d+|pairedTagAlign)$")
     gbdb = re.compile("^(wig|tagAlign)$")
     file_only = re.compile("^(fastq|fasta|rpkm|bowtie|psl|csqual|csfasta)$")
 
     # if new release, add the full path to all files
     path_prefix = ""
     if prev_release_dir == "-":
         path_prefix = "/usr/local/apache/htdocs-hgdownload/goldenPath/%s/encodeDCC/%s/%s/" \
             % (database, options.composite_name, current_release_dir)
 
     # generate the list of files
     if os.path.exists("gbdb"):
         gbdb_files  = get_file_set("gbdb", ".*")
     else:
         gbdb_files = set()
     current_files = get_file_set(current_release_dir, ".*\.gz$")
     if prev_release_dir == "-":
         # no previous release: from here on it is represented by ""
         prev_files = set()
         prev_release_dir = ""
     else:
         prev_files = get_file_set(prev_release_dir, ".*\.gz$")
 
     # form the three derived sets
     removed_files = prev_files - current_files
     unchanged_files = current_files & prev_files
     new_files = current_files - prev_files
 
     # warnings
     warnings = []
 
     # the list of files that we'll be printing
     unchanged_tables_list = []
     unchanged_files_list  = []
     unchanged_gbdbs_list  = []
     removed_tables_list = []
     removed_files_list  = []
     removed_gbdbs_list  = []
     new_tables_list = []
     new_files_list  = []
     new_gbdbs_list  = []
 
+    if not options.do_md5_checks:
+        warnings.append("Use of MD5 checksums to verify unchanged files has been disabled.")
+
     # process the list of unchanged files
     for f in unchanged_files:
         # a file name must have exactly three dot-separated parts, otherwise
         # this unpacking raises ValueError
         name, type, extension = f.split(".")
         assert extension == "gz"
 
         # we don't deal with revisions yet
         # NOTE(review): new_files holds full file names ending in .gz while
         # next_version() returns a bare name, so this membership test looks
         # like it can never be true -- confirm.
         stem, version = parse_version(name)
         if next_version(stem, version) in new_files:
             raise ValueError, "Unimplemented: newer version of %s found, can't deal with this yet" % name
         # check to make sure the files are really the same
-        if not same_file(os.path.join(current_release_dir, f), os.path.join(prev_release_dir, f)):
+        if not same_file(os.path.join(current_release_dir, f), os.path.join(prev_release_dir, f), options.do_md5_checks):
             warn = "file %s in %s and %s don't appear to be the same (type=%s)" % \
                 (name, current_release_dir, prev_release_dir, type)
             warnings.append(warn)
             print >>sys.stderr, warn
 
         if gbdb.match(type):
             unchanged_files_list.append(path_prefix + f)
             if table_exists(database, name):
                 unchanged_tables_list.append(name)
                 gbdb_path = get_gbdb_pathname(database, name)
                 if gbdb_path != None and os.path.exists(gbdb_path):
                     unchanged_gbdbs_list.append(gbdb_path)
                     # accounted for; whatever stays in gbdb_files is reported
                     # by the "remaining files" pass further down
                     # NOTE(review): set.remove raises KeyError when the base
                     # name is not an entry of ./gbdb
                     gbdb_files.remove(os.path.basename(gbdb_path))
                 else:
                     warn = "could not find %s file for %s" % (gbdb_path, f)
                     warnings.append(warn)
                     print >>sys.stderr, warn
             else:
                 warn = "table %s does not exist, from filetype %s" % (name, type)
                 warnings.append(warn)
                 print >>sys.stderr, warn
         elif table_and_file.match(type):
             unchanged_files_list.append(path_prefix + f)
             if table_exists(database, name):
                 unchanged_tables_list.append(name)
             else:
                 warn = "table %s does not exist, from filetype %s" % (name, type)
                 warnings.append(warn)
                 print >>sys.stderr, warn
         elif file_only.match(type):
             unchanged_files_list.append(path_prefix + f)
         else:
             raise ValueError, "unknown type %s of file %s" % (type, f)
 
     # process the list of removed files
     # (same classification as above, but gbdb_files is left untouched here)
     for f in removed_files:
         name, type, extension = f.split(".")
         assert extension == "gz"
 
         if gbdb.match(type):
             removed_files_list.append(path_prefix + f)
             if table_exists(database, name):
                 removed_tables_list.append(name)
                 gbdb_path = get_gbdb_pathname(database, name)
                 if gbdb_path != None and os.path.exists(gbdb_path):
                     removed_gbdbs_list.append(gbdb_path)
                 else:
                     warn = "could not find %s file for %s" % (gbdb_path, f)
                     warnings.append(warn)
                     print >>sys.stderr, warn
             else:
                 warn = "table %s does not exist, from filetype %s" % (name, type)
                 warnings.append(warn)
                 print >>sys.stderr, warn
         elif table_and_file.match(type):
             removed_files_list.append(path_prefix + f)
             if table_exists(database, name):
                 removed_tables_list.append(name)
             else:
                 warn = "table %s does not exist, from filetype %s" % (name, type)
                 warnings.append(warn)
                 print >>sys.stderr, warn
         elif file_only.match(type):
             removed_files_list.append(path_prefix + f)
         else:
             raise ValueError, "unknown type %s of file %s" % (type, f)
 
     # process the list of new files
     for f in new_files:
         name, type, extension = f.split(".")
         assert extension == "gz"
 
         if gbdb.match(type):
             new_files_list.append(path_prefix + f)
             if table_exists(database, name):
                 new_tables_list.append(name)
                 gbdb_path = get_gbdb_pathname(database, name)
                 if gbdb_path != None and os.path.exists(gbdb_path):
                     new_gbdbs_list.append(gbdb_path)
                     # NOTE(review): set.remove raises KeyError when the base
                     # name is not an entry of ./gbdb
                     gbdb_files.remove(os.path.basename(gbdb_path))
                 else:
                     warn = "could not find %s file for %s" % (gbdb_path, f)
                     warnings.append(warn)
                     print >>sys.stderr, warn
             else:
                 warn = "table %s does not exist, from filetype %s" % (name, type)
                 warnings.append(warn)
                 print >>sys.stderr, warn
         elif table_and_file.match(type):
             new_files_list.append(path_prefix + f)
             if table_exists(database, name):
                 new_tables_list.append(name)
             else:
                 warn = "table %s does not exist, from filetype %s" % (name, type)
                 warnings.append(warn)
                 print >>sys.stderr, warn
         elif file_only.match(type):
             new_files_list.append(path_prefix + f)
         else:
             raise ValueError, "unknown type %s of file %s" % (type, f)
 
     # check the list of addition files
     # (looked up relative to the current working directory)
     for f in possible_addition_files:
         if os.path.exists(f):
             addition_files.append(path_prefix + f)
         else:
             warn = "addition file %s not found" % f
             warnings.append(warn)
             print >>sys.stderr, warn
 
     # check remaining files from gbdb
     # (entries of ./gbdb that no unchanged or new download file accounted for)
     for f in gbdb_files:
         # the table name is taken to be everything before the first dot;
         # extension is not used afterwards
         if '.' in f:
             file, extension = f.split(".", 1)
         else:
             file = f
         if table_exists(database, file):
             if file not in new_tables_list:     # avoid listing the same table twice
                 new_tables_list.append(file)
             gbdb_path = get_gbdb_pathname(database, file)
             if gbdb_path != None and os.path.exists(gbdb_path):
                 new_gbdbs_list.append(gbdb_path)
                 
                 warn = "file %s in ./gbdb/ found with coresponding table and gbdb file but no download file" % f
                 warnings.append(warn)
                 print >>sys.stderr, warn
             else:
                 warn = "file %s in ./gbdb/ found with coresponding table but no download file" % f
                 warnings.append(warn)
                 print >>sys.stderr, warn
         else:
             warn = "file %s in ./gbdb/ found with no coresponding table or download file found" % f
             warnings.append(warn)
             print >>sys.stderr, warn
 
     # output some basic stats
     if options.verbose:
         print >>sys.stderr, "Counts:"
         print >>sys.stderr, "  unchanged tables: %d" % len(unchanged_tables_list)
         print >>sys.stderr, "  unchanged files: %d" % len(unchanged_files_list)
         print >>sys.stderr, "  unchanged gbdbs: %d" % len(unchanged_gbdbs_list)
         print >>sys.stderr, "  removed tables: %d" % len(removed_tables_list)
         print >>sys.stderr, "  removed files: %d" % len(removed_files_list)
         print >>sys.stderr, "  removed gbdbs: %d" % len(removed_gbdbs_list)
         print >>sys.stderr, "  new tables: %d" % len(new_tables_list)
         print >>sys.stderr, "  new files: %d" % len(new_files_list)
         print >>sys.stderr, "  new gbdbs: %d" % len(new_gbdbs_list)
         print >>sys.stderr, "  additional files: %d" % len(addition_files)
 
     # if asked, save the list of new files
     # (sorted new download files, an empty line, then sorted new gbdb files)
     if options.files_path:
         new_files_file = open(options.files_path, "w")
         new_files_list.sort()
         for i in new_files_list:
             print >>new_files_file, i
         print >>new_files_file
         new_gbdbs_list.sort()
         for i in new_gbdbs_list:
             print >>new_files_file, i
         new_files_file.close()
 
     # if asked, generate list of new tables
     if options.tables_path:
         new_tables_file = open(options.tables_path, "w")
         new_tables_list.sort()
         for i in new_tables_list:
             print >>new_tables_file, i
         print >>new_tables_file
         new_tables_file.close()
 
     # generate the header
     print "# generated with %s" % parser.get_version()
     print "This is a %s of the \"%s\"" % (current_release_dir, options.track_name)
     print "The composite track is %s" % options.composite_name
     
     # numbered list of every warning collected above
     if len(warnings) > 0:
         warn_header = "# WARNINGS "
         print warn_header + "#" * (60 - len(warn_header))
         c = 1
         for w in warnings:
             print  "%0d - %s" % (c, w)
             c += 1
         print "#" * 60
 
     print """
 Categories of tables and files('):
 A) Untouched - are on public browser and should remain
 B) Deprecated - are currently on RR but will no longer be needed and should not be referenced by the public site.
    NOTE: NO FILES SHOULD BE REMOVED from the downloads directory on hgdownloads (RR).  
    This list is provided for completeness. Any files marked here as in gbdb may be eliminated.
 C) New - are only currently on test but will need to be pushed to the RR.
 D) Additional items of note
 """
     
     # some summary counts of current files, i.e. new + untouched
     print "Summary total counts for %s (new+untouched):" % current_release_dir
     print "    Tables: %d" % (len(unchanged_tables_list) + len(new_tables_list))
     print "    Files: %d" % (len(unchanged_files_list) + len(new_files_list) + len(addition_files))
     print "    Gbdbs: %d" % (len(unchanged_gbdbs_list) + len(new_gbdbs_list))
     print
 
     # untouched list
     print "A) Untouched Tables (%d):" % len(unchanged_tables_list)
     unchanged_tables_list.sort()
     for i in unchanged_tables_list:
         print i
     print
 
     print "A') Untouched Files (%d downloadables, %d gbdbs):" % (len(unchanged_files_list),
                                                            len(unchanged_gbdbs_list))
     # prev_release_dir is "" when "-" was given, i.e. there is no previous
     # release and nothing can be unchanged
     if prev_release_dir == "":
         assert len(unchanged_files_list) == 0
         assert len(unchanged_gbdbs_list) == 0
     else:
         print "    current location on alpha:"
         print "      /usr/local/apache/htdocs-hgdownload/goldenPath/%s/encodeDCC/%s/%s/" % (database, options.composite_name, current_release_dir)
         print "    on RR:"
         print "      /usr/local/apache/htdocs-hgdownload/goldenPath/%s/encodeDCC/%s/" % (database, options.composite_name)
         print
         unchanged_files_list.sort()
         for i in unchanged_files_list:
             print i
         print
         unchanged_gbdbs_list.sort()
         for i in unchanged_gbdbs_list:
             print i
     print
 
     # eliminated list
     print "B) Deprecated tables (%d):" % len(removed_tables_list)
     removed_tables_list.sort()
     for i in removed_tables_list:
         print i
     print
 
     print "B') Deprecated files (%d downloadables, %d gbdbs):" %(len(removed_files_list),
                                                                 len(removed_gbdbs_list))
     # without a previous release nothing can have been removed
     if prev_release_dir == "":
         assert len(removed_files_list) == 0
         assert len(removed_gbdbs_list) == 0
     else:
         print "    NOTE: NO FILES SHOULD BE REMOVED from the downloads directory on hgdownloads (RR)."
         print "    This list is provided for completeness. Any files marked here as in gbdb may be eliminated."
         print "    current location on alpha:"
         print "      /usr/local/apache/htdocs-hgdownload/goldenPath/%s/encodeDCC/%s/%s/" % (database, options.composite_name, prev_release_dir)
         print "    on RR:"
         print "      /usr/local/apache/htdocs-hgdownload/goldenPath/%s/encodeDCC/%s/" % (database, options.composite_name)
         print
 
         removed_files_list.sort()
         for i in removed_files_list:
             print i
         print
         removed_gbdbs_list.sort()
         for i in removed_gbdbs_list:
             print i
 
     print
 
     # new list
     print "C) New tables (%d):" % len(new_tables_list)
     new_tables_list.sort()
     for i in new_tables_list:
         print i
     print
 
     print "C') New files (%d downloadables, %d gbdbs):" % (len(new_files_list),
                                                          len(new_gbdbs_list))
     print "    current location on alpha:"
     print "      /usr/local/apache/htdocs-hgdownload/goldenPath/%s/encodeDCC/%s/%s/" % (database, options.composite_name, current_release_dir)
     print "    NOT on RR but must be placed in:"
     print "      /usr/local/apache/htdocs-hgdownload/goldenPath/%s/encodeDCC/%s/" % (database, options.composite_name)
     print
 
     new_files_list.sort()
     for i in new_files_list:
         print i
     print
     new_gbdbs_list.sort()
     for i in new_gbdbs_list:
         print i
 
     print
 
     # additional items (index.html, files.txt, md5sum.txt when present)
     print "D) Additional items:"
     print "    current location on alpha:"
     print "      /usr/local/apache/htdocs-hgdownload/goldenPath/%s/encodeDCC/%s/%s/" % (database, options.composite_name, current_release_dir)
     print "    should be placed on the RR in (overwritting any existing copy):"
     print "      /usr/local/apache/htdocs-hgdownload/goldenPath/%s/encodeDCC/%s/" % (database, options.composite_name)
     print
     for f in addition_files:
         print f
 
 #### Module ####################################################################
 
 if __name__ == "__main__":
     sys.exit(main())