8a6f4c6a2883ca6fe87636bac0b186b28803b375
wong
  Mon Oct 17 14:16:19 2011 -0700
added comments, changed how paths are printed, added automatic gigabyte printing for file sizes
diff --git python/programs/mkChangeNotes/mkChangeNotes python/programs/mkChangeNotes/mkChangeNotes
index e0e0576..d480801 100755
--- python/programs/mkChangeNotes/mkChangeNotes
+++ python/programs/mkChangeNotes/mkChangeNotes
@@ -1,70 +1,66 @@
 #!/hive/groups/encode/dcc/bin/python
 import sys, os, re, argparse, subprocess, math
 from ucscgenomics import ra, track
 
 def checkMetaDbForFiles(mdb, files, status, loose):
     errors = []
     revokedset = set()
     revokedfiles = set()
     atticset = set()
     supplementalset = set()
     filtermdb = ra.RaFile()
     
     for i in files:
-        if re.match('.\/', i):
+        if re.match('supplemental', i):
             supplementalset.add(i)
         if not re.match('wgEncode.*', i):
             continue
         
-        
-        #this needs to be rewritten to handle V2's and things like that
         filestanza = mdb.filter(lambda s: re.match(".*%s.*" % i,s['fileName']), lambda s: s)
-        
         #should only return 1, just in case
         if filestanza:
             for j in filestanza:
                 filtermdb[j.name] = j
                 if 'objStatus' in j and re.search('revoked|replaced|renamed', j['objStatus']):
                     revokedfiles.add(i)
                     revokedset.add(j.name)
                 if 'attic' in j:
                     atticset.add(j.name)
         else:
             #pass
             if loose and re.match('.*bai', i):
                 pass
             else:
                 errors.append("metaDb: %s has is not mentioned in %s" % (i, status))
                 
     return (filtermdb, revokedset, revokedfiles, atticset, supplementalset, errors)
 
 def checkAlphaForDropped(new, old, status, type):
     errors=[]
     diff = set(old) -set(new)
     for i in diff:
         errors.append("%s: %s missing from %s" % (type, i, status))
-    
     return errors
 
 def checkFilesForDropped(new, old):
     diff = set(old) - set(new)
     return diff
     
-
 def checkTableStatus(mdb, files, database, composite, status, loose, revokedset):
     errors=[]
+    
     #home = os.environ['HOME']
     #dbhost = ''
     #dbuser = ''
     #dbpassword = ''
     #p = re.compile('db.(\S+)=(\S+)')
     #with open("%s/.hg.conf" % home) as f:
     #    for line in f:
     #        line.rstrip("\n\r")
     #        if p.match(line):
     #            m = p.match(line)
     #            if m.groups(1)[0] == 'host':
     #                dbhost = m.groups(1)[1]
     #            if m.groups(1)[0] == 'user':
     #                dbuser = m.groups(1)[1]
     #            if m.groups(1)[0] == 'password':
@@ -98,32 +94,30 @@
     
     sqltableset = set(output.split("\n")[1:])
 
     missingTableNames = set(mdb.filter(lambda s: s['objType'] == 'table' and 'tableName' not in s and 'attic' not in s, lambda s: s['metaObject']))
 
     missingFromDb = mdbtableset - sqltableset
     
     if missingTableNames:
         for i in missingTableNames:
             errors.append("table: %s is type obj, but missing tableName field called by %s" % (i, status))
 
     if missingFromDb:
         for i in missingFromDb:
             errors.append("table: %s table not found in Db called by %s" % (i, status))
     
-    
-            
     return (mdbtableset, revokedtableset, errors)
 
 def getGbdbFiles(database, tableset, revokedset, mdb):
     errors = []
     sep = "','"
     tablestr = sep.join(tableset)
     tablestr = "'" + tablestr + "'"
     revokestr = sep.join(revokedset)
     revokestr = "'" + revokestr + "'"
     
     cmd = "hgsql %s -e \"select table_name from information_schema.columns where table_name in (%s) and column_name = 'fileName'\"" % (database, tablestr)
     p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
     output = p.stdout.read()
     
     gbdbtableset = set(output.split("\n")[1:])
@@ -152,31 +146,31 @@
         for j in filelist:
             if os.path.isfile("/gbdb/%s/bbi/%s" % (database, j)):
                 revokedfileset.add(j)
             else:
                 errors.append("gbdb: revoked gbdb %s does not exist in /gbdb/%s/bbi" % (j, databsase))
     
     return (gbdbfileset, revokedfileset, errors)
 
 def getTableSize(mdbtableset, database):
     tablesize = float(0)
     tablelist = list()
     for i in mdbtableset:
         tablelist.append("table_name = '%s'" % i)
     orsep = " OR "
     orstr = orsep.join(tablelist)
-    #print orstr
+
     cmd = "hgsql %s -e \"SELECT ROUND(data_length/1024/1024,2) total_size_mb, ROUND(index_length/1024/1024,2) total_index_size_mb FROM information_schema.TABLES WHERE table_name = %s\"" % (database, orstr)
     p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
     output = p.stdout.read()
     for i in output.split("\n")[1:]:
         fields = i.split()
         for j in fields:
             tablesize = tablesize + float(j)
     return math.ceil(tablesize)
 
 def checkMd5sums(newfiles, oldfiles):
     errors = []
     for i in oldfiles:
         if i not in newfiles:
             pass
         elif re.match('wgEncode.*', i):
@@ -198,88 +192,111 @@
 
 def cleanSpecialFiles(inlist):
     specialRemoveList = ['md5sum.history']
     for i in specialRemoveList:
         if i in inlist:
             inlist.remove(i)
     
     return(inlist)
     
 def separateOutAdditional(oldReleaseFiles, totalFiles, newSupplementalSet, oldSupplementalSet):
     additionalList = set()
     oldAdditionalList = set()
     newTotal = set()
     newOld = set()
     for i in totalFiles:
-        if not re.match('wgEncode.*', i):
-            additionalList.add(i)
-        elif i in newSupplementalSet:
+        if i in newSupplementalSet:
             continue
+        elif not re.match('wgEncode.*', i):
+            additionalList.add(i)
         else:
             newTotal.add(i)
     for i in oldReleaseFiles:
         if not re.match('wgEncode.*', i):
             if i in totalFiles:
                 pass
             elif i in newSupplementalSet:
                 continue
             else:
                 oldAdditionalList.add(i)
         else:
             newOld.add(i)
     
     oldReleaseFiles = newOld
 
     return(newOld, additionalList, oldAdditionalList, newTotal)
 
 def printWithPath(set, c, release):
     for i in sorted(set):
-        print "%s/%s" % (c.downloadsDirectory + 'release' + release, i)
+        print "%s/%s" % (c.httpDownloadsPath + 'release' + release, i)
+        
+def printGbdbPath(set, database):
+    for i in sorted(set):
+        print "/gbdb/%s/bbi/%s" % (database, i)
         
 def printIter(inlist):
     for i in sorted(inlist):
         print i
 
 def printReport(args, totalFiles, newGbdbSet, newTableSet, additionalList, oldAdditionalList, pushTables, pushFiles, pushGbdbs, c, oldTableSet, oldReleaseFiles, oldGbdbSet, atticSet, revokedFiles, mdb, revokedTableSet, revokedGbdbs, missingFiles, newSupplementalSet, oldSupplementalSet, tableSize):
     #the groups here need to be predefined, I just copied and pasted after working out what they were
     sep = "\n"
     print "mkChangeNotes v2"
     print "%s %s Release %s" % (args.database, args.composite, args.releaseNew)    
     print "" 
     print "QA Count Summaries for Release %s:" % args.releaseNew
     print "Tables: %d" % int(len(newTableSet))
     print "Files: %d" % int(len(totalFiles - revokedFiles))
     print "Gbdbs: %d" % int(len(newGbdbSet))
     print "Supplemental: %d" % int(len(newSupplementalSet - oldSupplementalSet))
     print "Other: %d" % int(len(additionalList))
     print "\n"
     totalsize = 0;
     print "Sizes of New:"
+    tableGb = int(tableSize/1024)
+    if tableGb > 1:
+        print "Tables: %d MB (%d GB)" % (tableSize, tableGb)
+    else:
     print "Tables: %d MB" % tableSize
     totalsize = totalsize + tableSize
     size = int(makeFileSizes(c, args, pushFiles))
     totalsize = totalsize + size
+    if int(size/1024) > 1:
+        print "Files: %d MB (%d GB)" % (size, int(size/1024))
+    else:
     print "Files: %d MB" % size
     size = int(makeFileSizes(c, args, pushGbdbs))
     totalsize = totalsize + size
+    if int(size/1024) > 1:
+        print "Gbdbs: %d MB (%d GB)" % (size, int(size/1024))
+    else:
     print "Gbdbs: %d MB" % size
     size = int(makeFileSizes(c, args, (newSupplementalSet - oldSupplementalSet)))
     totalsize = totalsize + size
+    if int(size/1024) > 1:  # GB figure is appended only from 2 GB up, same threshold as the other categories
+        print "Supplemental: %d MB (%d GB)" % (size, int(size/1024))
+    else:
     print "Supplemental: %d MB" % size
     size = int(makeFileSizes(c, args, (additionalList)))
     totalsize = totalsize + size
+    if int(size/1024) <= 1:  # test is inverted so the unchanged MB-only print below stays the small-size branch
     print "Other: %d MB" % size
+    else:
+        print "Other: %d MB (%d GB)" % (size, int(size/1024))
+    if int(totalsize/1024) > 1:
+        print "Total: %d MB (%d GB)" % (totalsize, int(totalsize/1024))
+    else:
     print "Total: %d MB" % totalsize
     print "\n"
     print "TABLES:"
     print "New: %s" % len(pushTables)
     print "Persisting: %s" % len(oldTableSet & newTableSet)
     print "Revoked/Replaced/Renamed: %s" % len(revokedTableSet)
     print "New + Persisting: %s" % len(newTableSet)
     print "Total (New + Persisting + Revoked/Replaced/Renamed): %s" % len(newTableSet | oldTableSet | revokedTableSet)
     if args.full:
         print ""
         print "New Tables (%s):" % len(pushTables)
         printIter(pushTables)
         print ""
         print "Persisting (%s):" % len(oldTableSet & newTableSet)
         printIter(oldTableSet & newTableSet)
@@ -302,37 +319,37 @@
         print "Persisting Download Files (%s):" % len((totalFiles & oldReleaseFiles) - revokedFiles)    
         printWithPath(((totalFiles & oldReleaseFiles) - revokedFiles), c, args.releaseNew)
         print ""
         print "Revoked/Replaced/Renamed Download Files (%s):" % len(revokedFiles)    
         printWithPath(revokedFiles, c, args.releaseNew)
     print "\n"
     print "GBDBS:"
     print "New: %s" % len(pushGbdbs)
     print "Persisting: %s" % len((newGbdbSet & oldGbdbSet) - revokedGbdbs)
     print "Revoked/Replaced/Renamed: %s" % len(revokedGbdbs)
     print "New + Persisting: %s" % len(pushGbdbs | ((newGbdbSet & oldGbdbSet) - revokedGbdbs))
     print "Total (New + Persisting + Revoked/Replaced/Renamed): %s" % len(newGbdbSet | oldGbdbSet | revokedGbdbs)
     if args.full:
         print ""
         print "New Gbdb Files (%s):" % len(pushGbdbs)
-        printWithPath(pushGbdbs, c, args.releaseNew)
+        printGbdbPath(pushGbdbs, args.database)
         print ""
         print "Persisting Gbdb Files (%s):" % len((newGbdbSet & oldGbdbSet) - revokedGbdbs)
-        printWithPath(((newGbdbSet & oldGbdbSet) - revokedGbdbs), c, args.releaseNew)
+        printGbdbPath(((newGbdbSet & oldGbdbSet) - revokedGbdbs), args.database)
         print ""
         print "Revoked/Replaced/Renamed Gbdb Files (%s):" % len(revokedGbdbs)
-        printWithPath(revokedGbdbs, c, args.releaseNew)
+        printGbdbPath(revokedGbdbs, args.database)
     print "\n"
     print "Supplemental Files:"
     print "New: %s" % len(newSupplementalSet - oldSupplementalSet)
     print "Persisting: %s" % len(oldSupplementalSet & newSupplementalSet)
     print "Removed: %s" % len(oldSupplementalSet - newSupplementalSet)
     print "New + Persisting: %s" % len((newSupplementalSet - oldSupplementalSet) | (oldSupplementalSet & newSupplementalSet))
     print "Total: %s" % len(newSupplementalSet | oldSupplementalSet)
     if args.full:
         print ""
         print "New Supplemental Files:"
         printWithPath(newSupplementalSet - oldSupplementalSet, c, args.releaseNew)
         print ""
         print "Persisting Supplemental Files:"
         printWithPath(oldSupplementalSet & newSupplementalSet, c, args.releaseNew)
         print ""
@@ -358,181 +375,217 @@
         print "No Errors"
     
 def printReportOne(args, totalFiles, newGbdbSet, newTableSet, additionalList, pushTables, c, pushGbdbs, atticSet, newSupplementalSet, tableSize):
     print "mkChangeNotes v2"
     print "%s %s Release %s" % (args.database, args.composite, args.releaseNew)
     print ""
     print "QA Count Summaries for Release %s:" % args.releaseNew
     print "Tables: %d" % int(len(newTableSet))
     print "Files: %d" % int(len(totalFiles))
     print "Gbdbs: %d" % int(len(newGbdbSet))
     print "Supplemental: %d" % int(len(newSupplementalSet))
     print "Other: %d" % int(len(additionalList))
     print "\n"
     totalsize = 0;
     print "Sizes of New:"
+    tableGb = int(tableSize / 1024)
+    if tableGb > 1:
+        print "Tables: %d MB (%d GB)" % (tableSize, tableGb)
+    else:
     print "Tables: %d MB" % tableSize
     totalsize = totalsize + tableSize
     size = int(makeFileSizes(c, args, totalFiles))
     totalsize = totalsize + size
+    if int(size/1024) > 1:
+        print "Files: %d MB (%d GB)" % (size, int(size/1024))
+    else:
     print "Files: %d MB" % size
     size = int(makeFileSizes(c, args, pushGbdbs))
     totalsize = totalsize + size
+    if int(size/1024) > 1:
+        print "Gbdbs: %d MB (%d GB)" % (size, int(size/1024))
+    else:
     print "Gbdbs: %d MB" % size
     size = int(makeFileSizes(c, args, newSupplementalSet))
     totalsize = totalsize + size
+    if int(size/1024) > 1:  # GB figure is appended only from 2 GB up, same threshold as the other categories
+        print "Supplemental: %d MB (%d GB)" % (size, int(size/1024))
+    else:
     print "Supplemental: %d MB" % size
     size = int(makeFileSizes(c, args, (additionalList)))
     totalsize = totalsize + size
+    if int(size/1024) <= 1:  # test is inverted so the unchanged MB-only print below stays the small-size branch
     print "Other: %d MB" % size
+    else:
+        print "Other: %d MB (%d GB)" % (size, int(size/1024))
+    if int(totalsize/1024) > 1:
+        print "Total: %d MB (%d GB)" % (totalsize, int(totalsize/1024))
+    else:
     print "Total: %d MB" % totalsize
     print "\n"
     if args.full:
         print ""
         print "New Tables (%s):" % len(pushTables)
         printIter(pushTables)
         print "\n"
         print "New Download Files (%s):" % len(totalFiles)
         printWithPath(totalFiles, c, args.releaseNew)
         print "\n"
         print "New Gbdb Files (%s):" % len(pushGbdbs)
         printWithPath(pushGbdbs, c, args.releaseNew)
         print "\n"
         print "New Supplemental Files:"
         printWithPath(newSupplementalSet, c, args.releaseNew)
         print "\n" 
         print "New Other Files (%s):" % len(additionalList)
         printWithPath(additionalList, c, args.releaseNew)
-    
+        print "\n"
     if not args.ignore:
         print "No Errors"
 
 
 def printErrors(errors):
     errorsDict = {}
     for i in errors:
         line = i.split(":", 1)
         try:
             errorsDict[line[0]].append(line[1])
         except:
             errorsDict[line[0]] = []
             errorsDict[line[0]].append(line[1])
     print "Errors (%s):" % len(errors)
     for i in sorted(errorsDict.keys()):
         print "%s:" % i
         for j in sorted(errorsDict[i]):
             print "%s" % j
 
 def main():
 
     parser = argparse.ArgumentParser(
         prog='mkChangeNotes',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
         description='Writes out notes file for packing to QA',
-        epilog='example: encodeMkChange hg19 wgEncodeUwDnase 3 2'
+        epilog=
+"""Examples:
+
+mkChangeNotes hg19 wgEncodeUwDnase 3 2 --loose
+mkChangeNotes hg19 wgEncodeSydhTfbs 1 - --full
+mkChangeNotes hg19 wgEncodeCshlLongRnaSeq 1 -
+
+"""
         )
     parser.add_argument('-t', '--trackPath', help='Overrides the default track path ~/kent/src/hg/makeDb/trackDb/')
     parser.add_argument('-l', '--loose', action="store_true", default=0, help='Loose checking for legacy elements. Will be retired once all tracks go through a release cycle')
     parser.add_argument('-i', '--ignore', action="store_true", default=0, help='Ignore errors, print out report.')
     parser.add_argument('-f', '--full', action="store_true", default=0, help='Print full stats.')
     parser.add_argument('database', help='The database, typically hg19 or mm9')
     parser.add_argument('composite', help='The composite name, wgEncodeCshlLongRnaSeq for instance')
     parser.add_argument('releaseNew', help='The new release to be released')
-    parser.add_argument('releaseOld', help='The old release that is already released')
+    parser.add_argument('releaseOld', nargs='?', default='-', help='The old release that is already released, if on release 1, put anything here')
 
     if len(sys.argv) == 1:
         parser.print_help()
         return
     args = parser.parse_args(sys.argv[1:])
     if not args.releaseNew.isdigit():
         parser.print_help()
         return
 
-    
     c = track.CompositeTrack(args.database,args.composite)
 
     loose = args.loose
 
     errors = []
     
-    if args.releaseOld == "-":
+    if not args.releaseOld.isdigit():
         args.releaseOld = 0
     if int(args.releaseOld) > int(args.releaseNew):
         errors.append("Old Release is higher than New Release")
         args.releaseOld = args.releaseNew
 
     if int(args.releaseNew) > 1:
     
         newReleaseFiles = c.releases[int(args.releaseNew)-1]
         oldReleaseFiles = c.releases[int(args.releaseOld)-1]
 
         newMdb = c.alphaMetaDb
         oldMdb = c.publicMetaDb
 
+        #check if all files listed in release directories have associated metaDb entries
         (newMdb, revokedSet, revokedFiles, atticSet, newSupplementalSet, newFileErrors) = checkMetaDbForFiles(newMdb, newReleaseFiles, "alpha metaDb", loose)
-        errors.extend(newFileErrors)
         (oldMdb, spam, eggs, ham, oldSupplementalSet, oldFileErrors) = checkMetaDbForFiles(oldMdb, oldReleaseFiles, "public metaDb", loose)
+        errors.extend(newFileErrors)
         errors.extend(oldFileErrors)
+        
+        #checks to see that nothing has disappeared between public and alpha
         errors.extend(checkAlphaForDropped(newMdb, oldMdb, "alpha metaDb", "stanza"))
         missingFiles = checkFilesForDropped(newReleaseFiles, oldReleaseFiles)
         errors.extend(checkMd5sums(newReleaseFiles, oldReleaseFiles))
 
+        #checks and gets tables that are present, also returns a revoked set of tables for new
         (newTableSet, revokedTableSet, newTableError) = checkTableStatus(newMdb, newReleaseFiles, args.database, args.composite, "alpha metaDb", loose, revokedSet)
-        errors.extend(newTableError)
         (oldTableSet, spam,  oldTableError) = checkTableStatus(oldMdb, oldReleaseFiles, args.database, args.composite, "public metaDb", loose, revokedSet)
+        errors.extend(newTableError)
         errors.extend(oldTableError)
 
+        #same as above except for gbdbs
         (newGbdbSet, revokedGbdbs, newGbdbError) = getGbdbFiles(args.database, newTableSet, revokedTableSet, newMdb)
-        errors.extend(newGbdbError)
         (oldGbdbSet, eggs, oldGbdbError) = getGbdbFiles(args.database, oldTableSet, set(), oldMdb)
+        errors.extend(newGbdbError)
         errors.extend(oldGbdbError)
     
+        #check to see if tables were dropped between releases
         droppedTables = oldTableSet - newTableSet
         if droppedTables:
             for i in droppedTables:
                 errors.append("table: %s was dropped between releases" % i)
 
-
+        #for ease of typing
         totalFiles = set(newReleaseFiles)
 
         #these could honestly be moved earlier, get a file list processing section or something
+        #they clean out special files and separate the master file list into the 3 required
+        #ones: wgEncode, supplemental and additional.
         totalFiles = cleanSpecialFiles(totalFiles)
         oldReleaseFiles = cleanSpecialFiles(set(oldReleaseFiles))
         (oldReleaseFiles, additionalList, oldAdditionalList, totalFiles) = separateOutAdditional(oldReleaseFiles, totalFiles, newSupplementalSet, oldSupplementalSet)
         
+        #get the stuff you need to push, also table sizes        
         pushTables = set(sorted((newTableSet - oldTableSet)))
         tableSize = getTableSize(pushTables, args.database)
-        
         pushFiles = set(sorted((totalFiles - oldReleaseFiles)))
         pushGbdbs = set(sorted((newGbdbSet - oldGbdbSet)))
+        
+        #don't print report unless ignore option is on or no errors
         if (not errors) or args.ignore:
             printReport(args, totalFiles, newGbdbSet, newTableSet, additionalList, oldAdditionalList, pushTables, pushFiles, pushGbdbs, c, oldTableSet, oldReleaseFiles, oldGbdbSet, atticSet, revokedFiles, newMdb, revokedTableSet, revokedGbdbs, missingFiles, newSupplementalSet, oldSupplementalSet, tableSize)
         else:
             printErrors(errors)
 
 
     else:
 
-        args.releaseOld = 0
         newReleaseFiles = c.releases[int(args.releaseNew)-1]
         
         newMdb = c.alphaMetaDb
         
         (newMdb, revokedSet, revokedFiles, atticSet, newSupplementalSet, newFileErrors) = checkMetaDbForFiles(newMdb, newReleaseFiles, "alpha metaDb", loose)
         errors.extend(newFileErrors)
         
         (newTableSet, revokedTableSet, newTableError) = checkTableStatus(newMdb, newReleaseFiles, args.database, args.composite, "alpha metaDb", loose, revokedSet)
         errors.extend(newTableError)
+        
         tableSize = getTableSize(newTableSet, args.database)
 
         (newGbdbSet, revokedGbdbs, newGbdbError) = getGbdbFiles(args.database, newTableSet, revokedTableSet, newMdb)
         errors.extend(newGbdbError)
         
         #set for easy operations
         totalFiles = set(newReleaseFiles)
         
         #clean out special fiels we don't push i.e. md5sum.history
         totalFiles = cleanSpecialFiles(totalFiles)
         
         #makes list for additional files
         (oldReleaseFiles, additionalList, oldAdditionalList, totalFiles) = separateOutAdditional(set(), totalFiles, newSupplementalSet, set())
         if (not errors) or args.ignore:
             printReportOne(args, totalFiles, newGbdbSet, newTableSet, additionalList, newTableSet, c, newGbdbSet, atticSet, newSupplementalSet, tableSize)