8b09636a6001792a6dca8135057806bf5ad88d43
wong
  Mon Oct 17 12:12:30 2011 -0700
Made some minor corrections and fixed a bug in calculating revoked files (a pigeonhole expansion problem).
diff --git python/programs/mkChangeNotes/mkChangeNotes python/programs/mkChangeNotes/mkChangeNotes
index dd0c59f..e0e0576 100755
--- python/programs/mkChangeNotes/mkChangeNotes
+++ python/programs/mkChangeNotes/mkChangeNotes
@@ -1,49 +1,54 @@
 #!/hive/groups/encode/dcc/bin/python
 import sys, os, re, argparse, subprocess, math
 from ucscgenomics import ra, track
 
 def checkMetaDbForFiles(mdb, files, status, loose):
     errors = []
     revokedset = set()
+    revokedfiles = set()
     atticset = set()
     supplementalset = set()
     filtermdb = ra.RaFile()
+    
     for i in files:
         if re.match('.\/', i):
             supplementalset.add(i)
         if not re.match('wgEncode.*', i):
             continue
         
+        
+        #this needs to be rewritten to handle V2's and things like that
         filestanza = mdb.filter(lambda s: re.match(".*%s.*" % i,s['fileName']), lambda s: s)
         
         #should only return 1, just in case
         if filestanza:
-            for i in filestanza:
-                filtermdb[i.name] = i
-                if 'objStatus' in i and  re.search('revoked|replaced|renamed', i['objStatus']):
-                    revokedset.add(i.name)
-                if 'attic' in i:
-                    atticset.add(i.name)
+            for j in filestanza:
+                filtermdb[j.name] = j
+                if 'objStatus' in j and re.search('revoked|replaced|renamed', j['objStatus']):
+                    revokedfiles.add(i)
+                    revokedset.add(j.name)
+                if 'attic' in j:
+                    atticset.add(j.name)
         else:
             #pass
             if loose and re.match('.*bai', i):
                 pass
             else:
                 errors.append("metaDb: %s has is not mentioned in %s" % (i, status))
         
-    return (filtermdb, revokedset, atticset, supplementalset, errors)
+    return (filtermdb, revokedset, revokedfiles, atticset, supplementalset, errors)
 
 def checkAlphaForDropped(new, old, status, type):
     errors=[]
     diff = set(old) -set(new)
     for i in diff:
         errors.append("%s: %s missing from %s" % (type, i, status))
     
     return errors
 
 def checkFilesForDropped(new, old):
     diff = set(old) - set(new)
     return diff
     
 
 def checkTableStatus(mdb, files, database, composite, status, loose, revokedset):
@@ -102,32 +107,32 @@
             errors.append("table: %s is type obj, but missing tableName field called by %s" % (i, status))
 
     if missingFromDb:
         for i in missingFromDb:
             errors.append("table: %s table not found in Db called by %s" % (i, status))
     
     
             
     return (mdbtableset, revokedtableset, errors)
 
 def getGbdbFiles(database, tableset, revokedset, mdb):
     errors = []
     sep = "','"
     tablestr = sep.join(tableset)
     tablestr = "'" + tablestr + "'"
-    revokestr = sep.join(tableset)
-    revokestr = "'" + tablestr + "'"
+    revokestr = sep.join(revokedset)
+    revokestr = "'" + revokestr + "'"
     
     cmd = "hgsql %s -e \"select table_name from information_schema.columns where table_name in (%s) and column_name = 'fileName'\"" % (database, tablestr)
     p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
     output = p.stdout.read()
     
     gbdbtableset = set(output.split("\n")[1:])
     
     cmd = "hgsql %s -e \"select table_name from information_schema.columns where table_name in (%s) and column_name = 'fileName'\"" % (database, revokestr)
     p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
     output = p.stdout.read()
     
     revokedtableset = set(output.split("\n")[1:])
     
     file1stanzalist = mdb.filter(lambda s: s['tableName'] in gbdbtableset, lambda s: s)
     revokedstanzalist = mdb.filter(lambda s: s['tableName'] in revokedtableset, lambda s: s)
@@ -218,40 +223,30 @@
             elif i in newSupplementalSet:
                 continue
             else:
                 oldAdditionalList.add(i)
         else:
             newOld.add(i)
     
     oldReleaseFiles = newOld
 
     return(newOld, additionalList, oldAdditionalList, newTotal)
 
 def printWithPath(set, c, release):
     for i in sorted(set):
         print "%s/%s" % (c.downloadsDirectory + 'release' + release, i)
 
-def makeRevokedFiles(revokedSet, mdb):
-    revokedFiles = set()
-    for i in revokedSet:
-        file = mdb.filter(lambda s: re.match(".*%s.*" % i,s['fileName']), lambda s: s['fileName'])
-        for j in file:
-            splitfile = j.split(",")
-            for k in splitfile:
-                revokedFiles.add(k)
-    return revokedFiles
-
 def printIter(inlist):
     for i in sorted(inlist):
         print i
 
 def printReport(args, totalFiles, newGbdbSet, newTableSet, additionalList, oldAdditionalList, pushTables, pushFiles, pushGbdbs, c, oldTableSet, oldReleaseFiles, oldGbdbSet, atticSet, revokedFiles, mdb, revokedTableSet, revokedGbdbs, missingFiles, newSupplementalSet, oldSupplementalSet, tableSize):
     #the groups here need to be predefined, I just copied and pasted after working out what they were
     sep = "\n"
     print "mkChangeNotes v2"
     print "%s %s Release %s" % (args.database, args.composite, args.releaseNew)    
     print "" 
     print "QA Count Summaries for Release %s:" % args.releaseNew
     print "Tables: %d" % int(len(newTableSet))
     print "Files: %d" % int(len(totalFiles - revokedFiles))
     print "Gbdbs: %d" % int(len(newGbdbSet))
     print "Supplemental: %d" % int(len(newSupplementalSet - oldSupplementalSet))
@@ -330,47 +325,49 @@
     print "Supplemental Files:"
     print "New: %s" % len(newSupplementalSet - oldSupplementalSet)
     print "Persisting: %s" % len(oldSupplementalSet & newSupplementalSet)
     print "Removed: %s" % len(oldSupplementalSet - newSupplementalSet)
     print "New + Persisting: %s" % len((newSupplementalSet - oldSupplementalSet) | (oldSupplementalSet & newSupplementalSet))
     print "Total: %s" % len(newSupplementalSet | oldSupplementalSet)
     if args.full:
         print ""
         print "New Supplemental Files:"
         printWithPath(newSupplementalSet - oldSupplementalSet, c, args.releaseNew)
         print ""
         print "Persisting Supplemental Files:"
         printWithPath(oldSupplementalSet & newSupplementalSet, c, args.releaseNew)
         print ""
         print "Removed Supplemental Files:"
-        printWithPath(oldSupplementaList - newSupplementalSet, c, args.releaseNew)
+        printWithPath(oldSupplementalSet - newSupplementalSet, c, args.releaseNew)
     print "\n"
     print "OTHER FILES:"
     print "New: %s" % len(additionalList)
     print "Revoked/Replace: %s" % len(oldAdditionalList)
     print "Total: %s" % len(additionalList | oldAdditionalList)
-    print "\n"
     if args.full:
         print "" 
         print "New Other Files (%s):" % len(additionalList)
         printWithPath(additionalList, c, args.releaseNew)
         print ""
         print "Revoked Other Files (%s):" % len(oldAdditionalList)
         printWithPath(oldAdditionalList, c, args.releaseNew)
+    print "\n"
     print "Files that dropped between releases (%s):" % len(missingFiles)
     printWithPath(missingFiles, c, args.releaseOld)
-    
+    print "\n"
+    if not args.ignore:
+        print "No Errors"
     
 def printReportOne(args, totalFiles, newGbdbSet, newTableSet, additionalList, pushTables, c, pushGbdbs, atticSet, newSupplementalSet, tableSize):
     print "mkChangeNotes v2"
     print "%s %s Release %s" % (args.database, args.composite, args.releaseNew)
     print ""
     print "QA Count Summaries for Release %s:" % args.releaseNew
     print "Tables: %d" % int(len(newTableSet))
     print "Files: %d" % int(len(totalFiles))
     print "Gbdbs: %d" % int(len(newGbdbSet))
     print "Supplemental: %d" % int(len(newSupplementalSet))
     print "Other: %d" % int(len(additionalList))
     print "\n"
     totalsize = 0;
     print "Sizes of New:"
     print "Tables: %d MB" % tableSize
@@ -394,30 +391,32 @@
         print "New Tables (%s):" % len(pushTables)
         printIter(pushTables)
         print "\n"
         print "New Download Files (%s):" % len(totalFiles)
         printWithPath(totalFiles, c, args.releaseNew)
         print "\n"
         print "New Gbdb Files (%s):" % len(pushGbdbs)
         printWithPath(pushGbdbs, c, args.releaseNew)
         print "\n"
         print "New Supplemental Files:"
         printWithPath(newSupplementalSet, c, args.releaseNew)
         print "\n" 
         print "New Other Files (%s):" % len(additionalList)
         printWithPath(additionalList, c, args.releaseNew)
     
+    if not args.ignore:
+        print "No Errors"
 
 
 def printErrors(errors):
     errorsDict = {}
     for i in errors:
         line = i.split(":", 1)
         try:
             errorsDict[line[0]].append(line[1])
         except:
             errorsDict[line[0]] = []
             errorsDict[line[0]].append(line[1])
     print "Errors (%s):" % len(errors)
     for i in sorted(errorsDict.keys()):
         print "%s:" % i
         for j in sorted(errorsDict[i]):
@@ -456,81 +455,80 @@
     
     if args.releaseOld == "-":
         args.releaseOld = 0
     if int(args.releaseOld) > int(args.releaseNew):
         errors.append("Old Release is higher than New Release")
         args.releaseOld = args.releaseNew
 
     if int(args.releaseNew) > 1:
     
         newReleaseFiles = c.releases[int(args.releaseNew)-1]
         oldReleaseFiles = c.releases[int(args.releaseOld)-1]
 
         newMdb = c.alphaMetaDb
         oldMdb = c.publicMetaDb
 
-        (newMdb, revokedSet, atticSet, newSupplementalSet, newFileErrors) = checkMetaDbForFiles(newMdb, newReleaseFiles, "alpha metaDb", loose)
+        (newMdb, revokedSet, revokedFiles, atticSet, newSupplementalSet, newFileErrors) = checkMetaDbForFiles(newMdb, newReleaseFiles, "alpha metaDb", loose)
         errors.extend(newFileErrors)
-        (oldMdb, spam, eggs, oldSupplementalSet, oldFileErrors) = checkMetaDbForFiles(oldMdb, oldReleaseFiles, "public metaDb", loose)
+        (oldMdb, spam, eggs, ham, oldSupplementalSet, oldFileErrors) = checkMetaDbForFiles(oldMdb, oldReleaseFiles, "public metaDb", loose)
         errors.extend(oldFileErrors)
         errors.extend(checkAlphaForDropped(newMdb, oldMdb, "alpha metaDb", "stanza"))
         missingFiles = checkFilesForDropped(newReleaseFiles, oldReleaseFiles)
         errors.extend(checkMd5sums(newReleaseFiles, oldReleaseFiles))
 
         (newTableSet, revokedTableSet, newTableError) = checkTableStatus(newMdb, newReleaseFiles, args.database, args.composite, "alpha metaDb", loose, revokedSet)
         errors.extend(newTableError)
         (oldTableSet, spam,  oldTableError) = checkTableStatus(oldMdb, oldReleaseFiles, args.database, args.composite, "public metaDb", loose, revokedSet)
         errors.extend(oldTableError)
 
         (newGbdbSet, revokedGbdbs, newGbdbError) = getGbdbFiles(args.database, newTableSet, revokedTableSet, newMdb)
         errors.extend(newGbdbError)
         (oldGbdbSet, eggs, oldGbdbError) = getGbdbFiles(args.database, oldTableSet, set(), oldMdb)
         errors.extend(oldGbdbError)
     
         droppedTables = oldTableSet - newTableSet
         if droppedTables:
             for i in droppedTables:
                 errors.append("table: %s was dropped between releases" % i)
 
 
         totalFiles = set(newReleaseFiles)
 
         #these could honestly be moved earlier, get a file list processing section or something
         totalFiles = cleanSpecialFiles(totalFiles)
         oldReleaseFiles = cleanSpecialFiles(set(oldReleaseFiles))
         (oldReleaseFiles, additionalList, oldAdditionalList, totalFiles) = separateOutAdditional(oldReleaseFiles, totalFiles, newSupplementalSet, oldSupplementalSet)
         
-        revokedFiles = makeRevokedFiles(revokedSet, newMdb)
         pushTables = set(sorted((newTableSet - oldTableSet)))
         tableSize = getTableSize(pushTables, args.database)
         
         pushFiles = set(sorted((totalFiles - oldReleaseFiles)))
         pushGbdbs = set(sorted((newGbdbSet - oldGbdbSet)))
         if (not errors) or args.ignore:
             printReport(args, totalFiles, newGbdbSet, newTableSet, additionalList, oldAdditionalList, pushTables, pushFiles, pushGbdbs, c, oldTableSet, oldReleaseFiles, oldGbdbSet, atticSet, revokedFiles, newMdb, revokedTableSet, revokedGbdbs, missingFiles, newSupplementalSet, oldSupplementalSet, tableSize)
         else:
             printErrors(errors)
 
 
     else:
 
         args.releaseOld = 0
         newReleaseFiles = c.releases[int(args.releaseNew)-1]
         
         newMdb = c.alphaMetaDb
         
-        (newMdb, revokedSet, atticSet, newSupplementalSet, newFileErrors) = checkMetaDbForFiles(newMdb, newReleaseFiles, "alpha metaDb", loose)
+        (newMdb, revokedSet, revokedFiles, atticSet, newSupplementalSet, newFileErrors) = checkMetaDbForFiles(newMdb, newReleaseFiles, "alpha metaDb", loose)
         errors.extend(newFileErrors)
         
         (newTableSet, revokedTableSet, newTableError) = checkTableStatus(newMdb, newReleaseFiles, args.database, args.composite, "alpha metaDb", loose, revokedSet)
         errors.extend(newTableError)
         tableSize = getTableSize(newTableSet, args.database)
 
         (newGbdbSet, revokedGbdbs, newGbdbError) = getGbdbFiles(args.database, newTableSet, revokedTableSet, newMdb)
         errors.extend(newGbdbError)
         
         #set for easy operations
         totalFiles = set(newReleaseFiles)
         
         #clean out special fiels we don't push i.e. md5sum.history
         totalFiles = cleanSpecialFiles(totalFiles)