cb3e2f4cfd186807136d469b7221b258896b749c
wong
  Thu Oct 27 16:34:32 2011 -0700
Pulled out functions that may be useful outside of mkChangeNotes into ucscUtils (isGbdbFile, makeFileSizes, printIter)
diff --git python/lib/ucscgenomics/mkChangeNotes.py python/lib/ucscgenomics/mkChangeNotes.py
index 3c21bb8..0a52704 100644
--- python/lib/ucscgenomics/mkChangeNotes.py
+++ python/lib/ucscgenomics/mkChangeNotes.py
@@ -1,18 +1,18 @@
 #!/hive/groups/encode/dcc/bin/python
 import sys, os, re, argparse, subprocess, math
-from ucscgenomics import ra, track, qa
+from ucscgenomics import ra, track, qa, ucscUtils
 
 class makeNotes(object):
     def checkMetaDbForFiles(self, status, state):
         if state == 'new':
             (mdb, files, loose) = (self.newMdb, self.newReleaseFiles, self.loose)
         elif state == 'old':
             (mdb, files, loose) = (self.oldMdb, self.oldReleaseFiles, self.loose)
 
         errors = []
         revokedset = set()
         revokedfiles = set()
         atticset = set()
         supplementalset = set()
         filtermdb = ra.RaFile()
 
@@ -111,37 +111,31 @@
         if missingTableNames:
             for i in missingTableNames:
                 errors.append("table: %s is type obj, but missing tableName field called by %s" % (i, status))
         missingFromDb = mdbtableset - sqltableset
         if missingFromDb:
             for i in missingFromDb:
                 errors.append("table: %s table not found in Db called by %s" % (i, status))
 
 
 
         return (mdbtableset, revokedtableset, errors)
 
     def __checkGbdbFileStatus(self, i, set, errors, state):
         filelist = i['fileName'].split(',')
         for j in filelist:
-            if os.path.isfile("%s/%s" % (self.gbdbPath, j)):
-                set.add(j)
-            else:
-                cmd = "hgsql %s -e \"select fileName from (%s)\"" % (self.database, i['tableName'])
-                p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
-                cmdoutput = p.stdout.read()
-                if os.path.isfile(cmdoutput.split("\n")[1]):
-                    set.add(j)
-                else:
-                    errors.append("gbdb: %s%s does not exist in %s" % (state, j, self.gbdbPath))
+            if ucscUtils.isGbdbFile(j, i['tableName'], self.database):
+                set.add(j)
+            else:
+                errors.append("gbdb: %s%s does not exist in %s" % (state, j, self.gbdbPath))
         return set, errors
 
     def getGbdbFiles(self, state):
         revokedset = set()
         if state == 'new':
             (tableset, revokedset, mdb) = (self.newTableSet, self.revokedSet, self.newMdb)
         elif state == 'old':
             (tableset, mdb) = (self.oldTableSet, self.oldMdb)
 
         errors = []
 
         gbdbtableset = qa.getGbdbTables(self.database, tableset)
@@ -187,45 +181,30 @@
 
     def __checkMd5sums(self):
         (newfiles, oldfiles, loose) = (self.newReleaseFiles, self.oldReleaseFiles, self.loose)
         errors = []
         for i in oldfiles:
             if i not in newfiles:
                 pass
             elif re.match('wgEncode.*', i):
                 if oldfiles[i].md5sum != newfiles[i].md5sum:
                     errors.append("file: %s have changed md5sums between releases. %s vs %s" % (i, oldfiles[i].md5sum, newfiles[i].md5sum))
         if loose:
             return list()
         else:
             return errors
 
-    def __makeFileSizes(self, args, inlist):
-        checklist = list()
-
-        for i in inlist:
-            checklist.append("%s/%s" % (self.releasePath, i))    
-
-        filesizes = 0
-        for i in checklist:
-            realpath = os.path.realpath(i)
-            filesizes = filesizes + int(os.path.getsize(realpath))
-
-        filesizes = math.ceil(float(filesizes) / (1024**2))
-
-        return int(filesizes)
-
     def __cleanSpecialFiles(self, inlist):
         specialRemoveList = ['md5sum.history']
         for i in specialRemoveList:
             if i in inlist:
                 inlist.remove(i)
 
         return(inlist)
 
     def __separateOutAdditional(self):
         (oldReleaseFiles, totalFiles, newSupplementalSet, oldSupplementalSet) = (self.oldTotalFiles, self.totalFiles, self.newSupplementalSet, self.oldSupplementalSet)
         additionalList = set()
         oldAdditionalList = set()
         newTotal = set()
         newOld = set()
         for i in totalFiles:
@@ -238,40 +217,30 @@
         for i in oldReleaseFiles:
             if not re.match('wgEncode.*', i):
                 if i in totalFiles:
                     pass
                 elif i in newSupplementalSet:
                     continue
                 else:
                     oldAdditionalList.add(i)
             else:
                 newOld.add(i)
 
         oldReleaseFiles = newOld
 
         return(newOld, additionalList, oldAdditionalList, newTotal)
 
-    def __printIter(self, set, path):
-        output = []
-        for i in sorted(set):
-            if path:
-                output.append("%s/%s" % (path, i))
-            else:
-                output.append("%s" % (i))
-        return output
-
-
     def __printSize(self, size, output, totalsize, type):
 
         sizeGb = int(size/1024)
         if sizeGb > 1:
             output.append("%s: %d MB (%d GB)" % (type, size, sizeGb))
         else:
             output.append("%s: %d MB" % (type, size))
 
         totalsize = totalsize + size
 
         return (output, totalsize)
 
     def __printSection(self, new, untouched, revoked, all, title, path, summary):
         output = []
         removeline = "Revoked/Replaced/Renamed"
@@ -294,37 +263,37 @@
             output.append("New: %s" % len(new))
             output.append("Untouched: %s" % len(untouched))
             output.append("%s: %s" % (removeline, len(revoked)))
             output.append("New + Untouched: %s" % len(new | untouched))
             output.append("%s: %s" % (totaline, len(all)))
             intersect = new & revoked
             if intersect:
                 output.append("")
                 output.append("These %s objects exist in both new and revoked %s:" % (len(intersect), title))
                 for i in intersect:
                     output.append("%s" % i)
             
         if all and not summary:
             output.append("")
             output.append("New %s (%s):" % (title.title(), len(new)))
-            output.extend(self.__printIter(new, path))
+            output.extend(ucscUtils.printIter(new, path))
             output.append("")
             output.append("Untouched %s (%s):" % (title.title(), len(untouched)))
-            output.extend(self.__printIter(untouched, path))
+            output.extend(ucscUtils.printIter(untouched, path))
             output.append("")
             output.append("%s %s (%s):" % (removeline, title.title(), len(revoked)))
-            output.extend(self.__printIter(revoked, path))
+            output.extend(ucscUtils.printIter(revoked, path))
         return output
 
     def __qaHeader(self, output, newTableSet, filesNoRevoke, newGbdbSet, newSupp, additionalList, revokedTables, revokedFiles, revokedGbdbs, pushFiles, pushGbdbs, args, c):
         output = []
         tableSize = self.__getTableSize()
 
         output.append("mkChangeNotes v2")
         title = "%s %s Release %s" % (args['database'], args['composite'], args['releaseNew'])
         if args['releaseOld'] != "solo":
             title = title + " vs Release %s" % args['releaseOld']
         if args['summary']:
             title = "Summary for " + title
         output.append(title)
 
         output.append("")
@@ -333,34 +302,34 @@
         output.append("Files: %d" % int(len(filesNoRevoke)))
         output.append("Gbdbs: %d" % int(len(newGbdbSet)))
         output.append("Supplemental: %d" % int(len(newSupp)))
         output.append("Other: %d" % int(len(additionalList)))
         output.append("")
         output.append("REVOKED:")
         output.append("Tables: %s" % len(revokedTables))
         output.append("Files: %s" % len(revokedFiles))
         output.append("Gbdbs: %s" % len(revokedGbdbs))
         output.append("")
         output.append("Sizes of New:")
 
         totalsize = 0
 
         (output, totalsize) = self.__printSize(tableSize, output, totalsize, "Table")
-        (output, totalsize) = self.__printSize(int(self.__makeFileSizes(args, pushFiles)), output, totalsize, "Files")
-        (output, totalsize) = self.__printSize(int(self.__makeFileSizes(args, pushGbdbs)), output, totalsize, "Gbdbs")
-        (output, totalsize) = self.__printSize(int(self.__makeFileSizes(args, newSupp)), output, totalsize, "Supplemental")
-        (output, totalsize) = self.__printSize(int(self.__makeFileSizes(args, additionalList)), output, totalsize, "Other")
+        (output, totalsize) = self.__printSize(int(ucscUtils.makeFileSizes(pushFiles, self.releasePath)), output, totalsize, "Files")
+        (output, totalsize) = self.__printSize(int(ucscUtils.makeFileSizes(pushGbdbs, self.releasePath)), output, totalsize, "Gbdbs")
+        (output, totalsize) = self.__printSize(int(ucscUtils.makeFileSizes(newSupp, self.releasePath)), output, totalsize, "Supplemental")
+        (output, totalsize) = self.__printSize(int(ucscUtils.makeFileSizes(additionalList, self.releasePath)), output, totalsize, "Other")
         (output, totalsize) = self.__printSize(totalsize, output, 0, "Total")
 
         return output
 
     def printReport(self, args, c):
         (totalFiles, newGbdbSet, newTableSet, additionalList, oldAdditionalList, oldTableSet, oldReleaseFiles, oldGbdbSet, atticSet, revokedFiles, revokedTableSet, revokedGbdbs, missingFiles, newSupplementalSet, oldSupplementalSet) = (self.totalFiles, self.newGbdbSet, self.newTableSet, self.additionalList, self.oldAdditionalList, self.oldTableSet, self.oldTotalFiles, self.oldGbdbSet, self.atticSet, self.revokedFiles, self.revokedTableSet, self.revokedGbdbs, self.missingFiles, self.newSupplementalSet, self.oldSupplementalSet)
         #the groups here need to be predefined, I just copied and pasted after working out what they were
         sep = "\n"
         output = []
         pushTables = set(sorted((self.newTableSet - self.oldTableSet)))
         pushFiles = set(sorted((self.totalFiles - self.oldTotalFiles)))
         pushGbdbs = set(sorted((self.newGbdbSet - self.oldGbdbSet)))
         filesNoRevoke = totalFiles - revokedFiles
         allTables = newTableSet | oldTableSet | revokedTableSet
         untouchedTables = oldTableSet & newTableSet
@@ -373,92 +342,92 @@
         newSupp = newSupplementalSet - oldSupplementalSet
         removedSupp = oldSupplementalSet - newSupplementalSet
         untouchedSupp = oldSupplementalSet & newSupplementalSet
         allOther = additionalList | oldAdditionalList
         removedOther = oldAdditionalList - additionalList
 
 
         output.extend(self.__qaHeader(output, newTableSet, filesNoRevoke, newGbdbSet, newSupp, additionalList, revokedTableSet, revokedFiles, revokedGbdbs, pushFiles, pushGbdbs, args, c))
 
         output.extend(self.__printSection(pushTables, untouchedTables, revokedTableSet, allTables, "tables", 0, args['summary']))
         output.extend(self.__printSection(pushFiles, untouchedFiles, revokedFiles, allFiles, "download", self.releasePath, args['summary']))
         output.extend(self.__printSection(pushGbdbs, untouchedGbdbs, revokedGbdbs, allGbdbs, "gbdbs", self.gbdbPath, args['summary']))
         output.extend(self.__printSection(newSupp, untouchedSupp, removedSupp, allSupp, "supplemental", self.releasePath, args['summary']))
 
         self.newTables = set(pushTables)
-        self.newFiles = set(self.__printIter(pushFiles, self.releasePath))
-        self.newGbdbs = set(self.__printIter(pushGbdbs, self.gbdbPath))
-        self.newSupplemental = set(self.__printIter(newSupp, self.releasePath))
-        self.newOthers = set(self.__printIter(additionalList, self.releasePath))
+        self.newFiles = set(ucscUtils.printIter(pushFiles, self.releasePath))
+        self.newGbdbs = set(ucscUtils.printIter(pushGbdbs, self.gbdbPath))
+        self.newSupplemental = set(ucscUtils.printIter(newSupp, self.releasePath))
+        self.newOthers = set(ucscUtils.printIter(additionalList, self.releasePath))
 
         otherprint = len(allOther)
         if otherprint:
             output.append("\n")
             output.append("OTHER FILES:")
             output.append("New: %s" % len(additionalList))
             output.append("Revoked/Replace: %s" % len(removedOther))
             output.append("Total: %s" % len(allOther))
         if otherprint and not args['summary']:
             output.append("")
             output.append("New Other Files (%s):" % len(additionalList))
             output.extend(sorted(list(self.newOthers)))
             output.append("")
             output.append("Revoked Other Files (%s):" % len(removedOther))
-            output.extend(self.__printIter((removedOther), self.releasePath))
+            output.extend(ucscUtils.printIter((removedOther), self.releasePath))
         output.append("\n")
 
         if len(missingFiles):
             output.append("Files that dropped between releases (%s):" % len(missingFiles))
-            output.extend(self.__printIter(missingFiles, self.releasePath))
+            output.extend(ucscUtils.printIter(missingFiles, self.releasePath))
             output.append("\n")
 
         if not args['ignore']:
             output.append("No Errors")
         return output
 
     def __printSectionOne(self, output, set, title):
         output = []
         if set:
             output.append("%s (%s):" % (title, len(set)))
             output.extend(sorted(list(set)))
             output.append("\n")
         return output
 
     def printReportOne(self, args, c):
         (totalFiles, revokedFiles, newGbdbSet, revokedGbdbs, newTableSet, revokedTables, additionalList, atticSet, newSupplementalSet, tableSize) = (self.totalFiles, self.revokedFiles, self.newGbdbSet, self.revokedGbdbs, self.newTableSet, self.revokedTableSet, self.additionalList, self.atticSet, self.newSupplementalSet, self.tableSize)
         output = []
         newTables = newTableSet - revokedTables
         newFiles = totalFiles - revokedFiles
         newGbdbs = newGbdbSet - revokedGbdbs
 
         output.extend(self.__qaHeader(output, newTables, newFiles, newGbdbSet, newSupplementalSet, additionalList, revokedTables, revokedFiles, revokedGbdbs, totalFiles, newGbdbSet, args, c))
         self.newTables = set(newTables)
-        self.newFiles = set(self.__printIter(newFiles, self.releasePath))
-        self.newGbdbs = set(self.__printIter(newGbdbs, self.releasePath))
-        self.newSupplemental = set(self.__printIter(newSupplementalSet, self.releasePath))
-        self.newOthers = set(self.__printIter(additionalList, self.releasePath))
+        self.newFiles = set(ucscUtils.printIter(newFiles, self.releasePath))
+        self.newGbdbs = set(ucscUtils.printIter(newGbdbs, self.releasePath))
+        self.newSupplemental = set(ucscUtils.printIter(newSupplementalSet, self.releasePath))
+        self.newOthers = set(ucscUtils.printIter(additionalList, self.releasePath))
 
         if not args['summary']:
             output.append("")
             output.extend(self.__printSectionOne(output, self.newTables, "New Tables"))
             output.extend(self.__printSectionOne(output, self.newFiles, "New Download Files"))
             output.extend(self.__printSectionOne(output, self.newGbdbs, "New Gbdb Files"))
             output.extend(self.__printSectionOne(output, self.newSupplemental, "New Supplemental Files"))
             output.extend(self.__printSectionOne(output, self.newOthers, "New Other Files"))
-            output.extend(self.__printSectionOne(output, self.__printIter(revokedTables, 0), "Revoked Tables"))
-            output.extend(self.__printSectionOne(output, self.__printIter(revokedFiles, self.releasePath), "Revoked Files"))
-            output.extend(self.__printSectionOne(output, self.__printIter(revokedGbdbs, self.gbdbPath), "Revoked Gbdbs"))
+            output.extend(self.__printSectionOne(output, ucscUtils.printIter(revokedTables, 0), "Revoked Tables"))
+            output.extend(self.__printSectionOne(output, ucscUtils.printIter(revokedFiles, self.releasePath), "Revoked Files"))
+            output.extend(self.__printSectionOne(output, ucscUtils.printIter(revokedGbdbs, self.gbdbPath), "Revoked Gbdbs"))
         if not args['ignore']:
             output.append("No Errors")
         return output
 
     def printErrors(self, errors):
         errorsDict = {}
         output = []
         for i in errors:
             line = i.split(":", 1)
             try:
                 errorsDict[line[0]].append(line[1])
             except:
                 errorsDict[line[0]] = []
                 errorsDict[line[0]].append(line[1])
         output.append("Errors (%s):" % len(errors))