cb3e2f4cfd186807136d469b7221b258896b749c wong Thu Oct 27 16:34:32 2011 -0700 pulled out functions that may be useful outside of mkChangeNotes diff --git python/lib/ucscgenomics/mkChangeNotes.py python/lib/ucscgenomics/mkChangeNotes.py index 3c21bb8..0a52704 100644 --- python/lib/ucscgenomics/mkChangeNotes.py +++ python/lib/ucscgenomics/mkChangeNotes.py @@ -1,18 +1,18 @@ #!/hive/groups/encode/dcc/bin/python import sys, os, re, argparse, subprocess, math -from ucscgenomics import ra, track, qa +from ucscgenomics import ra, track, qa, ucscUtils class makeNotes(object): def checkMetaDbForFiles(self, status, state): if state == 'new': (mdb, files, loose) = (self.newMdb, self.newReleaseFiles, self.loose) elif state == 'old': (mdb, files, loose) = (self.oldMdb, self.oldReleaseFiles, self.loose) errors = [] revokedset = set() revokedfiles = set() atticset = set() supplementalset = set() filtermdb = ra.RaFile() @@ -111,37 +111,31 @@ if missingTableNames: for i in missingTableNames: errors.append("table: %s is type obj, but missing tableName field called by %s" % (i, status)) missingFromDb = mdbtableset - sqltableset if missingFromDb: for i in missingFromDb: errors.append("table: %s table not found in Db called by %s" % (i, status)) return (mdbtableset, revokedtableset, errors) def __checkGbdbFileStatus(self, i, set, errors, state): filelist = i['fileName'].split(',') for j in filelist: - if os.path.isfile("%s/%s" % (self.gbdbPath, j)): - set.add(j) - else: - cmd = "hgsql %s -e \"select fileName from (%s)\"" % (self.database, i['tableName']) - p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True) - cmdoutput = p.stdout.read() - if os.path.isfile(cmdoutput.split("\n")[1]): + if ucscUtils.isGbdbFile(j, i['tableName'], self.database): set.add(j) else: errors.append("gbdb: %s%s does not exist in %s" % (state, j, self.gbdbPath)) return set, errors def getGbdbFiles(self, state): revokedset = set() if state == 'new': (tableset, revokedset, mdb) = (self.newTableSet, self.revokedSet, self.newMdb) elif state == 'old': (tableset, mdb) = (self.oldTableSet, self.oldMdb) errors = [] gbdbtableset = qa.getGbdbTables(self.database, tableset) @@ -187,45 +181,30 @@ def __checkMd5sums(self): (newfiles, oldfiles, loose) = (self.newReleaseFiles, self.oldReleaseFiles, self.loose) errors = [] for i in oldfiles: if i not in newfiles: pass elif re.match('wgEncode.*', i): if oldfiles[i].md5sum != newfiles[i].md5sum: errors.append("file: %s have changed md5sums between releases. %s vs %s" % (i, oldfiles[i].md5sum, newfiles[i].md5sum)) if loose: return list() else: return errors - def __makeFileSizes(self, args, inlist): - checklist = list() - - for i in inlist: - checklist.append("%s/%s" % (self.releasePath, i)) - - filesizes = 0 - for i in checklist: - realpath = os.path.realpath(i) - filesizes = filesizes + int(os.path.getsize(realpath)) - - filesizes = math.ceil(float(filesizes) / (1024**2)) - - return int(filesizes) - def __cleanSpecialFiles(self, inlist): specialRemoveList = ['md5sum.history'] for i in specialRemoveList: if i in inlist: inlist.remove(i) return(inlist) def __separateOutAdditional(self): (oldReleaseFiles, totalFiles, newSupplementalSet, oldSupplementalSet) = (self.oldTotalFiles, self.totalFiles, self.newSupplementalSet, self.oldSupplementalSet) additionalList = set() oldAdditionalList = set() newTotal = set() newOld = set() for i in totalFiles: @@ -238,40 +217,30 @@ for i in oldReleaseFiles: if not re.match('wgEncode.*', i): if i in totalFiles: pass elif i in newSupplementalSet: continue else: oldAdditionalList.add(i) else: newOld.add(i) oldReleaseFiles = newOld return(newOld, additionalList, oldAdditionalList, newTotal) - def __printIter(self, set, path): - output = [] - for i in sorted(set): - if path: - output.append("%s/%s" % (path, i)) - else: - output.append("%s" % (i)) - return output - - def __printSize(self, size, output, totalsize, type): sizeGb = int(size/1024) if sizeGb > 1: output.append("%s: %d MB (%d GB)" % (type, size, sizeGb)) else: output.append("%s: %d MB" % (type, size)) totalsize = totalsize + size return (output, totalsize) def __printSection(self, new, untouched, revoked, all, title, path, summary): output = [] removeline = "Revoked/Replaced/Renamed" @@ -294,37 +263,37 @@ output.append("New: %s" % len(new)) output.append("Untouched: %s" % len(untouched)) output.append("%s: %s" % (removeline, len(revoked))) output.append("New + Untouched: %s" % len(new | untouched)) output.append("%s: %s" % (totaline, len(all))) intersect = new & revoked if intersect: output.append("") output.append("These %s objects exist in both new and revoked %s:" % (len(intersect), title)) for i in intersect: output.append("%s" % i) if all and not summary: output.append("") output.append("New %s (%s):" % (title.title(), len(new))) - output.extend(self.__printIter(new, path)) + output.extend(ucscUtils.printIter(new, path)) output.append("") output.append("Untouched %s (%s):" % (title.title(), len(untouched))) - output.extend(self.__printIter(untouched, path)) + output.extend(ucscUtils.printIter(untouched, path)) output.append("") output.append("%s %s (%s):" % (removeline, title.title(), len(revoked))) - output.extend(self.__printIter(revoked, path)) + output.extend(ucscUtils.printIter(revoked, path)) return output def __qaHeader(self, output, newTableSet, filesNoRevoke, newGbdbSet, newSupp, additionalList, revokedTables, revokedFiles, revokedGbdbs, pushFiles, pushGbdbs, args, c): output = [] tableSize = self.__getTableSize() output.append("mkChangeNotes v2") title = "%s %s Release %s" % (args['database'], args['composite'], args['releaseNew']) if args['releaseOld'] != "solo": title = title + " vs Release %s" % args['releaseOld'] if args['summary']: title = "Summary for " + title output.append(title) output.append("") @@ -333,34 +302,34 @@ output.append("Files: %d" % int(len(filesNoRevoke))) output.append("Gbdbs: %d" % int(len(newGbdbSet))) output.append("Supplemental: %d" % int(len(newSupp))) output.append("Other: %d" % int(len(additionalList))) output.append("") output.append("REVOKED:") output.append("Tables: %s" % len(revokedTables)) output.append("Files: %s" % len(revokedFiles)) output.append("Gbdbs: %s" % len(revokedGbdbs)) output.append("") output.append("Sizes of New:") totalsize = 0 (output, totalsize) = self.__printSize(tableSize, output, totalsize, "Table") - (output, totalsize) = self.__printSize(int(self.__makeFileSizes(args, pushFiles)), output, totalsize, "Files") - (output, totalsize) = self.__printSize(int(self.__makeFileSizes(args, pushGbdbs)), output, totalsize, "Gbdbs") - (output, totalsize) = self.__printSize(int(self.__makeFileSizes(args, newSupp)), output, totalsize, "Supplemental") - (output, totalsize) = self.__printSize(int(self.__makeFileSizes(args, additionalList)), output, totalsize, "Other") + (output, totalsize) = self.__printSize(int(ucscUtils.makeFileSizes(pushFiles, self.releasePath)), output, totalsize, "Files") + (output, totalsize) = self.__printSize(int(ucscUtils.makeFileSizes(pushGbdbs, self.releasePath)), output, totalsize, "Gbdbs") + (output, totalsize) = self.__printSize(int(ucscUtils.makeFileSizes(newSupp, self.releasePath)), output, totalsize, "Supplemental") + (output, totalsize) = self.__printSize(int(ucscUtils.makeFileSizes(additionalList, self.releasePath)), output, totalsize, "Other") (output, totalsize) = self.__printSize(totalsize, output, 0, "Total") return output def printReport(self, args, c): (totalFiles, newGbdbSet, newTableSet, additionalList, oldAdditionalList, oldTableSet, oldReleaseFiles, oldGbdbSet, atticSet, revokedFiles, revokedTableSet, revokedGbdbs, missingFiles, newSupplementalSet, oldSupplementalSet) = (self.totalFiles, self.newGbdbSet, self.newTableSet, self.additionalList, self.oldAdditionalList, self.oldTableSet, self.oldTotalFiles, self.oldGbdbSet, self.atticSet, self.revokedFiles, self.revokedTableSet, self.revokedGbdbs, self.missingFiles, self.newSupplementalSet, self.oldSupplementalSet) #the groups here need to be predefined, I just copied and pasted after working out what they were sep = "\n" output = [] pushTables = set(sorted((self.newTableSet - self.oldTableSet))) pushFiles = set(sorted((self.totalFiles - self.oldTotalFiles))) pushGbdbs = set(sorted((self.newGbdbSet - self.oldGbdbSet))) filesNoRevoke = totalFiles - revokedFiles allTables = newTableSet | oldTableSet | revokedTableSet untouchedTables = oldTableSet & newTableSet @@ -373,92 +342,92 @@ newSupp = newSupplementalSet - oldSupplementalSet removedSupp = oldSupplementalSet - newSupplementalSet untouchedSupp = oldSupplementalSet & newSupplementalSet allOther = additionalList | oldAdditionalList removedOther = oldAdditionalList - additionalList output.extend(self.__qaHeader(output, newTableSet, filesNoRevoke, newGbdbSet, newSupp, additionalList, revokedTableSet, revokedFiles, revokedGbdbs, pushFiles, pushGbdbs, args, c)) output.extend(self.__printSection(pushTables, untouchedTables, revokedTableSet, allTables, "tables", 0, args['summary'])) output.extend(self.__printSection(pushFiles, untouchedFiles, revokedFiles, allFiles, "download", self.releasePath, args['summary'])) output.extend(self.__printSection(pushGbdbs, untouchedGbdbs, revokedGbdbs, allGbdbs, "gbdbs", self.gbdbPath, args['summary'])) output.extend(self.__printSection(newSupp, untouchedSupp, removedSupp, allSupp, "supplemental", self.releasePath, args['summary'])) self.newTables = set(pushTables) - self.newFiles = set(self.__printIter(pushFiles, self.releasePath)) - self.newGbdbs = set(self.__printIter(pushGbdbs, self.gbdbPath)) - self.newSupplemental = set(self.__printIter(newSupp, self.releasePath)) - self.newOthers = set(self.__printIter(additionalList, self.releasePath)) + self.newFiles = set(ucscUtils.printIter(pushFiles, self.releasePath)) + self.newGbdbs = set(ucscUtils.printIter(pushGbdbs, self.gbdbPath)) + self.newSupplemental = set(ucscUtils.printIter(newSupp, self.releasePath)) + self.newOthers = set(ucscUtils.printIter(additionalList, self.releasePath)) otherprint = len(allOther) if otherprint: output.append("\n") output.append("OTHER FILES:") output.append("New: %s" % len(additionalList)) output.append("Revoked/Replace: %s" % len(removedOther)) output.append("Total: %s" % len(allOther)) if otherprint and not args['summary']: output.append("") output.append("New Other Files (%s):" % len(additionalList)) output.extend(sorted(list(self.newOthers))) output.append("") output.append("Revoked Other Files (%s):" % len(removedOther)) - output.extend(self.__printIter((removedOther), self.releasePath)) + output.extend(ucscUtils.printIter((removedOther), self.releasePath)) output.append("\n") if len(missingFiles): output.append("Files that dropped between releases (%s):" % len(missingFiles)) - output.extend(self.__printIter(missingFiles, self.releasePath)) + output.extend(ucscUtils.printIter(missingFiles, self.releasePath)) output.append("\n") if not args['ignore']: output.append("No Errors") return output def __printSectionOne(self, output, set, title): output = [] if set: output.append("%s (%s):" % (title, len(set))) output.extend(sorted(list(set))) output.append("\n") return output def printReportOne(self, args, c): (totalFiles, revokedFiles, newGbdbSet, revokedGbdbs, newTableSet, revokedTables, additionalList, atticSet, newSupplementalSet, tableSize) = (self.totalFiles, self.revokedFiles, self.newGbdbSet, self.revokedGbdbs, self.newTableSet, self.revokedTableSet, self.additionalList, self.atticSet, self.newSupplementalSet, self.tableSize) output = [] newTables = newTableSet - revokedTables newFiles = totalFiles - revokedFiles newGbdbs = newGbdbSet - revokedGbdbs output.extend(self.__qaHeader(output, newTables, newFiles, newGbdbSet, newSupplementalSet, additionalList, revokedTables, revokedFiles, revokedGbdbs, totalFiles, newGbdbSet, args, c)) self.newTables = set(newTables) - self.newFiles = set(self.__printIter(newFiles, self.releasePath)) - self.newGbdbs = set(self.__printIter(newGbdbs, self.releasePath)) - self.newSupplemental = set(self.__printIter(newSupplementalSet, self.releasePath)) - self.newOthers = set(self.__printIter(additionalList, self.releasePath)) + self.newFiles = set(ucscUtils.printIter(newFiles, self.releasePath)) + self.newGbdbs = set(ucscUtils.printIter(newGbdbs, self.releasePath)) + self.newSupplemental = set(ucscUtils.printIter(newSupplementalSet, self.releasePath)) + self.newOthers = set(ucscUtils.printIter(additionalList, self.releasePath)) if not args['summary']: output.append("") output.extend(self.__printSectionOne(output, self.newTables, "New Tables")) output.extend(self.__printSectionOne(output, self.newFiles, "New Download Files")) output.extend(self.__printSectionOne(output, self.newGbdbs, "New Gbdb Files")) output.extend(self.__printSectionOne(output, self.newSupplemental, "New Supplemental Files")) output.extend(self.__printSectionOne(output, self.newOthers, "New Other Files")) - output.extend(self.__printSectionOne(output, self.__printIter(revokedTables, 0), "Revoked Tables")) - output.extend(self.__printSectionOne(output, self.__printIter(revokedFiles, self.releasePath), "Revoked Files")) - output.extend(self.__printSectionOne(output, self.__printIter(revokedGbdbs, self.gbdbPath), "Revoked Gbdbs")) + output.extend(self.__printSectionOne(output, ucscUtils.printIter(revokedTables, 0), "Revoked Tables")) + output.extend(self.__printSectionOne(output, ucscUtils.printIter(revokedFiles, self.releasePath), "Revoked Files")) + output.extend(self.__printSectionOne(output, ucscUtils.printIter(revokedGbdbs, self.gbdbPath), "Revoked Gbdbs")) if not args['ignore']: output.append("No Errors") return output def printErrors(self, errors): errorsDict = {} output = [] for i in errors: line = i.split(":", 1) try: errorsDict[line[0]].append(line[1]) except: errorsDict[line[0]] = [] errorsDict[line[0]].append(line[1]) output.append("Errors (%s):" % len(errors))