ca16d10bd54612cfac0d53560485b670c08ff038 wong Fri Oct 28 14:00:03 2011 -0700 changed output a little, added in some some printing functions, prints out a missing tables list now also diff --git python/lib/ucscgenomics/mkChangeNotes.py python/lib/ucscgenomics/mkChangeNotes.py index 0a52704..8a9a3ba 100644 --- python/lib/ucscgenomics/mkChangeNotes.py +++ python/lib/ucscgenomics/mkChangeNotes.py @@ -94,46 +94,49 @@ mdbobjectset = set(mdb.filter(lambda s: s['objType'] == 'table' and 'tableName' in s and 'attic' not in s, lambda s: s['metaObject'])) - revokedset mdbtableset = set(mdb.filter(lambda s: s['metaObject'] in mdbobjectset, lambda s: s['tableName'])) revokedtableset = set(mdb.filter(lambda s: s['metaObject'] in revokedset, lambda s: s['tableName'])) sep = "','" tablestr = sep.join(mdbtableset) tablestr = "'" + tablestr + "'" #this should really be using python's database module, but I'd need admin access to install it #at this point, I am just parsing the output form hgsql cmd = "hgsql %s -e \"select table_name from information_schema.TABLES where table_name in (%s)\"" % (database, tablestr) p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True) cmdoutput = p.stdout.read() sqltableset = set(cmdoutput.split("\n")[1:-1]) - missingTableNames = set(mdb.filter(lambda s: s['objType'] == 'table' and 'tableName' not in s and 'attic' not in s, lambda s: s['metaObject'])) if missingTableNames: for i in missingTableNames: errors.append("table: %s is type obj, but missing tableName field called by %s" % (i, status)) missingFromDb = mdbtableset - sqltableset if missingFromDb: for i in missingFromDb: errors.append("table: %s table not found in Db called by %s" % (i, status)) - return (mdbtableset, revokedtableset, errors) + return (mdbtableset, revokedtableset, missingFromDb, errors) def __checkGbdbFileStatus(self, i, set, errors, state): filelist = i['fileName'].split(',') + #preprocess filelist, delete after bai in mdb issue is revoled + #print filelist[0] + if re.match('\S+.bam', filelist[0]) and filelist[0] in self.oldReleaseFiles and (filelist[0] + '.bai') not in filelist: + filelist.append(filelist[0] + '.bai') for j in filelist: if ucscUtils.isGbdbFile(j, i['tableName'], self.database): set.add(j) else: errors.append("gbdb: %s%s does not exist in %s" % (state, j, self.gbdbPath)) return set, errors def getGbdbFiles(self, state): revokedset = set() if state == 'new': (tableset, revokedset, mdb) = (self.newTableSet, self.revokedSet, self.newMdb) elif state == 'old': (tableset, mdb) = (self.oldTableSet, self.oldMdb) errors = [] @@ -142,30 +145,31 @@ revokedtableset = qa.getGbdbTables(self.database, revokedset) filestanzas = mdb.filter(lambda s: s['tableName'] in gbdbtableset, lambda s: s) revokedstanzas = mdb.filter(lambda s: s['tableName'] in revokedtableset, lambda s: s) gbdbfileset = set() revokedfileset = set() for i in filestanzas: (gbdbfileset, errors) = self.__checkGbdbFileStatus(i, gbdbfileset, errors, "") for i in revokedstanzas: (revokedfileset, errors) = self.__checkGbdbFileStatus(i, revokedfileset, errors, "revoked gbdb ") + return (gbdbfileset, revokedfileset, errors) def __getTableSize(self): (mdbtableset, database) = (self.newTableSet, self.database) tablesize = float(0) tablelist = list() for i in mdbtableset: tablelist.append("table_name = '%s'" % i) orsep = " OR " orstr = orsep.join(tablelist) cmd = "hgsql %s -e \"SELECT ROUND(data_length/1024/1024,2) total_size_mb, ROUND(index_length/1024/1024,2) total_index_size_mb FROM information_schema.TABLES WHERE %s\"" % (database, orstr) @@ -310,246 +314,296 @@ output.append("Gbdbs: %s" % len(revokedGbdbs)) output.append("") output.append("Sizes of New:") totalsize = 0 (output, totalsize) = self.__printSize(tableSize, output, totalsize, "Table") (output, totalsize) = self.__printSize(int(ucscUtils.makeFileSizes(pushFiles, self.releasePath)), output, totalsize, "Files") (output, totalsize) = self.__printSize(int(ucscUtils.makeFileSizes(pushGbdbs, self.releasePath)), output, totalsize, "Gbdbs") (output, totalsize) = self.__printSize(int(ucscUtils.makeFileSizes(newSupp, self.releasePath)), output, totalsize, "Supplemental") (output, totalsize) = self.__printSize(int(ucscUtils.makeFileSizes(additionalList, self.releasePath)), output, totalsize, "Other") (output, totalsize) = self.__printSize(totalsize, output, 0, "Total") return output + def __addMissingToReport(self, missing, type, path=None): + output = [] + if missing: + output.append("%s that dropped between releases (%s):" % (type, len(missing))) + output.extend(ucscUtils.printIter(missing, path)) + output.append("\n") + return output + + def __checkAtticNotInTrackDb(self): + errors = [] + atticTables = self.newMdb.filter(lambda s: s['objType'] == 'table' and 'attic' in s, lambda s: s['tableName']) + for i in atticTables: + foo = self.trackDb.filter(lambda s: i in s['track'], lambda s: s['track']) + if foo: + errors.append("trackDb: %s is attic in metaDb, has an active trackDb entry" % i) + + return errors + def printReport(self, args, c): (totalFiles, newGbdbSet, newTableSet, additionalList, oldAdditionalList, oldTableSet, oldReleaseFiles, oldGbdbSet, atticSet, revokedFiles, revokedTableSet, revokedGbdbs, missingFiles, newSupplementalSet, oldSupplementalSet) = (self.totalFiles, self.newGbdbSet, self.newTableSet, self.additionalList, self.oldAdditionalList, self.oldTableSet, self.oldTotalFiles, self.oldGbdbSet, self.atticSet, self.revokedFiles, self.revokedTableSet, self.revokedGbdbs, self.missingFiles, self.newSupplementalSet, self.oldSupplementalSet) #the groups here need to be predefined, I just copied and pasted after working out what they were sep = "\n" output = [] - pushTables = set(sorted((self.newTableSet - self.oldTableSet))) - pushFiles = set(sorted((self.totalFiles - self.oldTotalFiles))) - pushGbdbs = set(sorted((self.newGbdbSet - self.oldGbdbSet))) - filesNoRevoke = totalFiles - revokedFiles + + #maths allTables = newTableSet | oldTableSet | revokedTableSet + pushTables = set(sorted((self.newTableSet - self.oldTableSet))) untouchedTables = oldTableSet & newTableSet + allFiles = totalFiles | oldReleaseFiles | revokedFiles + pushFiles = set(sorted((self.totalFiles - self.oldTotalFiles))) newFiles = pushFiles - revokedFiles untouchedFiles = (totalFiles & oldReleaseFiles) - revokedFiles + filesNoRevoke = totalFiles - revokedFiles + allGbdbs = newGbdbSet | oldGbdbSet | revokedGbdbs untouchedGbdbs = (newGbdbSet & oldGbdbSet) - revokedGbdbs + pushGbdbs = set(sorted((self.newGbdbSet - self.oldGbdbSet))) + allSupp = newSupplementalSet | oldSupplementalSet newSupp = newSupplementalSet - oldSupplementalSet removedSupp = oldSupplementalSet - newSupplementalSet untouchedSupp = oldSupplementalSet & newSupplementalSet + allOther = additionalList | oldAdditionalList removedOther = oldAdditionalList - additionalList - output.extend(self.__qaHeader(output, newTableSet, filesNoRevoke, newGbdbSet, newSupp, additionalList, revokedTableSet, revokedFiles, revokedGbdbs, pushFiles, pushGbdbs, args, c)) output.extend(self.__printSection(pushTables, untouchedTables, revokedTableSet, allTables, "tables", 0, args['summary'])) output.extend(self.__printSection(pushFiles, untouchedFiles, revokedFiles, allFiles, "download", self.releasePath, args['summary'])) output.extend(self.__printSection(pushGbdbs, untouchedGbdbs, revokedGbdbs, allGbdbs, "gbdbs", self.gbdbPath, args['summary'])) output.extend(self.__printSection(newSupp, untouchedSupp, removedSupp, allSupp, "supplemental", self.releasePath, args['summary'])) + #These attributes are the critical ones that are used by qaInit, others could potentially use these also. self.newTables = set(pushTables) self.newFiles = set(ucscUtils.printIter(pushFiles, self.releasePath)) self.newGbdbs = set(ucscUtils.printIter(pushGbdbs, self.gbdbPath)) self.newSupplemental = set(ucscUtils.printIter(newSupp, self.releasePath)) self.newOthers = set(ucscUtils.printIter(additionalList, self.releasePath)) otherprint = len(allOther) if otherprint: output.append("\n") output.append("OTHER FILES:") output.append("New: %s" % len(additionalList)) output.append("Revoked/Replace: %s" % len(removedOther)) output.append("Total: %s" % len(allOther)) if otherprint and not args['summary']: output.append("") output.append("New Other Files (%s):" % len(additionalList)) output.extend(sorted(list(self.newOthers))) output.append("") output.append("Revoked Other Files (%s):" % len(removedOther)) output.extend(ucscUtils.printIter((removedOther), self.releasePath)) output.append("\n") - if len(missingFiles): - output.append("Files that dropped between releases (%s):" % len(missingFiles)) - output.extend(ucscUtils.printIter(missingFiles, self.releasePath)) + output.extend(self.__addMissingToReport(missingFiles, "Files", self.releasePath)) output.append("\n") + output.extend(self.__addMissingToReport(self.droppedTables, "Tables")) if not args['ignore']: output.append("No Errors") + else: + output.append("The counts here were generated by ignoring errors, they may not be correct") return output def __printSectionOne(self, output, set, title): output = [] if set: output.append("%s (%s):" % (title, len(set))) output.extend(sorted(list(set))) - output.append("\n") return output def printReportOne(self, args, c): (totalFiles, revokedFiles, newGbdbSet, revokedGbdbs, newTableSet, revokedTables, additionalList, atticSet, newSupplementalSet, tableSize) = (self.totalFiles, self.revokedFiles, self.newGbdbSet, self.revokedGbdbs, self.newTableSet, self.revokedTableSet, self.additionalList, self.atticSet, self.newSupplementalSet, self.tableSize) output = [] newTables = newTableSet - revokedTables newFiles = totalFiles - revokedFiles newGbdbs = newGbdbSet - revokedGbdbs output.extend(self.__qaHeader(output, newTables, newFiles, newGbdbSet, newSupplementalSet, additionalList, revokedTables, revokedFiles, revokedGbdbs, totalFiles, newGbdbSet, args, c)) self.newTables = set(newTables) self.newFiles = set(ucscUtils.printIter(newFiles, self.releasePath)) self.newGbdbs = set(ucscUtils.printIter(newGbdbs, self.releasePath)) self.newSupplemental = set(ucscUtils.printIter(newSupplementalSet, self.releasePath)) self.newOthers = set(ucscUtils.printIter(additionalList, self.releasePath)) if not args['summary']: output.append("") output.extend(self.__printSectionOne(output, self.newTables, "New Tables")) output.extend(self.__printSectionOne(output, self.newFiles, "New Download Files")) output.extend(self.__printSectionOne(output, self.newGbdbs, "New Gbdb Files")) output.extend(self.__printSectionOne(output, self.newSupplemental, "New Supplemental Files")) output.extend(self.__printSectionOne(output, self.newOthers, "New Other Files")) output.extend(self.__printSectionOne(output, ucscUtils.printIter(revokedTables, 0), "Revoked Tables")) output.extend(self.__printSectionOne(output, ucscUtils.printIter(revokedFiles, self.releasePath), "Revoked Files")) output.extend(self.__printSectionOne(output, ucscUtils.printIter(revokedGbdbs, self.gbdbPath), "Revoked Gbdbs")) + if not args['ignore']: output.append("No Errors") + else: + output.append("The counts here were generated by ignoring errors, they may not be correct") return output - def printErrors(self, errors): + def printErrors(self, errors, missingFiles): errorsDict = {} output = [] for i in errors: line = i.split(":", 1) try: errorsDict[line[0]].append(line[1]) except: errorsDict[line[0]] = [] errorsDict[line[0]].append(line[1]) output.append("Errors (%s):" % len(errors)) for i in sorted(errorsDict.keys()): output.append("%s:" % i) for j in sorted(errorsDict[i]): output.append("%s" % j) + output.append("\n") + output.extend(self.__addMissingToReport(missingFiles, "Files", self.releasePath)) + output.append("\n") + output.extend(self.__addMissingToReport(self.droppedTables, "Tables")) return output def __init__(self, args): self.releaseNew = args['releaseNew'] self.releaseOld = args['releaseOld'] self.database = args['database'] self.composite = args['composite'] self.loose = args['loose'] self.ignore = args['ignore'] self.summary = args['summary'] self.specialMdb = args['specialMdb'] self.args = args errors = [] c = track.CompositeTrack(self.database, self.composite, None, self.specialMdb) #sanitize arguments if not self.releaseOld.isdigit(): self.releaseOld = 'solo' elif int(self.releaseOld) <= 0: self.releaseOlf = 'solo' elif self.releaseOld > self.releaseNew: self.releaseOld = 'solo' self.releasePath = c.httpDownloadsPath + 'release' + args['releaseNew'] self.gbdbPath = "/gbdb/%s/bbi" % args['database'] + self.trackDbFile = c.currentTrackDb + if not self.trackDbFile: + errors.append("track: There is no entry in trackDb.wgEncode.ra for %s with the alpha tag" % self.composite) + else: + self.trackDb = ra.RaFile(self.trackDbFile) if int(self.releaseNew) > 1 and str(self.releaseOld) != 'solo': self.newReleaseFiles = c.releases[int(self.releaseNew)-1] self.oldReleaseFiles = c.releases[int(self.releaseOld)-1] self.newMdb = c.alphaMetaDb self.oldMdb = c.publicMetaDb #make a list of missing files self.missingFiles = self.__checkFilesForDropped() + #filter them out of old release files + + + #check if all files listed in release directories have associated metaDb entries (self.newMdb, self.revokedSet, self.revokedFiles, self.atticSet, self.newSupplementalSet, newFileErrors) = self.checkMetaDbForFiles("alpha metaDb", "new") (self.oldMdb, spam, eggs, ham, self.oldSupplementalSet, oldFileErrors) = self.checkMetaDbForFiles("public metaDb", "old") + #check that attic fiels aren't in trackDb + errors.extend(self.__checkAtticNotInTrackDb()) #checks to see that nothing has disappeared between public and alpha errors.extend(self.__checkAlphaForDropped("alpha metaDb", "stanza")) errors.extend(self.__checkMd5sums()) #checks and gets tables that are present, also returns a revoked set of tables for new - (self.newTableSet, self.revokedTableSet, newTableError) = self.checkTableStatus("alpha metaDb", "new") - (self.oldTableSet, spam, oldTableError) = self.checkTableStatus("public metaDb", "old") + (self.newTableSet, self.revokedTableSet, self.newMissingTables, newTableError) = self.checkTableStatus("alpha metaDb", "new") + (self.oldTableSet, spam, self.droppedTables, oldTableError) = self.checkTableStatus("public metaDb", "old") #same as above except for gbdbs (self.newGbdbSet, self.revokedGbdbs, newGbdbError) = self.getGbdbFiles("new") (self.oldGbdbSet, eggs, oldGbdbError) = self.getGbdbFiles("old") + #remove missing files from gbdbs + self.oldGbdbSet = self.oldGbdbSet - self.missingFiles + for i in self.missingFiles: + if i in self.oldReleaseFiles: + del self.oldReleaseFiles[i] #fill in the errors errors.extend(newFileErrors) errors.extend(oldFileErrors) errors.extend(newTableError) errors.extend(oldTableError) errors.extend(newGbdbError) errors.extend(oldGbdbError) #for ease of typing totalFiles = set(self.newReleaseFiles) oldTotalFiles = set(self.oldReleaseFiles) #these could honestly be moved earlier, get a file list processing section or something #they clean out special fiels out and separated the master fiels list into the 3 required #ones: wgEncode, supplemental and additional. self.totalFiles = self.__cleanSpecialFiles(totalFiles) self.oldTotalFiles = self.__cleanSpecialFiles(oldTotalFiles) (self.oldTotalFiles, self.additionalList, self.oldAdditionalList, self.totalFiles) = self.__separateOutAdditional() #get the stuff you need to push, also table sizes self.errors = errors #don't output.append(report unless ignore option is on or no errors if (not errors) or self.ignore: self.output = self.printReport(args, c) else: - self.output = self.printErrors(errors) + self.output = self.printErrors(errors, self.missingFiles) elif self.releaseOld == 'solo': self.newReleaseFiles = c.releases[int(self.releaseNew)-1] self.newMdb = c.alphaMetaDb + #check that attic fiels aren't in trackDb + errors.extend(self.__checkAtticNotInTrackDb()) + (self.newMdb, self.revokedSet, self.revokedFiles, self.atticSet, self.newSupplementalSet, newFileErrors) = self.checkMetaDbForFiles("alpha metaDb", "new") - (self.newTableSet, self.revokedTableSet, newTableError) = self.checkTableStatus("alpha metaDb", "new") + (self.newTableSet, self.revokedTableSet, spam, newTableError) = self.checkTableStatus("alpha metaDb", "new") self.tableSize = self.__getTableSize() (self.newGbdbSet, self.revokedGbdbs, newGbdbError) = self.getGbdbFiles("new") #collect errors errors.extend(newFileErrors) errors.extend(newTableError) errors.extend(newGbdbError) #set for easy operations totalFiles = set(self.newReleaseFiles) #clean out special fiels we don't push i.e. md5sum.history self.totalFiles = self.__cleanSpecialFiles(totalFiles) #makes list for additional files (self.oldTotalFiles, self.oldSupplementalSet) = (set(), set()) (self.oldReleaseFiles, self.additionalList, self.oldAdditionalList, self.totalFiles) = self.__separateOutAdditional() self.errors = errors if (not errors) or self.ignore: self.output = self.printReportOne(args, c) else: - self.output = self.printErrors(errors) + self.output = self.printErrors(errors, self.missingFiles)