8b09636a6001792a6dca8135057806bf5ad88d43 wong Mon Oct 17 12:12:30 2011 -0700 some minor corrections and fixed a bug for calculating revoked files, pigeon hole expansion problem diff --git python/programs/mkChangeNotes/mkChangeNotes python/programs/mkChangeNotes/mkChangeNotes index dd0c59f..e0e0576 100755 --- python/programs/mkChangeNotes/mkChangeNotes +++ python/programs/mkChangeNotes/mkChangeNotes @@ -1,49 +1,54 @@ #!/hive/groups/encode/dcc/bin/python import sys, os, re, argparse, subprocess, math from ucscgenomics import ra, track def checkMetaDbForFiles(mdb, files, status, loose): errors = [] revokedset = set() + revokedfiles = set() atticset = set() supplementalset = set() filtermdb = ra.RaFile() + for i in files: if re.match('.\/', i): supplementalset.add(i) if not re.match('wgEncode.*', i): continue + + #this needs to be rewritten to handle V2's and things like that filestanza = mdb.filter(lambda s: re.match(".*%s.*" % i,s['fileName']), lambda s: s) #should only return 1, just in case if filestanza: - for i in filestanza: - filtermdb[i.name] = i - if 'objStatus' in i and re.search('revoked|replaced|renamed', i['objStatus']): - revokedset.add(i.name) - if 'attic' in i: - atticset.add(i.name) + for j in filestanza: + filtermdb[j.name] = j + if 'objStatus' in j and re.search('revoked|replaced|renamed', j['objStatus']): + revokedfiles.add(i) + revokedset.add(j.name) + if 'attic' in j: + atticset.add(j.name) else: #pass if loose and re.match('.*bai', i): pass else: errors.append("metaDb: %s has is not mentioned in %s" % (i, status)) - return (filtermdb, revokedset, atticset, supplementalset, errors) + return (filtermdb, revokedset, revokedfiles, atticset, supplementalset, errors) def checkAlphaForDropped(new, old, status, type): errors=[] diff = set(old) -set(new) for i in diff: errors.append("%s: %s missing from %s" % (type, i, status)) return errors def checkFilesForDropped(new, old): diff = set(old) - set(new) return diff def checkTableStatus(mdb, files, database, composite, status, loose, revokedset): @@ -102,32 +107,32 @@ errors.append("table: %s is type obj, but missing tableName field called by %s" % (i, status)) if missingFromDb: for i in missingFromDb: errors.append("table: %s table not found in Db called by %s" % (i, status)) return (mdbtableset, revokedtableset, errors) def getGbdbFiles(database, tableset, revokedset, mdb): errors = [] sep = "','" tablestr = sep.join(tableset) tablestr = "'" + tablestr + "'" - revokestr = sep.join(tableset) - revokestr = "'" + tablestr + "'" + revokestr = sep.join(revokedset) + revokestr = "'" + revokestr + "'" cmd = "hgsql %s -e \"select table_name from information_schema.columns where table_name in (%s) and column_name = 'fileName'\"" % (database, tablestr) p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True) output = p.stdout.read() gbdbtableset = set(output.split("\n")[1:]) cmd = "hgsql %s -e \"select table_name from information_schema.columns where table_name in (%s) and column_name = 'fileName'\"" % (database, revokestr) p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True) output = p.stdout.read() revokedtableset = set(output.split("\n")[1:]) file1stanzalist = mdb.filter(lambda s: s['tableName'] in gbdbtableset, lambda s: s) revokedstanzalist = mdb.filter(lambda s: s['tableName'] in revokedtableset, lambda s: s) @@ -218,40 +223,30 @@ elif i in newSupplementalSet: continue else: oldAdditionalList.add(i) else: newOld.add(i) oldReleaseFiles = newOld return(newOld, additionalList, oldAdditionalList, newTotal) def printWithPath(set, c, release): for i in sorted(set): print "%s/%s" % (c.downloadsDirectory + 'release' + release, i) -def makeRevokedFiles(revokedSet, mdb): - revokedFiles = set() - for i in revokedSet: - file = mdb.filter(lambda s: re.match(".*%s.*" % i,s['fileName']), lambda s: s['fileName']) - for j in file: - splitfile = j.split(",") - for k in splitfile: - revokedFiles.add(k) - return revokedFiles - def printIter(inlist): for i in sorted(inlist): print i def printReport(args, totalFiles, newGbdbSet, newTableSet, additionalList, oldAdditionalList, pushTables, pushFiles, pushGbdbs, c, oldTableSet, oldReleaseFiles, oldGbdbSet, atticSet, revokedFiles, mdb, revokedTableSet, revokedGbdbs, missingFiles, newSupplementalSet, oldSupplementalSet, tableSize): #the groups here need to be predefined, I just copied and pasted after working out what they were sep = "\n" print "mkChangeNotes v2" print "%s %s Release %s" % (args.database, args.composite, args.releaseNew) print "" print "QA Count Summaries for Release %s:" % args.releaseNew print "Tables: %d" % int(len(newTableSet)) print "Files: %d" % int(len(totalFiles - revokedFiles)) print "Gbdbs: %d" % int(len(newGbdbSet)) print "Supplemental: %d" % int(len(newSupplementalSet - oldSupplementalSet)) @@ -330,47 +325,49 @@ print "Supplemental Files:" print "New: %s" % len(newSupplementalSet - oldSupplementalSet) print "Persisting: %s" % len(oldSupplementalSet & newSupplementalSet) print "Removed: %s" % len(oldSupplementalSet - newSupplementalSet) print "New + Persisting: %s" % len((newSupplementalSet - oldSupplementalSet) | (oldSupplementalSet & newSupplementalSet)) print "Total: %s" % len(newSupplementalSet | oldSupplementalSet) if args.full: print "" print "New Supplemental Files:" printWithPath(newSupplementalSet - oldSupplementalSet, c, args.releaseNew) print "" print "Persisting Supplemental Files:" printWithPath(oldSupplementalSet & newSupplementalSet, c, args.releaseNew) print "" print "Removed Supplemental Files:" - printWithPath(oldSupplementaList - newSupplementalSet, c, args.releaseNew) + printWithPath(oldSupplementalSet - newSupplementalSet, c, args.releaseNew) print "\n" print "OTHER FILES:" print "New: %s" % len(additionalList) print "Revoked/Replace: %s" % len(oldAdditionalList) print "Total: %s" % len(additionalList | oldAdditionalList) - print "\n" if args.full: print "" print "New Other Files (%s):" % len(additionalList) printWithPath(additionalList, c, args.releaseNew) print "" print "Revoked Other Files (%s):" % len(oldAdditionalList) printWithPath(oldAdditionalList, c, args.releaseNew) + print "\n" print "Files that dropped between releases (%s):" % len(missingFiles) printWithPath(missingFiles, c, args.releaseOld) - + print "\n" + if not args.ignore: + print "No Errors" def printReportOne(args, totalFiles, newGbdbSet, newTableSet, additionalList, pushTables, c, pushGbdbs, atticSet, newSupplementalSet, tableSize): print "mkChangeNotes v2" print "%s %s Release %s" % (args.database, args.composite, args.releaseNew) print "" print "QA Count Summaries for Release %s:" % args.releaseNew print "Tables: %d" % int(len(newTableSet)) print "Files: %d" % int(len(totalFiles)) print "Gbdbs: %d" % int(len(newGbdbSet)) print "Supplemental: %d" % int(len(newSupplementalSet)) print "Other: %d" % int(len(additionalList)) print "\n" totalsize = 0; print "Sizes of New:" print "Tables: %d MB" % tableSize @@ -394,30 +391,32 @@ print "New Tables (%s):" % len(pushTables) printIter(pushTables) print "\n" print "New Download Files (%s):" % len(totalFiles) printWithPath(totalFiles, c, args.releaseNew) print "\n" print "New Gbdb Files (%s):" % len(pushGbdbs) printWithPath(pushGbdbs, c, args.releaseNew) print "\n" print "New Supplemental Files:" printWithPath(newSupplementalSet, c, args.releaseNew) print "\n" print "New Other Files (%s):" % len(additionalList) printWithPath(additionalList, c, args.releaseNew) + if not args.ignore: + print "No Errors" def printErrors(errors): errorsDict = {} for i in errors: line = i.split(":", 1) try: errorsDict[line[0]].append(line[1]) except: errorsDict[line[0]] = [] errorsDict[line[0]].append(line[1]) print "Errors (%s):" % len(errors) for i in sorted(errorsDict.keys()): print "%s:" % i for j in sorted(errorsDict[i]): @@ -456,81 +455,80 @@ if args.releaseOld == "-": args.releaseOld = 0 if int(args.releaseOld) > int(args.releaseNew): errors.append("Old Release is higher than New Release") args.releaseOld = args.releaseNew if int(args.releaseNew) > 1: newReleaseFiles = c.releases[int(args.releaseNew)-1] oldReleaseFiles = c.releases[int(args.releaseOld)-1] newMdb = c.alphaMetaDb oldMdb = c.publicMetaDb - (newMdb, revokedSet, atticSet, newSupplementalSet, newFileErrors) = checkMetaDbForFiles(newMdb, newReleaseFiles, "alpha metaDb", loose) + (newMdb, revokedSet, revokedFiles, atticSet, newSupplementalSet, newFileErrors) = checkMetaDbForFiles(newMdb, newReleaseFiles, "alpha metaDb", loose) errors.extend(newFileErrors) - (oldMdb, spam, eggs, oldSupplementalSet, oldFileErrors) = checkMetaDbForFiles(oldMdb, oldReleaseFiles, "public metaDb", loose) + (oldMdb, spam, eggs, ham, oldSupplementalSet, oldFileErrors) = checkMetaDbForFiles(oldMdb, oldReleaseFiles, "public metaDb", loose) errors.extend(oldFileErrors) errors.extend(checkAlphaForDropped(newMdb, oldMdb, "alpha metaDb", "stanza")) missingFiles = checkFilesForDropped(newReleaseFiles, oldReleaseFiles) errors.extend(checkMd5sums(newReleaseFiles, oldReleaseFiles)) (newTableSet, revokedTableSet, newTableError) = checkTableStatus(newMdb, newReleaseFiles, args.database, args.composite, "alpha metaDb", loose, revokedSet) errors.extend(newTableError) (oldTableSet, spam, oldTableError) = checkTableStatus(oldMdb, oldReleaseFiles, args.database, args.composite, "public metaDb", loose, revokedSet) errors.extend(oldTableError) (newGbdbSet, revokedGbdbs, newGbdbError) = getGbdbFiles(args.database, newTableSet, revokedTableSet, newMdb) errors.extend(newGbdbError) (oldGbdbSet, eggs, oldGbdbError) = getGbdbFiles(args.database, oldTableSet, set(), oldMdb) errors.extend(oldGbdbError) droppedTables = oldTableSet - newTableSet if droppedTables: for i in droppedTables: errors.append("table: %s was dropped between releases" % i) totalFiles = set(newReleaseFiles) #these could honestly be moved earlier, get a file list processing section or something totalFiles = cleanSpecialFiles(totalFiles) oldReleaseFiles = cleanSpecialFiles(set(oldReleaseFiles)) (oldReleaseFiles, additionalList, oldAdditionalList, totalFiles) = separateOutAdditional(oldReleaseFiles, totalFiles, newSupplementalSet, oldSupplementalSet) - revokedFiles = makeRevokedFiles(revokedSet, newMdb) pushTables = set(sorted((newTableSet - oldTableSet))) tableSize = getTableSize(pushTables, args.database) pushFiles = set(sorted((totalFiles - oldReleaseFiles))) pushGbdbs = set(sorted((newGbdbSet - oldGbdbSet))) if (not errors) or args.ignore: printReport(args, totalFiles, newGbdbSet, newTableSet, additionalList, oldAdditionalList, pushTables, pushFiles, pushGbdbs, c, oldTableSet, oldReleaseFiles, oldGbdbSet, atticSet, revokedFiles, newMdb, revokedTableSet, revokedGbdbs, missingFiles, newSupplementalSet, oldSupplementalSet, tableSize) else: printErrors(errors) else: args.releaseOld = 0 newReleaseFiles = c.releases[int(args.releaseNew)-1] newMdb = c.alphaMetaDb - (newMdb, revokedSet, atticSet, newSupplementalSet, newFileErrors) = checkMetaDbForFiles(newMdb, newReleaseFiles, "alpha metaDb", loose) + (newMdb, revokedSet, revokedFiles, atticSet, newSupplementalSet, newFileErrors) = checkMetaDbForFiles(newMdb, newReleaseFiles, "alpha metaDb", loose) errors.extend(newFileErrors) (newTableSet, revokedTableSet, newTableError) = checkTableStatus(newMdb, newReleaseFiles, args.database, args.composite, "alpha metaDb", loose, revokedSet) errors.extend(newTableError) tableSize = getTableSize(newTableSet, args.database) (newGbdbSet, revokedGbdbs, newGbdbError) = getGbdbFiles(args.database, newTableSet, revokedTableSet, newMdb) errors.extend(newGbdbError) #set for easy operations totalFiles = set(newReleaseFiles) #clean out special fiels we don't push i.e. md5sum.history totalFiles = cleanSpecialFiles(totalFiles)