ae77effc0be4895682562b78b98514cfca05b918
wong
  Fri Oct 14 02:56:47 2011 -0700
stable version of mkChangeNotes, still very alpha
diff --git python/programs/mkChangeNotes/mkChangeNotes python/programs/mkChangeNotes/mkChangeNotes
index 6115da6..9f7c464 100755
--- python/programs/mkChangeNotes/mkChangeNotes
+++ python/programs/mkChangeNotes/mkChangeNotes
@@ -1,123 +1,413 @@
 #!/hive/groups/encode/dcc/bin/python
-import sys, os, shutil, stat, argparse, datetime
-from ucscgenomics.compositetrack.CompositeTrack import *
-from ucscgenomics.rafile.RaFile import *
-from ucscgenomics.softfile.SoftFile import *
-from ucscgenomics.cvfile.CvFile import *
-
-def diff(list1, list2):
-	return list1.difference(list2), list1.intersection(list2), list2.difference(list1)
-	
-def diff2(dict1, dict2):
-	for file in list1.itervalues():
-		if file.name in list2:
-			if file.md5sum == dict2[file.name].md5sum:
-				same.append(file)
+import sys, os, re, argparse, subprocess
+from ucscgenomics import ra, track
+
+def checkMetaDbForFiles(mdb, files, status, loose):
+    # Returns one "metaDb: ..." error string per wgEncode* file that no stanza's fileName mentions.
+    # status names the metaDb being checked and is only used in the messages;
+    # loose skips files whose name matches '.*bai'.
+    errors = []
+    for i in files:
+        if not re.match('wgEncode.*', i):
+            continue
+        # escape the name so '.' and other regex metacharacters in it only match literally
+        filestanza = mdb.filter(lambda s: re.search(re.escape(i), s['fileName']), lambda s: s)
+        if filestanza:
+            continue
+        if loose and re.match('.*bai', i):
+            continue
+        errors.append("metaDb: %s is not mentioned in %s" % (i, status))
+    return errors
+
+def checkAlphaForDropped(new, old, status, type):
+    # Returns an error string for every wgEncode* entry of old that is absent from new.
+    # type labels the kind of entry ("stanza", "file") and status names where it is
+    # missing from; both are only used in the messages.
+    errors = []
+    for i in old:
+        if re.search('MAGIC', i):
+            # i comes from old, so testing "i in old" was always true; the check has to
+            # look in new to tell whether both sides carry the same MAGIC entry
+            if i in new:
+                errors.append("MAGIC number same in alpha and public metaDb")
+            continue
+        if not re.match('wgEncode.*', i):
+            continue
+        if i not in new:
+            errors.append("%s: %s missing from %s" % (type, i, status))
+    return errors
+
+def checkTableStatus(mdb, files, database, composite, status, loose):
+    errors=[]
+    #home = os.environ['HOME']
+    #dbhost = ''
+    #dbuser = ''
+    #dbpassword = ''
+    #p = re.compile('db.(\S+)=(\S+)')
+    #with open("%s/.hg.conf" % home) as f:
+    #    for line in f:
+    #        line.rstrip("\n\r")
+    #        if p.match(line):
+    #            m = p.match(line)
+    #            if m.groups(1)[0] == 'host':
+    #                dbhost = m.groups(1)[1]
+    #            if m.groups(1)[0] == 'user':
+    #                dbuser = m.groups(1)[1]
+    #            if m.groups(1)[0] == 'password':
+    #                dbpassword = m.groups(1)[1]
+    #print dbhost
+    #print dbuser
+    #print dbpassword
+
+    #db = MySQLdb.connect (host = dbhost,
+    #            user = dbuser,
+    #            passwd = dbpassword,
+    #            db = database)
+
+    #cursor = db.cursor ()
+    #cursor.execute ("show tables like '%s%s'" % (composite, "%"))
+    #tableset = set(cursor.fetchall())
+    
+    mdbtableset = set(mdb.filter(lambda s: s['objType'] == 'table' and 'tableName' in s and 'attic' not in s and s['fileName'].split(",",1)[0] in files, lambda s: s['metaObject']))
+    atticset = set(mdb.filter(lambda s: 'attic' in s, lambda s: s['metaObject']))
+    revokedset = set(mdb.filter(lambda s: re.search('revoked|replaced|renamed', s['objStatus']), lambda s: s['metaObject']))
+    mdbtableset = mdbtableset - revokedset
+
+    sep = "','"
+    tablestr = sep.join(mdbtableset)
+    tablestr = "'" + tablestr + "'"
+
+    #this should really be using python's database module, but I'd need admin access to install it
+    #at this point, I am just parsing the output from hgsql
+    cmd = "hgsql %s -e \"select table_name from information_schema.TABLES where table_name in (%s)\"" % (database, tablestr)
+    p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
+    output = p.stdout.read()
+    
+    sqltableset = set(output.split("\n")[1:])
+
+    missingTableNames = set(mdb.filter(lambda s: s['objType'] == 'table' and 'tableName' not in s and 'attic' not in s, lambda s: s['metaObject']))
+
+    missingFromDb = mdbtableset - sqltableset
+    
+    if missingTableNames:
+        for i in missingTableNames:
+            errors.append("table: %s is type obj, but missing tableName field called by %s" % (i, status))
+
+    if missingFromDb:
+        for i in missingFromDb:
+            errors.append("table: %s table not found in Db called by %s" % (i, status))
+
+    return (mdbtableset, revokedset, atticset, errors)
+
+def getGbdbFiles(database, tableset, mdb):
+    # Returns (gbdbfileset, errors): fileName entries, taken from stanzas whose table has a 'fileName'
+    # column, that exist under /gbdb/<database>/bbi, plus one "gbdb: ..." error per entry that does not.
+    errors = []
+    tablestr = "'" + "','".join(tableset) + "'"
+
+    cmd = "hgsql %s -e \"select table_name from information_schema.columns where table_name in (%s) and column_name = 'fileName'\"" % (database, tablestr)
+    p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
+    output = p.stdout.read()
+    # the first output line is skipped (assumed to be the hgsql column header - TODO confirm)
+    gbdbtableset = set(output.split("\n")[1:])
+    # reuse the set built above instead of re-splitting the output for every stanza
+    file1stanzalist = mdb.filter(lambda s: s['tableName'] in gbdbtableset, lambda s: s)
+    
+    gbdbfileset = set()
+    for i in file1stanzalist:
+        filelist = i['fileName'].split(',')
+        for j in filelist:
+            if os.path.isfile("/gbdb/%s/bbi/%s" % (database, j)):
+                gbdbfileset.add(j)
 			else:
-				new.append(file)
+                errors.append("gbdb: %s does not exist in /gbdb/%s/bbi" % (j, database))
+    
+    return (gbdbfileset, errors)
+
+def checkMd5sums(newfiles, oldfiles):
+    errors = []
+    for i in oldfiles:
+        if i not in newfiles:
+            pass
 		else:
-			dep.append(file)
+            if oldfiles[i].md5sum != newfiles[i].md5sum:
+                errors.append("file: %s have changed md5sums between releases. %s vs %s" % (i, oldfiles[i].md5sum, newfiles[i].md5sum))
+
+    return errors
+
+def makeFileSizes(c, args, pushFiles, pushGbdbs, additionalList):  # combined size, in MB, of every listed name
+    pushFileSize = list()  # full paths, all built under <downloadsDirectory>release<releaseNew>/
+    for i in pushGbdbs:
+        pushFileSize.append("%s/%s" % (c.downloadsDirectory + 'release' + args.releaseNew, i))    
+    for i in pushFiles:
+        pushFileSize.append("%s/%s" % (c.downloadsDirectory + 'release' + args.releaseNew, i))
+    for i in additionalList:
+        pushFileSize.append("%s/%s" % (c.downloadsDirectory + 'release' + args.releaseNew, i))
+    filesizes = 0
+    for i in pushFileSize:
+        filesizes = filesizes + int(os.path.getsize(i))  # os.path.getsize raises OSError if the path is missing
+    filesizes = filesizes / (1024**2)  # bytes -> MB; integer division under Python 2
+        
+    return filesizes
+
+def cleanSpecialFiles(pushFiles, totalFiles):  # removes special bookkeeping names from both collections
+    specialRemoveList = ['md5sum.history']  # names that are stripped wherever they appear
+    for i in specialRemoveList:
+        if i in pushFiles:
+            pushFiles.remove(i)  # mutates the caller's collection in place
+        if i in totalFiles:
+            totalFiles.remove(i)  # mutates the caller's collection in place
+    
+    return(pushFiles, totalFiles)
+    
+def separateOutAdditional(oldReleaseFiles, totalFiles, pushFiles):  # splits non-wgEncode names into their own set
+    additionalList = set()  # every name in totalFiles that does not start with 'wgEncode'
+    for i in totalFiles:
+        if not re.match('wgEncode.*', i):
+            additionalList.add(i)
+    for i in additionalList:
+        if i in pushFiles:
+            pushFiles.remove(i)  # in place: additional files are reported separately from pushFiles
+    oldReleaseFiles = oldReleaseFiles - (oldReleaseFiles - totalFiles)  # keeps only old names still in totalFiles
+
+    return(oldReleaseFiles, pushFiles, additionalList)
+
+def printWithPath(set, c, args):
+    for i in sorted(set):
+        print "%s/%s" % (c.downloadsDirectory + 'release' + args.releaseNew, i)
 	
-	for file in dict2.itervalues():
-		if file.name not in dict1:
-			new.append(file)
+def printReport(args, totalFiles, newGbdbSet, newTableSet, additionalList, pushTables, pushFiles, pushGbdbs, c, oldTableSet, oldReleaseFiles, oldGbdbSet, filesizes, newAtticSet, oldAtticSet, newRevokedSet, oldRevokedSet):
+    revokedSet = ((newRevokedSet | oldRevokedSet) - (oldRevokedSet - newRevokedSet))
+    unrevokedSet = (oldRevokedSet - newRevokedSet)
+    atticSet = ((newAtticSet | oldAtticSet) - (oldAtticSet - newAtticSet))
+    unatticSet = (oldAtticSet - newAtticSet) 
+    sep = "\n"    
+    print "mkChangeNotes v2"
+    print "%s %s Release %s" % (args.database, args.composite, args.releaseNew)
+    print "" 
+    print "Totals:"
+    print "New Files and Gbdbs: %d" % (int(len(additionalList) + int(len(pushFiles)) + int(len(pushGbdbs))))
+    print "Total size of files to be pushed: %d MB" % filesizes
+    print ""
+    print "Total Files: %d" % int(len(totalFiles | oldReleaseFiles))
+    print "Total Gbdbs: %d" % int(len(newGbdbSet | oldGbdbSet))
+    print "Total Tables: %d" % int(len(newTableSet | oldTableSet))
+    print "Other Files: %d" % int(len(additionalList))
+    print "Total Revoked: %d" % int(len(revokedSet))
+    print "Total Unrevoked: %d" % int(len(unrevokedSet))
+    print "Total Attic: %d" % int(len(atticSet))
+    print "Total Un-attic: %d" % int(len(unatticSet))
+    print "\n"
+    print "New Tables (%s):" % len(pushTables)
+    print sep.join(sorted(pushTables))
+    print "\n"
+    print "New Files (%s):" % len(pushFiles)
+    printWithPath(pushFiles, c, args)
+    print "\n"
+    print "New Gbdbs (%s):" % len(pushGbdbs)
+    printWithPath(pushGbdbs, c, args)
+    print "\n"
+    print "Additional Files (%s):" % len(additionalList)
+    printWithPath(additionalList, c, args)
+    print "\n"
+    print "Active Untouched Tables (%s):" % len(oldTableSet)
+    print sep.join(sorted(oldTableSet))
+    print "\n"
+    print "Active Untouched Files (%s):" % len(set(oldReleaseFiles))
+    printWithPath(oldReleaseFiles, c, args)
+    print "\n"
+    print "Active Untouched Gbdbs (%s):" % len(oldGbdbSet)
+    printWithPath(oldGbdbSet, c, args)
+    print "\n"
+    print "Revoked Objects (%s):" % len(revokedSet)
+    for i in sorted(revokedSet):
+        print i
+    print "\n"
+    print "Attic Objects (%s):" % len(atticSet)
+    for i in sorted(atticSet):
+        print i
+    print "\n"
+    print "Unrevoked Objects (%s):" % len(unrevokedSet)
+    for i in sorted(unrevokedSet):
+        print i
+    print "\n"
+    print "Un-attic Objects (%s):" % len(unatticSet)
+    for i in sorted(unatticSet):
+        print i
+    print "\n"
+    print "No Errors"
 			
-	return new, same, dep
+def printReportOne(args, totalFiles, newGbdbSet, newTableSet, additionalList, pushTables, pushFiles, c, pushGbdbs, filesizes, atticSet, revokedSet):
+    print "mkChangeNotes v2"
+    print "%s %s Release %s against Release %s" % (args.database, args.composite, args.releaseNew, args.releaseOld)
+    print "" 
+    print "Totals:"
+    print "Files and Gbdbs: %d" % (int(len(totalFiles)) + int(len(newGbdbSet)))
+    print "Total size of files to be pushed: %d MB" % filesizes
+    print ""
+    print "Files: %d" % int(len(totalFiles))
+    print "Gbdbs: %d" % int(len(newGbdbSet))
+    print "Tables: %d" % int(len(newTableSet))
+    print "Other Files: %d" % int(len(additionalList))
+    print "Total Revoked: %d" % int(len(revokedSet))
+    print "Total Attic: %d" % int(len(atticSet))
+    print "\n"
+    sep = "\n"    
+    print "New Tables (%s):" % len(pushTables)
+    print sep.join(sorted(pushTables))
+    print "\n"
+    print "New Files (%s):" % len(pushFiles)
+    printWithPath(pushFiles, c, args)
+    print "\n"
+    print "New Gbdbs (%s):" % len(pushGbdbs)
+    printWithPath(pushGbdbs, c, args)
+    print "\n"
+    print "Additional Files (%s):" % len(additionalList)
+    printWithPath(additionalList, c, args)
+    print "\n"
+    print "Revoked Objects (%s):" % len(revokedSet)
+    for i in sorted(revokedSet):
+        print i
+    print "\n"
+    print "Attic Objects (%s):" % len(atticSet)
+    for i in sorted(atticSet):
+        print i
+    print "\n"
+    print "No Errors"
+
+def printErrors(errors):
+    # Prints the error count, then the messages grouped under their "category:" prefix, both sorted.
+    errorsDict = {}
+    for i in errors:
+        category, sep, message = i.partition(":")
+        if not sep:
+            # messages without a "category:" prefix (e.g. the release-order check) used to raise IndexError
+            category, message = "general", i
+        errorsDict.setdefault(category, []).append(message)
+    print "Errors (%s):" % len(errors)
+    for i in sorted(errorsDict.keys()):
+        print "%s:" % i
+        for j in sorted(errorsDict[i]):
+            print "%s" % j
 	
 def main():
 
-	parser = argparse.ArgumentParser(description = 'Prepares a submission to GEO. Creates a soft file and shell script with the correct call to aspera.')
+    parser = argparse.ArgumentParser(
+        prog='mkChangeNotes',
+        description='Writes out notes file for packing to QA',
+        epilog='example: encodeMkChange hg19 wgEncodeUwDnase 3 2'
+        )
 	parser.add_argument('-t', '--trackPath', help='Overrides the default track path ~/kent/src/hg/makeDb/trackDb/')
-	parser.add_argument('-i', '--instrument', help='If specified, expIds without instruments listed will default to this value. Use the no-spacing name eg Illumina_GA2')
+    parser.add_argument('-l', '--loose', action="store_true", default=0, help='Loose checking for legacy elements. Will be retired once all tracks go through a release cycle')
+    parser.add_argument('-i', '--ignore', action="store_true", default=0, help='Ignore errors, print out report.')
 	parser.add_argument('database', help='The database, typically hg19 or mm9')
 	parser.add_argument('composite', help='The composite name, wgEncodeCshlLongRnaSeq for instance')
-	parser.add_argument('expIds', nargs='*', help='Any number of expIds separated by spaces, you can also specify a range by using a hyphen, "140 150 160-170" for instance, or leave blank to specify the entire file')
+    parser.add_argument('releaseNew', help='The new release to be released')
+    parser.add_argument('releaseOld', help='The old release that is already released')
 	
 	if len(sys.argv) == 1:
-		parser.print_usage()
+        parser.print_help()
 		return
-	
 	args = parser.parse_args(sys.argv[1:])
+    if not args.releaseNew.isdigit():
+        parser.print_help()
+        return
 		
-	compositeTrack = CompositeTrack(args.database, args.composite, args.trackPath)
 	
-	cvPath = compositeTrack.trackPath + 'cv/alpha/cv.ra'
-	cv = CvFile(cvPath)
+    c = track.CompositeTrack(args.database,args.composite)
 	
-	same = list()
-	new = list()
-	changed = list()
-	dep = list()
+    loose = args.loose
 	
-	r1 = 0
-	r2 = 1
-	print len(compositeTrack.releases)
+    errors = []
 	
-	for file in compositeTrack.releases[r1].itervalues():
-		if file.name in compositeTrack.releases[r2]:
-			if file.md5sum == compositeTrack.releases[r2][file.name].md5sum:
-				same.append(file)
-			else:
-				new.append(file)
-		else:
-			dep.append(file)
+    if args.releaseOld == "-":
+        args.releaseOld = 0
+    if int(args.releaseOld) > int(args.releaseNew):
+        errors.append("Old Release is higher than New Release")
+        args.releaseOld = args.releaseNew
 		
-	for file in compositeTrack.releases[r2].itervalues():
-		if file.name not in compositeTrack.releases[r1]:
-			new.append(file)
+    if int(args.releaseNew) > 1:
 			
-	print 'Same: ' + str(len(same))
-	for file in same:
-		print file.name
+        newReleaseFiles = c.releases[int(args.releaseNew)-1]
+        oldReleaseFiles = c.releases[int(args.releaseOld)-1]
 		
-	print 'New: ' + str(len(new))
-	for file in new:
-		print file.name
+        newMdb = c.alphaMetaDb
+        oldMdb = c.publicMetaDb
 		
-	print 'Deprecated: ' + str(len(dep))
-	for file in dep:
-		print file.name
+        errors.extend(checkMetaDbForFiles(newMdb, newReleaseFiles, "alpha metaDb", loose))    
+        errors.extend(checkMetaDbForFiles(oldMdb, oldReleaseFiles, "public metaDb", loose))
+        errors.extend(checkAlphaForDropped(newMdb, oldMdb, "alpha metaDb", "stanza"))
+        errors.extend(checkAlphaForDropped(newReleaseFiles, oldReleaseFiles, "new release download directory", "file"))
+        errors.extend(checkMd5sums(newReleaseFiles, oldReleaseFiles))
 		
-	alphafiles = compositeTrack.alphaMetaDb.filter(lambda s: 'fileName' in s, lambda s: s['fileName'])
-	publicfiles = compositeTrack.publicMetaDb.filter(lambda s: 'fileName' in s, lambda s: s['fileName'])
+        (newTableSet, newRevokedSet, newAtticSet, newTableError) = checkTableStatus(newMdb, newReleaseFiles, args.database, args.composite, "alpha metaDb", loose)
+        errors.extend(newTableError)
+        (oldTableSet, oldRevokedSet, oldAtticSet, oldTableError) = checkTableStatus(oldMdb, oldReleaseFiles, args.database, args.composite, "public metaDb", loose)
+        errors.extend(oldTableError)
 	
-	sameMdb = list()
-	newMdb = list()
-	depMdb = list()
+        (newGbdbSet, newGbdbError) = getGbdbFiles(args.database, newTableSet, newMdb)
+        errors.extend(newGbdbError)
+        (oldGbdbSet, oldGbdbError) = getGbdbFiles(args.database, oldTableSet, oldMdb)
+        errors.extend(oldGbdbError)
 	
-	for file in alphafiles:
-		if file in publicfiles:
-			sameMdb.append(file)
-		else:
-			depMdb.append(file)
+        droppedTables = oldTableSet - newTableSet
+        if droppedTables:
+            for i in droppedTables:
+                errors.append("table: %s was dropped between releases" % i)
 		
-	for file in publicfiles:
-		if file not in alphafiles:
-			newMdb.append(file)
 			
-	print 'Same Mdb: ' + str(len(same))
-	for file in sameMdb:
-		print file
+#########
+        #some weird suggestion from online about python 3 not being able to compare strings to ints implicitly,
+        #here for future reference in case something breaks
+        #pushTables = sorted((newTableSet - oldTableSet), key=lambda item: (int(item.partition(' ')[0])
+        #           if item[0].isdigit() else float('inf'), item))
+        #pushFiles = sorted((newReleaseFiles - oldReleaseFiles), key=lambda item: (int(item.partition(' ')[0])
+        #           if item[0].isdigit() else float('inf'), item))
+        #pushGbdbs = sorted((newGbdbSet - oldGbdbSet), key=lambda item: (int(item.partition(' ')[0])
+        #           if item[0].isdigit() else float('inf'), item))
+#########
 		
-	print 'New Mdb: ' + str(len(new))
-	for file in newMdb:
-		print file
+        pushTables = sorted((newTableSet - oldTableSet))
+        pushFiles = sorted((set(newReleaseFiles) - set(oldReleaseFiles)))
+        pushGbdbs = sorted((newGbdbSet - oldGbdbSet))
+        totalFiles = set(newReleaseFiles)
 		
-	print 'Deprecated Mdb: ' + str(len(dep))
-	for file in depMdb:
-		print file
+        (pushFiles, totalFiles) = cleanSpecialFiles(pushFiles, totalFiles)
+        (oldReleaseFiles, totalFiles) = cleanSpecialFiles(set(oldReleaseFiles), totalFiles)
+        (oldReleaseFiles, pushFiles, additionalList) = separateOutAdditional(oldReleaseFiles, totalFiles, pushFiles)
 		
-	onlyAlphaMdb = list()
-	bothAlpha = list()
-	onlyAlphaD = list()
+        filesizes = makeFileSizes(c, args, pushFiles, pushGbdbs, additionalList)   
 	
-	onlyPublicMdb = list()
-	bothPublic = list()
-	onlyPublicD = list()
+        if (not errors) or args.ignore:
+            printReport(args, totalFiles, newGbdbSet, newTableSet, additionalList, pushTables, pushFiles, pushGbdbs, c, oldTableSet, oldReleaseFiles, oldGbdbSet, filesizes, newAtticSet, oldAtticSet, newRevokedSet, oldRevokedSet)
 	
-	for file in alphafiles:
+        else:
+            printErrors(errors)
 		
 
+    else:
+
+        args.releaseOld = 0
+        newReleaseFiles = c.releases[int(args.releaseNew)-1]
+        newMdb = c.alphaMetaDb
+        errors.extend(checkMetaDbForFiles(newMdb, newReleaseFiles, "alpha metaDb", loose))
+        (newTableSet, newRevokedSet, newAtticSet, newTableError) = checkTableStatus(newMdb, newReleaseFiles, args.database, args.composite, "alpha metaDb", loose)
+        errors.extend(newTableError)
+        (newGbdbSet, newGbdbError) = getGbdbFiles(args.database, newTableSet, newMdb)
+        errors.extend(newGbdbError)
+        #set for easy operations
+        totalFiles = set(newReleaseFiles)
+        #clean out special files we don't push i.e. md5sum.history
+        (pushFiles, totalFiles) = cleanSpecialFiles(totalFiles, totalFiles)
+        #makes list for additional files
+        (spam, pushFiles, additionalList) = separateOutAdditional(set(), totalFiles, pushFiles)
+        #makes files sizes
+        filesizes = makeFileSizes(c, args, pushFiles, newGbdbSet, additionalList)
+        if (not errors) or args.ignore:
+            printReportOne(args, totalFiles, newGbdbSet, newTableSet, additionalList, newTableSet, pushFiles, c, newGbdbSet, filesizes, newAtticSet, newRevokedSet) 
+        else:
+            printErrors(errors)
 	
 if __name__ == '__main__':
-	main()
\ No newline at end of file
+    main()
+