python/lib/ucscgenomics/qa.py 85f1b7cafc40059c3036283d6472323f871fe930

85f1b7cafc40059c3036283d6472323f871fe930
wong
  Mon Oct 24 17:28:48 2011 -0700
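Added new QA check functions (checkTableIndex, checkTableName, checkLabels); checks now return their report lines to main() for printing instead of printing directly.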
diff --git python/lib/ucscgenomics/qa.py python/lib/ucscgenomics/qa.py
index 49f9ac2..0f4bb95 100644
--- python/lib/ucscgenomics/qa.py
+++ python/lib/ucscgenomics/qa.py
@@ -1,123 +1,240 @@
 ##!/hive/groups/encode/dcc/bin/python
 import sys, os, re, argparse, subprocess, math
 from ucscgenomics import ra, track
 
 def getGbdbTables(database, tableset):
 		sep = "','"
 		tablestr = sep.join(tableset)
 		tablestr = "'" + tablestr + "'"
 
 		cmd = "hgsql %s -e \"select table_name from information_schema.columns where table_name in (%s) and column_name = 'fileName'\"" % (database, tablestr)
 		p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
-		output = p.stdout.read()
+		cmdoutput = p.stdout.read()
 
-		gbdbtableset = set(output.split("\n")[1:-1])
+		gbdbtableset = set(cmdoutput.split("\n")[1:-1])
         
 		return gbdbtableset
 
 def sorted_nicely(l): 
     """ Sort the given iterable in the way that humans expect.""" 
     convert = lambda text: int(text) if text.isdigit() else text 
     alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] 
     return sorted(l, key = alphanum_key)
 
 def countPerChrom(database, tables):
 	notgbdbtablelist = tables - getGbdbTables(database, tables)
 	tablecounts = dict()
 	cmd = "hgsql %s -e \"select chrom from chromInfo\"" % database
 	p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
-	output = p.stdout.read()
+	cmdoutput = p.stdout.read()
 
-	chrlist = set(output.split("\n")[1:-1])
+	chrlist = set(cmdoutput.split("\n")[1:-1])
 	globalseen = set()
 	localseen = dict()
+	output = []
+	if not tables:
+		output.append("No Tables to count")
+		output.append("")
+		return (output, tablecounts)
 	for i in notgbdbtablelist:
 		counts = dict()
 		cmd = "hgsql %s -e \"select chrom from %s\"" % (database, i)
 		p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
-		output = p.stdout.read()
+		cmdoutput = p.stdout.read()
 
-		chrs = output.split("\n")[1:-1]
+		chrs = cmdoutput.split("\n")[1:-1]
 		localseen[i] = set()
 		
 		for j in chrs:
 			globalseen.add(j)
 			if counts.has_key(j):
 				counts[j] = counts[j] + 1
 			else: 
 				localseen[i].add(j)
 				counts[j] = 1
 		tablecounts[i] = counts
 
 	for i in sorted(tablecounts):
-		print ""
-		print i
+		output.append(i)
 		used = set()
 		for j in sorted_nicely(tablecounts[i]):
-			print "%s = %s" % (j, tablecounts[i][j])
+			output.append("%s = %s" % (j, tablecounts[i][j]))
 		
 		notused = chrlist - (localseen[i] | (chrlist - globalseen))
 		if notused:
-			print "Seen by others, but not used here:"
+			output.append("Seen by others, but not used here:")
 			for j in sorted_nicely(notused):
-				print j
-	print ""
-	print "Not seen anywhere:"
+				output.append(j)
+		output.append("")
 	globalnotused = chrlist - globalseen
+	if globalnotused:
+		output.append("Not seen anywhere:")
 	for i in sorted_nicely(globalnotused):
-		print i
-	return tablecounts
+			output.append(i)
+	output.append("")
+	return (output, tablecounts)
 
 def checkTableDescriptions(database, tables):
 	tablelist = list()
-	remain = set()
+	missing = set()
+	output = []
 	for i in tables:
 		tablelist.append("tableName = '%s'" % i)
 		orsep = " OR "
 		orstr = orsep.join(tablelist)
 	cmd = "hgsql %s -e \"select tableName from tableDescriptions where %s\"" % (database, orstr)
 	p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
-	output = p.stdout.read()
+	cmdoutput = p.stdout.read()
+
+	described = set(cmdoutput.split("\n")[1:-1])
+	missing = tables - described
+	if missing:
+		output.append("Tables missing a description:")
+		for i in missing:
+			output.append(i)
+		output.append("")
+	else:
+		output.append("No tables missing a description")
+		output.append("")
+	
+	return (output, missing)
+	
+def checkTableIndex(database, tables):
+	notgbdbtablelist = tables - getGbdbTables(database, tables)
+	tablelist = list()
+	missing = set()
+	output = []
+
+	for i in notgbdbtablelist:
+		cmd = "hgsql %s -e \"show indexes from %s\"" % (database, i)
+		p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
+		cmdoutput = p.stdout.read()
+		
+		index = cmdoutput.split("\n")[1:-1]
+		if index:
+			pass
+		else:
+			missing.add(i)
+	if missing:
+		output.append("Tables missing an index:")
+		for i in missing:
+			output.append(i)
+		output.append("")
+	else:
+		output.append("No missing indices")
+		output.append("")
+
+	return (output, missing)
 
-	described = set(output.split("\n")[1:-1])
-	remain = tables - described
-	return
+	
+def checkTableName(database, tables):
+	bad = set()
+	output = []
+	for i in tables:
+		if re.search('.*_.*', i):
+			bad.add(i)
+	if bad:
+		output.append("These tables have underscores in the name")
+		for i in bad:
+			output.append(i)
+		output.append("")
+	else:
+		output.append("No malformed table names")
+		output.append("")
+	return (output, bad)
+
+def checkLabels(trackDb):
+	f = open(trackDb, "r")
+	lines = f.readlines()
+	seenlabel = dict()
+	output = []
+	toolong = list()
+	p1 = re.compile('^\s+longLabel\s+(.*)$')
+	p2 = re.compile('^\s+shortLabel\s+(.*)$')
+	for i in lines:
+		m1 = p1.match(i)
+		m2 = p2.match(i)
+		if m1:
+			if seenlabel.has_key(m1.group(1)):
+				seenlabel[m1.group(1)] = seenlabel[m1.group(1)] + 1
+			else:
+				seenlabel[m1.group(1)] = 1
+			if len(m1.group(1)) > 80:
+				toolong.append([m1.group(1), len(m1.group(1))])
+				output.append("longLabel '%s' is too long: %s" % (m1.group(1), len(m1.group(1))))
+		if m2:
+			#short labels are allowed to repeat
+# 			if seenlabel.has_key(m2.group(1)):
+# 				seenlabel[m2.group(1)] = seenlabel[m2.group(1)] + 1
+# 			else:
+# 				seenlabel[m2.group(1)] = 1
+			if len(m2.group(1)) > 17:
+				toolong.append([m2.group(1), len(m2.group(1))])
+				output.append("shortLabel '%s' is too long: %s" % (m2.group(1), len(m2.group(1))))
+	for i in seenlabel:
+		if seenlabel[i] > 1:
+			output.append("%s label seen more than once: %s" % (i, seenlabel[i]))
+			
+	if output:
+		output.insert(0,"Label errors:")
+		output.append("")
+	else:
+		output.append("No labels are incorrect")
+		output.append("")
+
+	return (output, toolong)
 		
 def main():
 
 	parser = argparse.ArgumentParser(
         prog='qaChecks',
         formatter_class=argparse.RawDescriptionHelpFormatter,
         description='A series of checks for QA',
         epilog=
     """Examples:
 
 qaChecks hg19 tableList
 qaChecks hg19 tableList /path/to/trackDb.ra
 qaChecks hg19 tableList ~/kent/src/hg/makeDb/trackDb/human/hg19/wgEncodeSydhTfbs.new.ra
 
     """
         )
 	parser.add_argument('database', help='The database, typically hg19 or mm9')
 	parser.add_argument('tableList', help='The file containing a list of tables')
-	#parser.add_argument('trackDb', help='The trackDb file to check')
+	parser.add_argument('trackDb', help='The trackDb file to check')
 
 	if len(sys.argv) == 1:
 		parser.print_help()
 		return
 
 	args = parser.parse_args(sys.argv[1:])
 
 	f = open(args.tableList, "r")
 	lines = f.readlines()
 	tables = set()
 	for i in lines:
 		tables.add(i.rstrip())
 
+	output = []
+
+	(tableDescOutput, noDescription) = checkTableDescriptions(args.database, tables)
+	output.extend(tableDescOutput)
+
+	(tableIndexOut, missingIndex) = checkTableIndex(args.database, tables)
+	output.extend(tableIndexOut)
+	
+	(tableNameOut, badTableNames) = checkTableName(args.database, tables)
+	output.extend(tableNameOut)
 	
-	tableCounts = countPerChrom(args.database, tables)
-	noDescriptions = checkTableDescriptions(args.database, tables)
+	(labelOut, badLabels) = checkLabels(args.trackDb)
+	output.extend(labelOut)
+
+	(countChromOut, tableCounts) = countPerChrom(args.database, tables)
+	output.extend(countChromOut)
+
+	for i in output:
+		print i
 
 
 if __name__ == "__main__":
 	main()
\ No newline at end of file