034683bf9ab4de313e47bb188ac89e0c71ac5ae8
wong
  Tue Oct 25 14:05:54 2011 -0700
Added some documentation, added two new functions, and corrected some other things I noticed
diff --git python/lib/ucscgenomics/qa.py python/lib/ucscgenomics/qa.py
old mode 100644
new mode 100755
index 0f4bb95..fdd32a5
--- python/lib/ucscgenomics/qa.py
+++ python/lib/ucscgenomics/qa.py
@@ -1,39 +1,73 @@
-##!/hive/groups/encode/dcc/bin/python
-import sys, os, re, argparse, subprocess, math
+#!/hive/groups/encode/dcc/bin/python
+import sys, os, re, argparse, subprocess, math, threading, datetime, time, signal
 from ucscgenomics import ra, track
 
+"""
+	A collection of functions useful to the ENCODE QA Team.
+
+	Programs known to be dependent on it:
+	mkChangeNotes
+	qaInit
+
+	Functions:
+	getGbdbTables - takes in a database and a set of tables,
+		returns the subset of tables that are actually pointers to gbdb files
+	sorted_nicely - sorts any iterable of strings in natural
+		(alphanumeric) order, so embedded numbers compare numerically;
+		a small functional-programming snippet found via Google.
+	countPerChrom - takes in Db & tables, returns a list of strings for 
+		outputting and a dictionary of counts
+		dict->{Table}->{Chrom} = countPerChrom
+		*automatically filters out Gbdbs*
+	checkTableDescriptions - takes in Db and tables, returns a list 
+		of strings and a set of tables with no description
+	checkTableIndex - takes in Db & tables, returns a list of strings 
+		and a set of tables with no index
+		*automatically filters out Gbdbs*
+	checkTableName - takes in a set of tables, returns string output
+		and a set of tables with underscores in the name
+	checkLabels - takes in a trackDb.ra filepath, returns a list of
+		output strings and a list of [label, length] pairs for over-long labels
+
+	main - runs all the checks, prints output
+
+"""
+
+
 def getGbdbTables(database, tableset):
+		""" Return the subset of tables that are pointers to Gbdb files."""
 		sep = "','"
 		tablestr = sep.join(tableset)
 		tablestr = "'" + tablestr + "'"
 
 		cmd = "hgsql %s -e \"select table_name from information_schema.columns where table_name in (%s) and column_name = 'fileName'\"" % (database, tablestr)
 		p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
 		cmdoutput = p.stdout.read()
 
 		gbdbtableset = set(cmdoutput.split("\n")[1:-1])
         
 		return gbdbtableset
 
 def sorted_nicely(l): 
     """ Sort the given iterable in the way that humans expect.""" 
     convert = lambda text: int(text) if text.isdigit() else text 
     alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] 
     return sorted(l, key = alphanum_key)
 
 def countPerChrom(database, tables):
+	""" Count the number of rows per chromosome."""
 	notgbdbtablelist = tables - getGbdbTables(database, tables)
 	tablecounts = dict()
 	cmd = "hgsql %s -e \"select chrom from chromInfo\"" % database
 	p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
 	cmdoutput = p.stdout.read()
 
 	chrlist = set(cmdoutput.split("\n")[1:-1])
 	globalseen = set()
 	localseen = dict()
 	output = []
 	if not tables:
 		output.append("No Tables to count")
 		output.append("")
 		return (output, tablecounts)
 	for i in notgbdbtablelist:
@@ -63,99 +97,102 @@
 		notused = chrlist - (localseen[i] | (chrlist - globalseen))
 		if notused:
 			output.append("Seen by others, but not used here:")
 			for j in sorted_nicely(notused):
 				output.append(j)
 		output.append("")
 	globalnotused = chrlist - globalseen
 	if globalnotused:
 		output.append("Not seen anywhere:")
 		for i in sorted_nicely(globalnotused):
 			output.append(i)
 	output.append("")
 	return (output, tablecounts)
 
 def checkTableDescriptions(database, tables):
+	""" Check if each table has a description or not."""
 	tablelist = list()
 	missing = set()
 	output = []
 	for i in tables:
 		tablelist.append("tableName = '%s'" % i)
 		orsep = " OR "
 		orstr = orsep.join(tablelist)
 	cmd = "hgsql %s -e \"select tableName from tableDescriptions where %s\"" % (database, orstr)
 	p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
 	cmdoutput = p.stdout.read()
 
 	described = set(cmdoutput.split("\n")[1:-1])
 	missing = tables - described
 	if missing:
 		output.append("Tables missing a description:")
 		for i in missing:
 			output.append(i)
 		output.append("")
 	else:
 		output.append("No tables missing a description")
 		output.append("")
 	
 	return (output, missing)
 	
 def checkTableIndex(database, tables):
+	""" Check if each table has an index or not."""
 	notgbdbtablelist = tables - getGbdbTables(database, tables)
 	tablelist = list()
 	missing = set()
 	output = []
 
 	for i in notgbdbtablelist:
 		cmd = "hgsql %s -e \"show indexes from %s\"" % (database, i)
 		p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
 		cmdoutput = p.stdout.read()
 		
 		index = cmdoutput.split("\n")[1:-1]
 		if index:
 			pass
 		else:
 			missing.add(i)
 	if missing:
 		output.append("Tables missing an index:")
 		for i in missing:
 			output.append(i)
 		output.append("")
 	else:
 		output.append("No missing indices")
 		output.append("")
 
 	return (output, missing)
 	
-	
-def checkTableName(database, tables):
+def checkTableName(tables):
+	""" Check if table name has an underscore or not."""
 	bad = set()
 	output = []
 	for i in tables:
 		if re.search('.*_.*', i):
 			bad.add(i)
 	if bad:
 		output.append("These tables have underscores in the name")
 		for i in bad:
 			output.append(i)
 		output.append("")
 	else:
 		output.append("No malformed table names")
 		output.append("")
 	return (output, bad)
 
 def checkLabels(trackDb):
+	""" Check if long and short labels are too long."""
 	f = open(trackDb, "r")
 	lines = f.readlines()
 	seenlabel = dict()
 	output = []
 	toolong = list()
 	p1 = re.compile('^\s+longLabel\s+(.*)$')
 	p2 = re.compile('^\s+shortLabel\s+(.*)$')
 	for i in lines:
 		m1 = p1.match(i)
 		m2 = p2.match(i)
 		if m1:
 			if seenlabel.has_key(m1.group(1)):
 				seenlabel[m1.group(1)] = seenlabel[m1.group(1)] + 1
 			else:
 				seenlabel[m1.group(1)] = 1
@@ -172,32 +209,91 @@
 				toolong.append([m2.group(1), len(m2.group(1))])
 				output.append("shortLabel '%s' is too long: %s" % (m2.group(1), len(m2.group(1))))
 	for i in seenlabel:
 		if seenlabel[i] > 1:
 			output.append("%s label seen more than once: %s" % (i, seenlabel[i]))
 			
 	if output:
 		output.insert(0,"Label errors:")
 		output.append("")
 	else:
 		output.append("No labels are incorrect")
 		output.append("")
 
 	return (output, toolong)
 
+
+def checkTableCoords(database, tables):
+	"""Run the external checkTableCoords program on each non-Gbdb table, with a 10 second timeout per table."""
+	notgbdbtablelist = tables - getGbdbTables(database, tables)
+	results = []
+	output = []
+	timeout = 10
+	for i in sorted(notgbdbtablelist):
+		start = datetime.datetime.now()
+		cmd = "checkTableCoords %s %s" % (database, i)
+		p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
+		killed = 0
+		while p.poll() is None:
+			time.sleep(0.1)
+			now = datetime.datetime.now()
+			if (now - start).seconds > timeout:
+				p.kill()
+				
+				killed = 1
+		if not killed:
+			cmdoutput = p.stdout.read()
+			if cmdoutput:
+				results.append(cmdoutput)
+		elif killed:
+			results.append("Process timeout for table: %s" % i)
+			results.append("You might want to manually run: '%s'" % cmd)
+			results.append("")
+		
+	if results:
+		output.append("These tables have coordinate errors:")
+		for i in results:
+			output.append(i)
+	else:
+		output.append("No coordinate errors")
+		output.append("")
+	return (output, results)
+
+def positionalTblCheck(database, tables):
+	notgbdbtablelist = tables - getGbdbTables(database, tables)
+	results = []
+	output = []
+	for i in notgbdbtablelist:
+		cmd = "positionalTblCheck %s %s" % (database, i)
+		p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
+		cmdoutput = p.stdout.read()
+		
+		if cmdoutput:
+			results.append(cmdoutput)
+	if results:
+		output.append("These tables have position errors:")
+		for i in results:
+			output.append(i)
+	else:
+		output.append("No position errors")
+		output.append("")
+	return (output, results)
+
 def main():
 
+	""" Run all the checks, print output"""
+
 	parser = argparse.ArgumentParser(
         prog='qaChecks',
         formatter_class=argparse.RawDescriptionHelpFormatter,
         description='A series of checks for QA',
         epilog=
     """Examples:
 
 qaChecks hg19 tableList
 qaChecks hg19 tableList /path/to/trackDb.ra
 qaChecks hg19 tableList ~/kent/src/hg/makeDb/trackDb/human/hg19/wgEncodeSydhTfbs.new.ra
 
     """
         )
 	parser.add_argument('database', help='The database, typically hg19 or mm9')
 	parser.add_argument('tableList', help='The file containing a list of tables')
@@ -211,30 +307,36 @@
 
 	f = open(args.tableList, "r")
 	lines = f.readlines()
 	tables = set()
 	for i in lines:
 		tables.add(i.rstrip())
 
 	output = []
 
 	(tableDescOutput, noDescription) = checkTableDescriptions(args.database, tables)
 	output.extend(tableDescOutput)
 
 	(tableIndexOut, missingIndex) = checkTableIndex(args.database, tables)
 	output.extend(tableIndexOut)
 	
-	(tableNameOut, badTableNames) = checkTableName(args.database, tables)
+	(tableNameOut, badTableNames) = checkTableName(tables)
 	output.extend(tableNameOut)
 	
 	(labelOut, badLabels) = checkLabels(args.trackDb)
 	output.extend(labelOut)
 
-	(countChromOut, tableCounts) = countPerChrom(args.database, tables)
-	output.extend(countChromOut)
+	(coordsOut, badCoords) = checkTableCoords(args.database, tables)
+	output.extend(coordsOut)
+
+	(posOut, badPos) = positionalTblCheck(args.database, tables)
+	output.extend(posOut)
+
+	#(countChromOut, tableCounts) = countPerChrom(args.database, tables)
+	#output.extend(countChromOut)
 
 	for i in output:
 		print i
 	
 
 if __name__ == "__main__":
-	main()
\ No newline at end of file
+	main()