87909f5188ff027a43543876b759a9f1d3e3def8
wong
  Thu Oct 27 14:44:53 2011 -0700
added skipping of comment lines for label check
diff --git python/lib/ucscgenomics/qa.py python/lib/ucscgenomics/qa.py
index b265c2d..4cc9dc3 100644
--- python/lib/ucscgenomics/qa.py
+++ python/lib/ucscgenomics/qa.py
@@ -1,286 +1,290 @@
 #!/hive/groups/encode/dcc/bin/python
 import sys, os, re, argparse, subprocess, math, threading, datetime, time, signal
 
 """
 	A collection of functions useful to the ENCODE QA Team.
 
 	Programs known to be dependent on it:
 	mkChangeNotes
 	qaInit
 
 	Functions:
 	getGbdbTables - takes in a database and a set of tables,
 		returns a set of tables that are gbdb files in reality
 	sorted_nicely - a neat way of alphanumerically sorting 
 		anything sortable, a nice snippet of functional programming
 		Google found.
 	countPerChrom - takes in Db & tables, returns a list of strings for 
 		outputting and a dictionary of counts
 		dict->{Table}->{Chrom} = countPerChrom
 		*automatically filters out Gbdbs*
 	checkTableDescriptions - takes in Db and tables, returns a list 
 		of strings and a set of tables with no description
 	checkTableIndex - takes in Db & tables, returns a list of strings 
 		and a set of tables with no index
 		*automatically filters out Gbdbs*
 	checkTableName - takes in a set of tables, returns string output
 		and a set of tables with underscores in the name
 	checkLabels - takes in a trackDb.ra filepath, returns a string 
 		output list and a set of tuples, ([label, #tooLongBy],[labe...)
 """
 
 
 def getGbdbTables(database, tableset):
 	""" Remove tables that aren't pointers to Gbdb files."""
 	sep = "','"
 	tablestr = sep.join(tableset)
 	tablestr = "'" + tablestr + "'"
 
 	cmd = "hgsql %s -e \"select table_name from information_schema.columns where table_name in (%s) and column_name = 'fileName'\"" % (database, tablestr)
 	p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
 	cmdoutput = p.stdout.read()
 	gbdbtableset = set(cmdoutput.split("\n")[1:-1])
 	return gbdbtableset
 
 def sorted_nicely(l): 
     """ Sort the given iterable in the way that humans expect.""" 
     convert = lambda text: int(text) if text.isdigit() else text 
     alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] 
     return sorted(l, key = alphanum_key)
 
 def countPerChrom(database, tables):
 	""" Count the amount of rows per chromosome."""
 	notgbdbtablelist = tables - getGbdbTables(database, tables)
 	tablecounts = dict()
 	output = []
 	globalseen = set()
 	localseen = dict()
 	
 	cmd = "hgsql %s -e \"select chrom from chromInfo\"" % database
 	p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
 	cmdoutput = p.stdout.read()
 
 	chrlist = set(cmdoutput.split("\n")[1:-1])
 	if not chrlist:
 		output.append("Can't get chromInfo from %s for countPerChrom" % database)
 		return (output, tablecounts)
 	
 	
 	if not tables:
 		output.append("No Tables to count")
 		output.append("")
 		return (output, tablecounts)
 	for i in notgbdbtablelist:
 		counts = dict()
 		cmd = "hgsql %s -e \"select chrom from %s\"" % (database, i)
 		p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
 		cmdoutput = p.stdout.read()
 
 		chrs = cmdoutput.split("\n")[1:-1]
 		localseen[i] = set()
 		
 		for j in chrs:
 			globalseen.add(j)
 			if counts.has_key(j):
 				counts[j] = counts[j] + 1
 			else: 
 				localseen[i].add(j)
 				counts[j] = 1
 		tablecounts[i] = counts
 
 	for i in sorted(tablecounts):
 		output.append(i)
 		used = set()
 		for j in sorted_nicely(tablecounts[i]):
 			output.append("%s = %s" % (j, tablecounts[i][j]))
 
 		notused = chrlist - (localseen[i] | (chrlist - globalseen))
 		if notused:
 			output.append("Seen by others, but not used here:")
 			for j in sorted_nicely(notused):
 				output.append(j)
 		output.append("")
 	globalnotused = chrlist - globalseen
 	if globalnotused:
 		output.append("Not seen anywhere:")
 		for i in sorted_nicely(globalnotused):
 			output.append(i)
 	output.append("")
 	return (output, tablecounts)
 
 def checkTableDescriptions(database, tables):
 	""" Check if each table has a description or not."""
 	tablelist = list()
 	missing = set()
 	output = []
 	orstr = ""
 	for i in tables:
 		tablelist.append("tableName = '%s'" % i)
 		orsep = " OR "
 		orstr = orsep.join(tablelist)
 	cmd = "hgsql %s -e \"select tableName from tableDescriptions where %s\"" % (database, orstr)
 	p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
 	cmdoutput = p.stdout.read()
 
 	described = set(cmdoutput.split("\n")[1:-1])
 	missing = tables - described
 	if missing:
 		output.append("Tables missing a description:")
 		for i in missing:
 			output.append(i)
 		output.append("")
 	else:
 		output.append("No tables missing a description")
 		output.append("")
 	
 	return (output, missing)
 	
 def checkTableIndex(database, tables):
 	""" Check if each table has an index or not."""
 	notgbdbtablelist = tables - getGbdbTables(database, tables)
 	tablelist = list()
 	missing = set()
 	output = []
 
 	for i in notgbdbtablelist:
 		cmd = "hgsql %s -e \"show indexes from %s\"" % (database, i)
 		p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
 		cmdoutput = p.stdout.read()
 		
 		index = cmdoutput.split("\n")[1:-1]
 		if index:
 			pass
 		else:
 			missing.add(i)
 	if missing:
 		output.append("Tables missing an index:")
 		for i in missing:
 			output.append(i)
 		output.append("")
 	else:
 		output.append("No missing indices")
 		output.append("")
 
 	return (output, missing)
 
 def checkTableName(tables):
 	""" Check if table name has an underscore or not."""
 	bad = set()
 	output = []
 	for i in tables:
 		if re.search('.*_.*', i):
 			bad.add(i)
 	if bad:
 		output.append("These tables have underscores in the name")
 		for i in bad:
 			output.append(i)
 		output.append("")
 	else:
 		output.append("No malformed table names")
 		output.append("")
 	return (output, bad)
 
 def checkLabels(trackDb):
 	""" Check if long and short labels are too long."""
 	f = open(trackDb, "r")
 	lines = f.readlines()
 	seenlabel = dict()
 	output = []
 	toolong = list()
 	p1 = re.compile('^\s+longLabel\s+(.*)$')
 	p2 = re.compile('^\s+shortLabel\s+(.*)$')
+	p3 = re.compile('^\s+#.*$')
 	for i in lines:
 		m1 = p1.match(i)
 		m2 = p2.match(i)
+		m3 = p3.match(i)
+		if m3:
+			continue
 		if m1:
 			if seenlabel.has_key(m1.group(1)):
 				seenlabel[m1.group(1)] = seenlabel[m1.group(1)] + 1
 			else:
 				seenlabel[m1.group(1)] = 1
 			if len(m1.group(1)) > 80:
 				toolong.append([m1.group(1), len(m1.group(1))])
 				output.append("longLabel '%s' is too long: %s" % (m1.group(1), len(m1.group(1))))
 		if m2:
 			#short labels are allowed to repeat
 			#if seenlabel.has_key(m2.group(1)):
 				#seenlabel[m2.group(1)] = seenlabel[m2.group(1)] + 1
 			#else:
 				#seenlabel[m2.group(1)] = 1
 			if len(m2.group(1)) > 17:
 				toolong.append([m2.group(1), len(m2.group(1))])
 				output.append("shortLabel '%s' is too long: %s" % (m2.group(1), len(m2.group(1))))
 	for i in seenlabel:
 		if seenlabel[i] > 1:
 			output.append("%s label seen more than once: %s" % (i, seenlabel[i]))
 			
 	if output:
 		output.insert(0,"Label errors:")
 		output.append("")
 	else:
 		output.append("No labels are incorrect")
 		output.append("")
 
 	return (output, toolong)
 
 
 def checkTableCoords(database, tables):
 	"""Runs checkTableCoords externally against a set of tables, timeout is 10 seconds"""
 	notgbdbtablelist = tables - getGbdbTables(database, tables)
 	results = []
 	output = []
 	timeout = 10
 	for i in sorted(notgbdbtablelist):
 		start = datetime.datetime.now()
 		cmd = "checkTableCoords %s %s" % (database, i)
 		p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
 		killed = 0
 		while p.poll() is None:
 			time.sleep(0.1)
 			now = datetime.datetime.now()
 			if (now - start).seconds > timeout:
 				p.kill()
 				killed = 1
 		if not killed:
 			cmdoutput = p.stdout.read()
 			cmderr = p.stderr.read()
 			
 			if cmdoutput:
 				results.append(cmdoutput)
 			if cmderr:
 				results.append(cmderr)
 		elif killed:
 			results.append("Process timeout for table: %s" % i)
 			results.append("You might want to manually run: '%s'" % cmd)
 			results.append("")
 		
 	if results:
 		output.append("These tables have coordinate errors:")
 		for i in results:
 			output.append(i)
 	else:
 		output.append("No coordinate errors")
 		output.append("")
 	return (output, results)
 
 def positionalTblCheck(database, tables):
 	notgbdbtablelist = tables - getGbdbTables(database, tables)
 	results = []
 	output = []
 	for i in notgbdbtablelist:
 		cmd = "positionalTblCheck %s %s" % (database, i)
 		p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
 		cmdoutput = p.stdout.read()
 		cmderr = p.stderr.read()
 		if cmdoutput:
 			results.append(cmdoutput)
 		if cmderr:
 			results.append(cmderr)
 	if results:
 		output.append("These tables have position errors:")
 		for i in results:
 			output.append(i)
 	else:
 		output.append("No position errors")
 		output.append("")
 	return (output, results)