python/lib/ucscgenomics/qa.py dcd6e8bb1a69f2b49621fe94cd8b521be001dcc0

dcd6e8bb1a69f2b49621fe94cd8b521be001dcc0
wong
  Thu Feb 9 16:12:56 2012 -0800
made it 0 or more spaces before labels
diff --git python/lib/ucscgenomics/qa.py python/lib/ucscgenomics/qa.py
index 4ec96af..2f95063 100644
--- python/lib/ucscgenomics/qa.py
+++ python/lib/ucscgenomics/qa.py
@@ -1,336 +1,336 @@
 #!/hive/groups/encode/dcc/bin/python
 import sys, os, re, argparse, subprocess, math, threading, datetime, time, signal
 
 """
 	A collection of functions useful to the ENCODE QA Team.
 
 	Programs known to be dependent on it:
 	mkChangeNotes
 	qaInit
 
 	Functions:
 	getGbdbTables - takes in a database and a set of tables,
 		returns a set of tables that are gbdb files in reality
 	sorted_nicely - a neat way of alphanumerically sorting 
 		anything sortable, a nice snippet of functional programming
 		Google found.
 	countPerChrom - takes in Db & tables, returns a list of strings for 
 		outputting and a dictionary of counts
 		dict->{Table}->{Chrom} = countPerChrom
 		*automatically filters out Gbdbs*
 	checkTableDescriptions - takes in Db and tables, returns a list 
 		of strings and a set of tables with no description
 	checkTableIndex - takes in Db & tables, returns a list of strings 
 		and a set of tables with no index
 		*automatically filters out Gbdbs*
 	checkTableName - takes in a set of tables, returns string output
 		and a set of tables with underscores in the name
 	checkLabels - takes in a trackDb.ra filepath, returns a string 
 		output list and a set of tuples, ([label, #tooLongBy],[labe...)
 """
 
 
 def getGbdbTables(database, tableset):
 	""" Remove tables that aren't pointers to Gbdb files."""
 	sep = "','"
 	tablestr = sep.join(tableset)
 	tablestr = "'" + tablestr + "'"
 
 	cmd = "hgsql %s -e \"select table_name from information_schema.columns where table_name in (%s) and column_name = 'fileName'\"" % (database, tablestr)
 	p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
 	cmdoutput = p.stdout.read()
 	gbdbtableset = set(cmdoutput.split("\n")[1:-1])
 	return gbdbtableset
 
 def sorted_nicely(l): 
 	""" Sort the given iterable in the way that humans expect.""" 
 	convert = lambda text: int(text) if text.isdigit() else text 
 	alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] 
 	return sorted(l, key = alphanum_key)
 
 def countPerChrom(database, tables):
 	""" Count the amount of rows per chromosome."""
 	notgbdbtablelist = tables - getGbdbTables(database, tables)
 	tablecounts = dict()
 	output = []
 	globalseen = set()
 	localseen = dict()
 	
 	cmd = "hgsql %s -e \"select chrom from chromInfo\"" % database
 	p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
 	cmdoutput = p.stdout.read()
 
 	chrlist = set(cmdoutput.split("\n")[1:-1])
 	notPositionalTable = set()
 	if not chrlist:
 		output.append("Can't get chromInfo from %s for countPerChrom" % database)
 		return (output, tablecounts)
 	
 	
 	if not notgbdbtablelist:
 		output.append("No tables to count chroms")
 		output.append("")
 		return (output, tablecounts)
 	for i in notgbdbtablelist:
 		counts = dict()
 		cmd = "hgsql %s -e \"select chrom from %s\"" % (database, i)
 		p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
 		cmdoutput = p.stdout.read()
 
 		chrs = cmdoutput.split("\n")[1:-1]
 		localseen[i] = set()
 		
 		if not chrs:
 			notPositionalTable.add(i)
 			continue
 		for j in chrs:
 			globalseen.add(j)
 			if counts.has_key(j):
 				counts[j] = counts[j] + 1
 			else: 
 				localseen[i].add(j)
 				counts[j] = 1
 		tablecounts[i] = counts
 
 	for i in sorted(tablecounts):
 		output.append(i)
 		used = set()
 		for j in sorted_nicely(tablecounts[i]):
 			output.append("%s = %s" % (j, tablecounts[i][j]))
 
 		notused = chrlist - (localseen[i] | (chrlist - globalseen))
 		if notused:
 			output.append("Seen by others, but not used here:")
 			for j in sorted_nicely(notused):
 				output.append(j)
 		output.append("")
 	globalnotused = chrlist - globalseen
 	if globalnotused:
 		output.append("Not seen anywhere:")
 		for i in sorted_nicely(globalnotused):
 			output.append(i)
 	output.append("")
 	if notPositionalTable:
 		output.append("Not a positional table:")
 		for i in notPositionalTable:
 			output.append(i)
 	return (output, tablecounts)
 
 def checkTableDescriptions(database, tables):
 	""" Check if each table has a description or not."""
 	tablelist = list()
 	missing = set()
 	output = []
 	orstr = ""
 	for i in tables:
 		tablelist.append("tableName = '%s'" % i)
 		orsep = " OR "
 		orstr = orsep.join(tablelist)
 	cmd = "hgsql %s -e \"select tableName from tableDescriptions where %s\"" % (database, orstr)
 	p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
 	cmdoutput = p.stdout.read()
 
 	described = set(cmdoutput.split("\n")[1:-1])
 	missing = tables - described
 	if missing:
 		output.append("Tables missing a description:")
 		for i in missing:
 			output.append(i)
 		output.append("")
 	else:
 		output.append("No tables missing a description")
 		output.append("")
 	
 	return (output, missing)
 	
 def checkTableIndex(database, tables):
 	""" Check if each table has an index or not."""
 	notgbdbtablelist = tables - getGbdbTables(database, tables)
 	tablelist = list()
 	missing = set()
 	output = []
 
 	if not notgbdbtablelist:
 		output.append("No tables require an index")
 		output.append("")
 		return (output, missing)
 
 	for i in notgbdbtablelist:
 		cmd = "hgsql %s -e \"show indexes from %s\"" % (database, i)
 		p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
 		cmdoutput = p.stdout.read()
 		
 		index = cmdoutput.split("\n")[1:-1]
 		if index:
 			pass
 		else:
 			missing.add(i)
 	if missing:
 		output.append("Tables missing an index:")
 		for i in missing:
 			output.append(i)
 		output.append("")
 	else:
 		output.append("No missing indices")
 		output.append("")
 
 	return (output, missing)
 
 def checkTableName(tables):
 	""" Check if table name has an underscore or not."""
 	bad = set()
 	output = []
 	for i in tables:
 		if re.search('.*_.*', i):
 			bad.add(i)
 	if bad:
 		output.append("These tables have underscores in the name")
 		for i in bad:
 			output.append(i)
 		output.append("")
 	else:
 		output.append("No malformed table names")
 		output.append("")
 	return (output, bad)
 
 def checkLabels(trackDb):
 	""" Check if long and short labels are too long."""
 	f = open(trackDb, "r")
 	lines = f.readlines()
 	seenlabel = dict()
 	output = []
 	toolong = list()
-	p1 = re.compile('^\s+longLabel\s+(.*)$')
-	p2 = re.compile('^\s+shortLabel\s+(.*)$')
-	p3 = re.compile('^\s+#.*$')
+	p1 = re.compile('^\s*longLabel\s+(.*)$')
+	p2 = re.compile('^\s*shortLabel\s+(.*)$')
+	p3 = re.compile('^\s*#.*$')
 	for i in lines:
 		m1 = p1.match(i)
 		m2 = p2.match(i)
 		m3 = p3.match(i)
 		if m3:
 			continue
 		if m1:
 			if seenlabel.has_key(m1.group(1)):
 				seenlabel[m1.group(1)] = seenlabel[m1.group(1)] + 1
 			else:
 				seenlabel[m1.group(1)] = 1
 			if re.search('autogenerated', m1.group(1)):
 				toolong.append([m1.group(1), -1])
 				output.append("longLabel '%s' is still autogenerated, please tell the wrangler to fix this" % m1.group(1))
 			if len(m1.group(1)) > 80:
 				toolong.append([m1.group(1), len(m1.group(1))])
 				output.append("longLabel '%s' is too long: %s" % (m1.group(1), len(m1.group(1))))
 		if m2:
 			#short labels are allowed to repeat
 			#if seenlabel.has_key(m2.group(1)):
 				#seenlabel[m2.group(1)] = seenlabel[m2.group(1)] + 1
 			#else:
 				#seenlabel[m2.group(1)] = 1
 			if len(m2.group(1)) > 17:
 				toolong.append([m2.group(1), len(m2.group(1))])
 				output.append("shortLabel '%s' is too long: %s" % (m2.group(1), len(m2.group(1))))
 	for i in seenlabel:
 		if seenlabel[i] > 1:
 			output.append("%s label seen more than once: %s" % (i, seenlabel[i]))
 			
 	if output:
 		output.insert(0,"Label errors:")
 		output.append("")
 	else:
 		output.append("No labels are incorrect")
 		output.append("")
 
 	return (output, toolong)
 
 
 def checkTableCoords(database, tables):
 	"""Runs checkTableCoords externally against a set of tables, timeout is 10 seconds"""
 	notgbdbtablelist = tables - getGbdbTables(database, tables)
 	results = []
 	output = []
 	
 	if not notgbdbtablelist:
 		output.append("No tables have coordinates")
 		output.append("")
 		return (output, results)
 
 
 	timeout = 20
 	for i in sorted(notgbdbtablelist):
 		start = datetime.datetime.now()
 		cmd = "checkTableCoords %s %s" % (database, i)
 		p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
 		killed = 0
 		while p.poll() is None:
 			time.sleep(0.1)
 			now = datetime.datetime.now()
 			if (now - start).seconds > timeout:
 				p.kill()
 				killed = 1
 		if not killed:
 			cmdoutput = p.stdout.read()
 			cmderr = p.stderr.read()
 			
 			if cmdoutput:
 				results.append(cmdoutput)
 			if cmderr:
 				results.append(cmderr)
 		elif killed:
 			results.append("Process timeout after %d seconds, for table: %s" % (timeout, i))
 			results.append("You might want to manually run: '%s'" % cmd)
 			results.append("")
 		
 	if results:
 		output.append("These tables have coordinate errors:")
 		for i in results:
 			output.append(i)
 	else:
 		output.append("No coordinate errors")
 		output.append("")
 	return (output, results)
 
 def positionalTblCheck(database, tables):
 	notgbdbtablelist = tables - getGbdbTables(database, tables)
 
 
 	results = []
 	output = []
 	
 	if not notgbdbtablelist:
 		output.append("No tables are positional")
 		output.append("")
 		return (output, results)
 
 	for i in notgbdbtablelist:
 		cmd = "positionalTblCheck %s %s" % (database, i)
 		p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
 		cmdoutput = p.stdout.read()
 		cmderr = p.stderr.read()
 		if cmdoutput:
 			results.append(cmdoutput)
 		if cmderr:
 			results.append(cmderr)
 	if results:
 		p = re.compile('(.*)does not appear to be a positional table')
 		outResults = list()
 		nonPositional = list()
 		for i in results:
 			m = p.search(i)
 			if m:
 				nonPositional.append(m.group(1))
 			else:
 				outResults.append(i)
 
 		output.append("These tables have position errors:")
 		for i in outResults:
 			output.append(i)
 		if nonPositional:
 			output.append("These tables are non-positional:")
 			for i in nonPositional:
 				output.append(i)
 		output.append("")
 	else:
 		output.append("No position errors")
 		output.append("")
 	return (output, results)