993b4688947a0c743dce640264ccde6b601ff4f2
wong
  Mon Oct 24 15:55:51 2011 -0700
started qa algorithms for checking track stuff
diff --git python/lib/ucscgenomics/qa.py python/lib/ucscgenomics/qa.py
new file mode 100644
index 0000000..49f9ac2
--- /dev/null
+++ python/lib/ucscgenomics/qa.py
@@ -0,0 +1,123 @@
+##!/hive/groups/encode/dcc/bin/python
+import sys, os, re, argparse, subprocess, math
+from ucscgenomics import ra, track
+
+def getGbdbTables(database, tableset):
+	"""Return the tables from tableset that have a column named 'fileName'.
+
+	The lookup is a single hgsql query against information_schema.columns.
+	(Judging by the function name, such tables presumably reference files
+	under /gbdb -- confirm with the callers.)
+
+	database: database name handed to hgsql; also used as the schema filter.
+	tableset: iterable of table names to look up.
+
+	Returns a set of table names. An empty tableset yields an empty set
+	without running hgsql.
+	"""
+	names = list(tableset)
+	if not names:
+		# Nothing to look up, so skip the hgsql round trip.
+		return set()
+
+	quoted = "'" + "','".join(names) + "'"
+	# information_schema.columns lists columns of every database on the
+	# server, so restrict the match to the database being checked.
+	query = ("select table_name from information_schema.columns "
+		"where table_schema = '%s' and table_name in (%s) "
+		"and column_name = 'fileName'") % (database, quoted)
+
+	# An argv list (no shell) keeps quotes, '$' and backticks in the query
+	# away from shell expansion. stderr is deliberately not merged into
+	# stdout, so hgsql diagnostics are never parsed as table names.
+	p = subprocess.Popen(
+		["hgsql", database, "-e", query],
+		stdin=subprocess.PIPE,
+		stdout=subprocess.PIPE,
+		close_fds=True,
+		universal_newlines=True)
+	# communicate() closes stdin, drains stdout and reaps the child.
+	output = p.communicate()[0]
+
+	# Drop the first line (the column header) and the empty string that
+	# follows the final newline.
+	return set(output.split("\n")[1:-1])
+
+def sorted_nicely(l):
+    """Return the items of l as a list in natural ("human") order.
+
+    Every item is cut into alternating text and digit runs, and the digit
+    runs are compared as integers, so "chr2" sorts before "chr10".
+    """
+    def natural_key(item):
+        pieces = []
+        for chunk in re.split('([0-9]+)', item):
+            if chunk.isdigit():
+                pieces.append(int(chunk))
+            else:
+                pieces.append(chunk)
+        return pieces
+
+    return sorted(l, key=natural_key)
+
+def _hgsqlFirstColumn(database, query):
+	"""Run query through hgsql and return its output lines without the header."""
+	# An argv list (no shell) avoids shell expansion of the query text.
+	# stderr is not merged into stdout, so diagnostics are never counted
+	# as data rows.
+	p = subprocess.Popen(
+		["hgsql", database, "-e", query],
+		stdin=subprocess.PIPE,
+		stdout=subprocess.PIPE,
+		close_fds=True,
+		universal_newlines=True)
+	# communicate() closes stdin, drains stdout and reaps the child.
+	output = p.communicate()[0]
+	# Drop the header line and the empty string after the final newline.
+	return output.split("\n")[1:-1]
+
+def countPerChrom(database, tables):
+	"""Count rows per chromosome for every table without a 'fileName' column.
+
+	Tables reported by getGbdbTables() are skipped. For each remaining table
+	the 'chrom' column is read and tallied. The report printed to stdout
+	lists, per table, the count for each chromosome plus the chromosomes
+	from chromInfo that other tables use but this one does not. It ends
+	with the chromInfo chromosomes no table uses.
+
+	database: database name handed to hgsql.
+	tables: iterable of table names.
+
+	Returns a dict mapping table name -> dict mapping chromosome -> row count.
+	"""
+	# Accept any iterable; the set difference below needs a real set.
+	tables = set(tables)
+	notgbdbtablelist = tables - getGbdbTables(database, tables)
+	chrlist = set(_hgsqlFirstColumn(database, "select chrom from chromInfo"))
+
+	tablecounts = dict()
+	globalseen = set()
+	for table in notgbdbtablelist:
+		counts = dict()
+		for chrom in _hgsqlFirstColumn(database, "select chrom from %s" % table):
+			counts[chrom] = counts.get(chrom, 0) + 1
+		# The keys of counts are exactly the chromosomes this table uses.
+		globalseen.update(counts)
+		tablecounts[table] = counts
+
+	for table in sorted(tablecounts):
+		counts = tablecounts[table]
+		print("")
+		print(table)
+		for chrom in sorted_nicely(counts):
+			print("%s = %s" % (chrom, counts[chrom]))
+
+		# Known chromosomes that some table uses, minus the ones used here.
+		notused = (chrlist & globalseen) - set(counts)
+		if notused:
+			print("Seen by others, but not used here:")
+			for chrom in sorted_nicely(notused):
+				print(chrom)
+	print("")
+	print("Not seen anywhere:")
+	for chrom in sorted_nicely(chrlist - globalseen):
+		print(chrom)
+	return tablecounts
+
+def checkTableDescriptions(database, tables):
+	"""Return the tables that have no row in the tableDescriptions table.
+
+	database: database name handed to hgsql.
+	tables: iterable of table names to check.
+
+	Returns a set holding every name from tables that the query on
+	tableDescriptions.tableName did not return. An empty input yields an
+	empty set without running hgsql.
+	"""
+	tables = set(tables)
+	if not tables:
+		# No names means there is no where-clause to build.
+		return set()
+
+	quoted = "'" + "','".join(sorted(tables)) + "'"
+	query = "select tableName from tableDescriptions where tableName in (%s)" % quoted
+
+	# An argv list (no shell) avoids shell expansion of the query text.
+	# stderr is not merged into stdout, so hgsql diagnostics are never
+	# mistaken for described tables.
+	p = subprocess.Popen(
+		["hgsql", database, "-e", query],
+		stdin=subprocess.PIPE,
+		stdout=subprocess.PIPE,
+		close_fds=True,
+		universal_newlines=True)
+	# communicate() closes stdin, drains stdout and reaps the child.
+	output = p.communicate()[0]
+
+	# Drop the header line and the empty string after the final newline.
+	described = set(output.split("\n")[1:-1])
+	return tables - described
+		
+def main():
+	"""Command-line entry point: run the QA checks on a list of tables.
+
+	Expects two positional arguments, the database name and the path of a
+	file with one table name per line. Prints the usage text and returns
+	when called without any argument.
+	"""
+	parser = argparse.ArgumentParser(
+		prog='qaChecks',
+		formatter_class=argparse.RawDescriptionHelpFormatter,
+		description='A series of checks for QA',
+		epilog=
+	"""Examples:
+
+qaChecks hg19 tableList
+qaChecks hg19 tableList /path/to/trackDb.ra
+qaChecks hg19 tableList ~/kent/src/hg/makeDb/trackDb/human/hg19/wgEncodeSydhTfbs.new.ra
+
+    """
+		)
+	parser.add_argument('database', help='The database, typically hg19 or mm9')
+	parser.add_argument('tableList', help='The file containing a list of tables')
+	# TODO: the trackDb examples in the epilog rely on this argument, which
+	# is not accepted yet.
+	#parser.add_argument('trackDb', help='The trackDb file to check')
+
+	if len(sys.argv) == 1:
+		parser.print_help()
+		return
+
+	args = parser.parse_args(sys.argv[1:])
+
+	# The with-block closes the file even if reading fails.
+	with open(args.tableList, "r") as f:
+		# Skip blank lines so an empty string is never used as a table name.
+		tables = set(line.strip() for line in f if line.strip())
+
+	countPerChrom(args.database, tables)
+	# NOTE: the result of this check is not reported anywhere yet.
+	checkTableDescriptions(args.database, tables)
+
+
+# Run the checks only when this file is executed as a script.
+if __name__ == "__main__":
+	main()
\ No newline at end of file