3b0ae0a28655b958c4bc033435c9d10cee82ca65 rhead Fri Apr 6 19:28:54 2012 -0700 Changed hgsql calls to use callHgsql function, except for the one in countPerChrom, which needs more work before changing. Moved callHgsql to top of file. diff --git python/lib/ucscgenomics/qa.py python/lib/ucscgenomics/qa.py index 0df2044..f0be336 100644 --- python/lib/ucscgenomics/qa.py +++ python/lib/ucscgenomics/qa.py @@ -1,80 +1,79 @@ #!/usr/bin/env python2.7 import re import argparse import subprocess import datetime import time import pipes """ A collection of functions useful to the ENCODE and GB QA Teams. """ +def callHgsql(database, command): + """ Run hgsql command using subprocess, return stdout data if no error.""" + cmd = ["hgsql", database, "-Ne", command] + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + cmdout, cmderr = p.communicate() + if p.returncode != 0: + # keep command arguments nicely quoted + cmdstr = " ".join([pipes.quote(arg) for arg in cmd]) + raise Exception("Error from: " + cmdstr + ": " + cmderr) + return cmdout def getGbdbTables(database, tableset): """ Remove tables that aren't pointers to Gbdb files.""" sep = "','" tablestr = sep.join(tableset) tablestr = "'" + tablestr + "'" - - cmd = "hgsql %s -e \"select table_name from information_schema.columns where table_name in (%s) and column_name = 'fileName'\"" % (database, tablestr) - p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True) - cmdoutput = p.stdout.read() - gbdbtableset = set(cmdoutput.split("\n")[1:-1]) + hgsqlOut = callHgsql(database, "select table_name from information_schema.columns where table_name in (" + + tablestr + ") and column_name = 'fileName'") + gbdbtableset = set(hgsqlOut.split()) return gbdbtableset def sorted_nicely(l): """ Sort the given iterable in the way that humans expect.""" convert = lambda text: int(text) if text.isdigit() else text alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] return sorted(l, key = alphanum_key) -def callHgsql(database, command, options="-Ne"): - """Run hgsql command using subprocess, return stdout data if no error.""" - cmd = ["hgsql", database, options, command] - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - cmdout, cmderr = p.communicate() - if p.returncode != 0: - # keep command arguments nicely quoted - cmdstr = " ".join([pipes.quote(arg) for arg in cmd]) - raise Exception("Error from: " + cmdstr + ": " + cmderr) - return cmdout - def countPerChrom(database, tables): """ Count the amount of rows per chromosome.""" notgbdbtablelist = tables - getGbdbTables(database, tables) tablecounts = dict() output = [] globalseen = set() localseen = dict() hgsqlOut = callHgsql(database, "select chrom from chromInfo") chrlist = set(hgsqlOut.split()) notPositionalTable = set() if not notgbdbtablelist: output.append("No tables to count chroms") output.append("") return (output, tablecounts) for i in notgbdbtablelist: counts = dict() + cmd = "hgsql %s -e \"select chrom from %s\"" % (database, i) p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True) cmdoutput = p.stdout.read() chrs = cmdoutput.split("\n")[1:-1] + localseen[i] = set() if not chrs: notPositionalTable.add(i) continue for j in chrs: globalseen.add(j) if counts.has_key(j): counts[j] = counts[j] + 1 else: localseen[i].add(j) counts[j] = 1 tablecounts[i] = counts for i in sorted(tablecounts): @@ -99,65 +98,58 @@ output.append("Not a positional table:") for i in notPositionalTable: output.append(i) return (output, tablecounts) def checkTableDescriptions(database, tables): """ Check if each table has a description or not.""" tablelist = list() missing = set() output = [] orstr = "" for i in tables: tablelist.append("tableName = '%s'" % i) orsep = " OR " orstr = orsep.join(tablelist) - cmd = "hgsql %s -e \"select tableName from tableDescriptions where %s\"" % (database, orstr) - p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True) - cmdoutput = p.stdout.read() - - described = set(cmdoutput.split("\n")[1:-1]) + hgsqlOut = callHgsql(database, "select tableName from tableDescriptions where " + orstr) + described = set(hgsqlOut.split()) missing = tables - described if missing: output.append("Tables missing a description:") for i in missing: output.append(i) output.append("") else: output.append("No tables missing a description") output.append("") - return (output, missing) def checkTableIndex(database, tables): """ Check if each table has an index or not.""" notgbdbtablelist = tables - getGbdbTables(database, tables) tablelist = list() missing = set() output = [] if not notgbdbtablelist: output.append("No tables require an index") output.append("") return (output, missing) for i in notgbdbtablelist: - cmd = "hgsql %s -e \"show indexes from %s\"" % (database, i) - p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True) - cmdoutput = p.stdout.read() - - index = cmdoutput.split("\n")[1:-1] + hgsqlOut = callHgsql(database, "show indexes from " + i) + index = hgsqlOut.split() if index: pass else: missing.add(i) if missing: output.append("Tables missing an index:") for i in missing: output.append(i) output.append("") else: output.append("No missing indices") output.append("") return (output, missing)