c6f909d8467773eeeb57ae68f0be20c697948645 rhead Wed Mar 21 11:49:24 2012 -0700 Made callHgsql function. diff --git python/lib/ucscgenomics/qa.py python/lib/ucscgenomics/qa.py index dfdbd80..0df2044 100644 --- python/lib/ucscgenomics/qa.py +++ python/lib/ucscgenomics/qa.py @@ -1,62 +1,69 @@ #!/usr/bin/env python2.7 import re import argparse import subprocess import datetime import time +import pipes """ A collection of functions useful to the ENCODE and GB QA Teams. """ def getGbdbTables(database, tableset): """ Remove tables that aren't pointers to Gbdb files.""" sep = "','" tablestr = sep.join(tableset) tablestr = "'" + tablestr + "'" cmd = "hgsql %s -e \"select table_name from information_schema.columns where table_name in (%s) and column_name = 'fileName'\"" % (database, tablestr) p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True) cmdoutput = p.stdout.read() gbdbtableset = set(cmdoutput.split("\n")[1:-1]) return gbdbtableset def sorted_nicely(l): """ Sort the given iterable in the way that humans expect.""" convert = lambda text: int(text) if text.isdigit() else text alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] return sorted(l, key = alphanum_key) +def callHgsql(database, command, options="-Ne"): + """Run hgsql command using subprocess, return stdout data if no error.""" + cmd = ["hgsql", database, options, command] + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + cmdout, cmderr = p.communicate() + if p.returncode != 0: + # keep command arguments nicely quoted + cmdstr = " ".join([pipes.quote(arg) for arg in cmd]) + raise Exception("Error from: " + cmdstr + ": " + cmderr) + return cmdout + def countPerChrom(database, tables): """ Count the amount of rows per chromosome.""" notgbdbtablelist = tables - getGbdbTables(database, tables) tablecounts = dict() output = [] globalseen = set() localseen = dict() - cmd = "hgsql %s -e \"select chrom from chromInfo\"" % database - p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True) - cmdoutput = p.stdout.read() + hgsqlOut = callHgsql(database, "select chrom from chromInfo") + chrlist = set(hgsqlOut.split()) - chrlist = set(cmdoutput.split("\n")[1:-1]) notPositionalTable = set() - if not chrlist: - output.append("Can't get chromInfo from %s for countPerChrom" % database) - return (output, tablecounts) if not notgbdbtablelist: output.append("No tables to count chroms") output.append("") return (output, tablecounts) for i in notgbdbtablelist: counts = dict() cmd = "hgsql %s -e \"select chrom from %s\"" % (database, i) p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True) cmdoutput = p.stdout.read() chrs = cmdoutput.split("\n")[1:-1] localseen[i] = set() if not chrs: notPositionalTable.add(i)