8ee187d7188ba8de29781dd6c2497f69056c3ccc rhead Wed Apr 25 11:52:47 2012 -0700 Moved qa.py to qa/encode/tableCheck.py. diff --git python/lib/ucscgenomics/qa.py python/lib/ucscgenomics/qa.py deleted file mode 100644 index f0be336..0000000 --- python/lib/ucscgenomics/qa.py +++ /dev/null @@ -1,312 +0,0 @@ -#!/usr/bin/env python2.7 -import re -import argparse -import subprocess -import datetime -import time -import pipes - -""" - A collection of functions useful to the ENCODE and GB QA Teams. -""" - -def callHgsql(database, command): - """ Run hgsql command using subprocess, return stdout data if no error.""" - cmd = ["hgsql", database, "-Ne", command] - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - cmdout, cmderr = p.communicate() - if p.returncode != 0: - # keep command arguments nicely quoted - cmdstr = " ".join([pipes.quote(arg) for arg in cmd]) - raise Exception("Error from: " + cmdstr + ": " + cmderr) - return cmdout - -def getGbdbTables(database, tableset): - """ Remove tables that aren't pointers to Gbdb files.""" - sep = "','" - tablestr = sep.join(tableset) - tablestr = "'" + tablestr + "'" - hgsqlOut = callHgsql(database, "select table_name from information_schema.columns where table_name in (" - + tablestr + ") and column_name = 'fileName'") - gbdbtableset = set(hgsqlOut.split()) - return gbdbtableset - -def sorted_nicely(l): - """ Sort the given iterable in the way that humans expect.""" - convert = lambda text: int(text) if text.isdigit() else text - alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] - return sorted(l, key = alphanum_key) - -def countPerChrom(database, tables): - """ Count the amount of rows per chromosome.""" - notgbdbtablelist = tables - getGbdbTables(database, tables) - tablecounts = dict() - output = [] - globalseen = set() - localseen = dict() - - hgsqlOut = callHgsql(database, "select chrom from chromInfo") - chrlist = set(hgsqlOut.split()) - - notPositionalTable = set() - if not notgbdbtablelist: - output.append("No tables to count chroms") - output.append("") - return (output, tablecounts) - for i in notgbdbtablelist: - counts = dict() - - cmd = "hgsql %s -e \"select chrom from %s\"" % (database, i) - p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True) - cmdoutput = p.stdout.read() - - chrs = cmdoutput.split("\n")[1:-1] - - localseen[i] = set() - - if not chrs: - notPositionalTable.add(i) - continue - for j in chrs: - globalseen.add(j) - if counts.has_key(j): - counts[j] = counts[j] + 1 - else: - localseen[i].add(j) - counts[j] = 1 - tablecounts[i] = counts - - for i in sorted(tablecounts): - output.append(i) - used = set() - for j in sorted_nicely(tablecounts[i]): - output.append("%s = %s" % (j, tablecounts[i][j])) - - notused = chrlist - (localseen[i] | (chrlist - globalseen)) - if notused: - output.append("Seen by others, but not used here:") - for j in sorted_nicely(notused): - output.append(j) - output.append("") - globalnotused = chrlist - globalseen - if globalnotused: - output.append("Not seen anywhere:") - for i in sorted_nicely(globalnotused): - output.append(i) - output.append("") - if notPositionalTable: - output.append("Not a positional table:") - for i in notPositionalTable: - output.append(i) - return (output, tablecounts) - -def checkTableDescriptions(database, tables): - """ Check if each table has a description or not.""" - tablelist = list() - missing = set() - output = [] - orstr = "" - for i in tables: - tablelist.append("tableName = '%s'" % i) - orsep = " OR " - orstr = orsep.join(tablelist) - hgsqlOut = callHgsql(database, "select tableName from tableDescriptions where " + orstr) - described = set(hgsqlOut.split()) - missing = tables - described - if missing: - output.append("Tables missing a description:") - for i in missing: - output.append(i) - output.append("") - else: - output.append("No tables missing a description") - output.append("") - return (output, missing) - -def checkTableIndex(database, tables): - """ Check if each table has an index or not.""" - notgbdbtablelist = tables - getGbdbTables(database, tables) - tablelist = list() - missing = set() - output = [] - - if not notgbdbtablelist: - output.append("No tables require an index") - output.append("") - return (output, missing) - - for i in notgbdbtablelist: - hgsqlOut = callHgsql(database, "show indexes from " + i) - index = hgsqlOut.split() - if index: - pass - else: - missing.add(i) - if missing: - output.append("Tables missing an index:") - for i in missing: - output.append(i) - output.append("") - else: - output.append("No missing indices") - output.append("") - - return (output, missing) - -def checkTableName(tables): - """ Check if table name has an underscore or not.""" - bad = set() - output = [] - for i in tables: - if re.search('.*_.*', i): - bad.add(i) - if bad: - output.append("These tables have underscores in the name") - for i in bad: - output.append(i) - output.append("") - else: - output.append("No malformed table names") - output.append("") - return (output, bad) - -def checkLabels(trackDb): - """ Check if long and short labels are too long, are duplicated or are auto-generated.""" - f = open(trackDb, "r") - lines = f.readlines() - seenlabel = dict() - output = [] - toolong = list() - p1 = re.compile('^\s*longLabel\s+(.*)$') - p2 = re.compile('^\s*shortLabel\s+(.*)$') - p3 = re.compile('^\s*#.*$') - for i in lines: - m1 = p1.match(i) - m2 = p2.match(i) - m3 = p3.match(i) - if m3: - continue - if m1: - if seenlabel.has_key(m1.group(1)): - seenlabel[m1.group(1)] = seenlabel[m1.group(1)] + 1 - else: - seenlabel[m1.group(1)] = 1 - if re.search('autogenerated', m1.group(1)): - toolong.append([m1.group(1), -1]) - output.append("longLabel '%s' is still autogenerated, please tell the wrangler to fix this" % m1.group(1)) - if len(m1.group(1)) > 80: - toolong.append([m1.group(1), len(m1.group(1))]) - output.append("longLabel '%s' is too long: %s" % (m1.group(1), len(m1.group(1)))) - if m2: - #short labels are allowed to repeat - #if seenlabel.has_key(m2.group(1)): - #seenlabel[m2.group(1)] = seenlabel[m2.group(1)] + 1 - #else: - #seenlabel[m2.group(1)] = 1 - if len(m2.group(1)) > 17: - toolong.append([m2.group(1), len(m2.group(1))]) - output.append("shortLabel '%s' is too long: %s" % (m2.group(1), len(m2.group(1)))) - for i in seenlabel: - if seenlabel[i] > 1: - output.append("%s label seen more than once: %s" % (i, seenlabel[i])) - - if output: - output.insert(0,"Label errors:") - output.append("") - else: - output.append("No labels are incorrect") - output.append("") - - return (output, toolong) - - -def checkTableCoords(database, tables): - """Runs checkTableCoords externally against a set of tables, timeout is 10 seconds""" - notgbdbtablelist = tables - getGbdbTables(database, tables) - results = [] - output = [] - - if not notgbdbtablelist: - output.append("No tables have coordinates") - output.append("") - return (output, results) - - - timeout = 20 - for i in sorted(notgbdbtablelist): - start = datetime.datetime.now() - cmd = "checkTableCoords %s %s" % (database, i) - p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) - killed = 0 - while p.poll() is None: - time.sleep(0.1) - now = datetime.datetime.now() - if (now - start).seconds > timeout: - p.kill() - killed = 1 - if not killed: - cmdoutput = p.stdout.read() - cmderr = p.stderr.read() - - if cmdoutput: - results.append(cmdoutput) - if cmderr: - results.append(cmderr) - elif killed: - results.append("Process timeout after %d seconds, for table: %s" % (timeout, i)) - results.append("You might want to manually run: '%s'" % cmd) - results.append("") - - if results: - output.append("These tables have coordinate errors:") - for i in results: - output.append(i) - else: - output.append("No coordinate errors") - output.append("") - return (output, results) - -def positionalTblCheck(database, tables): - notgbdbtablelist = tables - getGbdbTables(database, tables) - - - results = [] - output = [] - - if not notgbdbtablelist: - output.append("No tables are positional") - output.append("") - return (output, results) - - for i in notgbdbtablelist: - cmd = "positionalTblCheck %s %s" % (database, i) - p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) - cmdoutput = p.stdout.read() - cmderr = p.stderr.read() - if cmdoutput: - results.append(cmdoutput) - if cmderr: - results.append(cmderr) - if results: - p = re.compile('(.*)does not appear to be a positional table') - outResults = list() - nonPositional = list() - for i in results: - m = p.search(i) - if m: - nonPositional.append(m.group(1)) - else: - outResults.append(i) - - output.append("These tables have position errors:") - for i in outResults: - output.append(i) - if nonPositional: - output.append("These tables are non-positional:") - for i in nonPositional: - output.append(i) - output.append("") - else: - output.append("No position errors") - output.append("") - return (output, results)