454a424e3b2d8660132d6cafc408a03d348c9d57 rhead Mon Mar 19 13:08:51 2012 -0700 Changing to conform with new style guide: changed tabs to 4 spaces, changed interpreter directive, put imports on their own lines and removed unused imports, removed repetitive documentation from beginning. diff --git python/lib/ucscgenomics/qa.py python/lib/ucscgenomics/qa.py index 27d24eb..dfdbd80 100644 --- python/lib/ucscgenomics/qa.py +++ python/lib/ucscgenomics/qa.py @@ -1,46 +1,24 @@ -#!/hive/groups/encode/dcc/bin/python -import sys, os, re, argparse, subprocess, math, threading, datetime, time, signal +#!/usr/bin/env python2.7 +import re +import argparse +import subprocess +import datetime +import time """ - A collection of functions useful to the ENCODE QA Team. - - Programs known to be dependent on it: - mkChangeNotes - qaInit - qaEncodeTracks2 - qaGbTracks - - Functions: - getGbdbTables - takes in a database and a set of tables, - returns a set of tables that are gbdb files in reality - sorted_nicely - a neat way of alphanumerically sorting - anything sortable, a nice snippet of functional programming - Google found. - countPerChrom - takes in Db & tables, returns a list of strings for - outputting and a dictionary of counts - dict->{Table}->{Chrom} = countPerChrom - *automatically filters out Gbdbs* - checkTableDescriptions - takes in Db and tables, returns a list - of strings and a set of tables with no description - checkTableIndex - takes in Db & tables, returns a list of strings - and a set of tables with no index - *automatically filters out Gbdbs* - checkTableName - takes in a set of tables, returns string output - and a set of tables with underscores in the name - checkLabels - takes in a trackDb.ra filepath, returns a string - output list and a set of tuples, ([label, #tooLongBy],[labe...) + A collection of functions useful to the ENCODE and GB QA Teams. """ def getGbdbTables(database, tableset): """ Remove tables that aren't pointers to Gbdb files.""" sep = "','" tablestr = sep.join(tableset) tablestr = "'" + tablestr + "'" cmd = "hgsql %s -e \"select table_name from information_schema.columns where table_name in (%s) and column_name = 'fileName'\"" % (database, tablestr) p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True) cmdoutput = p.stdout.read() gbdbtableset = set(cmdoutput.split("\n")[1:-1]) return gbdbtableset @@ -55,32 +33,30 @@ notgbdbtablelist = tables - getGbdbTables(database, tables) tablecounts = dict() output = [] globalseen = set() localseen = dict() cmd = "hgsql %s -e \"select chrom from chromInfo\"" % database p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True) cmdoutput = p.stdout.read() chrlist = set(cmdoutput.split("\n")[1:-1]) notPositionalTable = set() if not chrlist: output.append("Can't get chromInfo from %s for countPerChrom" % database) return (output, tablecounts) - - if not notgbdbtablelist: output.append("No tables to count chroms") output.append("") return (output, tablecounts) for i in notgbdbtablelist: counts = dict() cmd = "hgsql %s -e \"select chrom from %s\"" % (database, i) p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True) cmdoutput = p.stdout.read() chrs = cmdoutput.split("\n")[1:-1] localseen[i] = set() if not chrs: notPositionalTable.add(i) @@ -184,31 +160,31 @@ output = [] for i in tables: if re.search('.*_.*', i): bad.add(i) if bad: output.append("These tables have underscores in the name") for i in bad: output.append(i) output.append("") else: output.append("No malformed table names") output.append("") return (output, bad) def checkLabels(trackDb): - """ Check if long and short labels are too long.""" + """ Check if long and short labels are too long, are duplicated or are auto-generated.""" f = open(trackDb, "r") lines = f.readlines() seenlabel = dict() output = [] toolong = list() p1 = re.compile('^\s*longLabel\s+(.*)$') p2 = re.compile('^\s*shortLabel\s+(.*)$') p3 = re.compile('^\s*#.*$') for i in lines: m1 = p1.match(i) m2 = p2.match(i) m3 = p3.match(i) if m3: continue if m1: @@ -323,16 +299,15 @@ else: outResults.append(i) output.append("These tables have position errors:") for i in outResults: output.append(i) if nonPositional: output.append("These tables are non-positional:") for i in nonPositional: output.append(i) output.append("") else: output.append("No position errors") output.append("") return (output, results) -