#!/hive/groups/encode/dcc/bin/python
#
# trackInfo - report, per experiment id, which files of a composite track are
# present, missing, or already carry a GEO accession.
#
# Source path: python/programs/mkGeoPkg/trackInfo
# Revision:    3c096e3bcb20f44cd5f5dd0ea02b9438f5c45059
#              mmaddren, Mon Aug 1 14:00:20 2011 -0700
#              "added commandline functionality to mkGeoPkg and trackInfo.
#              also changed library files to now point at the correct package."
import sys, os, shutil, argparse
from ucscgenomics.rafile.RaFile import *
from ucscgenomics.softfile.SoftFile import *
from ucscgenomics.cvfile.CvFile import *
from ucscgenomics.compositetrack.CompositeTrack import *
from ucscgenomics.textstyle.TextStyle import TextStyle


def filesize(val):
    """Format a byte count as a short string with a binary-unit suffix.

    Values strictly greater than a unit's size are expressed in that unit,
    rounded to two decimals ('TB', 'GB', 'MB', 'KB'); anything else is
    returned as the plain count followed by 'B'.
    """
    if val > 1099511627776:
        return str(round(float(val) / 1099511627776, 2)) + 'TB'
    if val > 1073741824:
        return str(round(float(val) / 1073741824, 2)) + 'GB'
    if val > 1048576:
        return str(round(float(val) / 1048576, 2)) + 'MB'
    if val > 1024:
        return str(round(float(val) / 1024, 2)) + 'KB'
    return str(val) + 'B'


def getFileType(filename):
    """Return the extension of filename without the dot, ignoring a '.gz' suffix.

    'x.fastq.gz' and 'x.fastq' both give 'fastq'. A name with no extension
    gives '' (the previous implementation raised IndexError in that case).
    """
    # str methods return new strings; the stripped name has to be rebound.
    if filename.endswith('.gz'):
        filename = filename[:-len('.gz')]
    # splitext keeps the leading dot on the extension; drop it.
    return os.path.splitext(filename)[1][1:]


def isRawFile(filename):
    """Return True when filename is a fastq or fasta file (optionally gzipped)."""
    return getFileType(filename) in ('fastq', 'fasta')


def isSupplimentaryFile(filename):
    """Return True for every file that isRawFile() does not accept."""
    return not isRawFile(filename)


def createMappings(mdb):
    """Group the stanzas of a metaDb by experiment id.

    mdb must provide itervalues() yielding dict-like stanzas.

    Returns a tuple (expIds, geoMapping, series):
      expIds     - dict mapping int expId to the list of stanzas carrying it
      geoMapping - dict mapping int expId to its 'geoSampleAccession', or to
                   the string 'Inconsistent' when the stanzas of one
                   experiment disagree on the accession
      series     - the last stanza whose objType is 'composite', or None
    """
    expIds = dict()
    geoMapping = dict()
    series = None

    for stanza in mdb.itervalues():
        if 'objType' in stanza and stanza['objType'] == 'composite':
            series = stanza
            continue

        # stanzas without an experiment id cannot be grouped
        if 'expId' not in stanza:
            continue

        expId = int(stanza['expId'])
        expIds.setdefault(expId, list()).append(stanza)

        if 'geoSampleAccession' in stanza:
            # remember the accession of (partially) submitted samples and
            # flag experiments whose stanzas disagree on it
            accession = stanza['geoSampleAccession']
            if expId not in geoMapping:
                geoMapping[expId] = accession
            elif geoMapping[expId] != 'Inconsistent' and geoMapping[expId] != accession:
                geoMapping[expId] = 'Inconsistent'

    return expIds, geoMapping, series


def _buildParser():
    """Create the command-line parser for this program."""
    parser = argparse.ArgumentParser(
        description = 'Provides information about a composite track.\nRed - Missing\nBlue - Already submitted\nYellow - Inconsistent GEO Accession per sample\nGreen - GEO Accession Number\nWhite - Unsubmitted file',
        # keep the line breaks of the colour legend in --help output
        formatter_class = argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-u', '--unsubmitted', action='store_true', default=False, help='Do not list samples that have already been submitted')
    parser.add_argument('-m', '--missing', action='store_true', default=False, help='List only missing files')
    parser.add_argument('-s', '--size', action='store_true', default=False, help='Show file sizes')
    parser.add_argument('-t', '--trackPath', help='Overrides the default track path ~/kent/src/hg/makeDb/trackDb/')
    parser.add_argument('database', help='The database, typically hg19 or mm9')
    parser.add_argument('composite', help='The composite name, wgEncodeCshlLongRnaSeq for instance')
    parser.add_argument('expIds', nargs='*', help='Any number of expIds separated by spaces, you can also specify a range by using a hyphen, "140 150 160-170" for instance, or leave blank to specify the entire file')
    return parser


def _expandExpIds(tokens):
    """Expand tokens such as ['140', '160-162'] into [140, 160, 161, 162].

    Raises ValueError when a token (or a range bound) is not an integer.
    """
    ids = list()
    for token in tokens:
        if '-' in token:
            start, end = token.split('-', 1)
            ids.extend(range(int(start), int(end) + 1))
        else:
            ids.append(int(token))
    return ids


def _sampleReport(idNum, stanzas, geoMapping, compositeTrack, args):
    """Build the output lines for one experiment.

    Returns (lines, samplesize, samplefiles): the header line followed by one
    line per listed file, the summed size of the files found, and how many
    files were found. Files are only counted and sized when --missing is off.
    NOTE(review): assumes compositeTrack.files maps fileName to objects with
    .name and .size, as the original code used them - confirm against
    CompositeTrack.
    """
    fileLines = list()
    samplesize = 0
    samplefiles = 0

    for stanza in stanzas:
        if args.unsubmitted and 'geoSampleAccession' in stanza:
            continue

        fileName = stanza['fileName']
        if fileName not in compositeTrack.files:
            # no file object exists for a missing file, so report the name
            # recorded in the metaDb stanza
            fileLines.append('\t\t%s' % TextStyle.style(fileName, 'red'))
            continue

        # --missing lists only the files that are absent
        if args.missing:
            continue

        trackFile = compositeTrack.files[fileName]
        samplesize = samplesize + trackFile.size
        samplefiles = samplefiles + 1

        size = ''
        if args.size:
            size = '[%s]' % filesize(trackFile.size)

        if 'geoSampleAccession' not in stanza:
            fileLines.append('\t\t%s %s' % (trackFile.name, size))
        elif idNum in geoMapping and geoMapping[idNum] == 'Inconsistent':
            # show each file's own accession so the disagreement is visible
            fileLines.append('\t\t%s %s%s' % (TextStyle.style(trackFile.name, 'blue'), TextStyle.style('[%s]' % stanza['geoSampleAccession'], 'green'), size))
        else:
            fileLines.append('\t\t%s %s' % (TextStyle.style(trackFile.name, 'blue'), size))

    size = ''
    if args.size:
        size = '[%s]' % filesize(samplesize)

    label = stanzas[0]['metaObject']
    if idNum in geoMapping:
        color = 'yellow' if geoMapping[idNum] == 'Inconsistent' else 'blue'
        header = '\t%s %s %s%s - %s files' % (str(idNum), TextStyle.style(label, color), TextStyle.style('[%s]' % geoMapping[idNum], 'green'), size, str(samplefiles))
    else:
        header = '\t%s %s %s - %s files' % (str(idNum), label, size, str(samplefiles))

    return [header] + fileLines, samplesize, samplefiles


def main():
    """Parse the command line and print the status report for one composite."""
    parser = _buildParser()

    if len(sys.argv) == 1:
        parser.print_usage()
        return

    args = parser.parse_args(sys.argv[1:])

    try:
        requested = _expandExpIds(args.expIds)
    except ValueError:
        # parser.error prints the usage plus this message and exits
        parser.error('expIds must be integers or ranges such as 160-170')

    compositeTrack = CompositeTrack(args.database, args.composite, args.trackPath)
    expIds, geoMapping, series = createMappings(compositeTrack.metaDb)

    if requested:
        # a range may cover ids that do not exist in the metaDb; skip those
        # instead of failing on the lookup
        unknown = [idNum for idNum in requested if idNum not in expIds]
        if unknown:
            sys.stderr.write('skipping expIds not found in the metaDb: %s\n' % ', '.join(str(idNum) for idNum in unknown))
        ids = [idNum for idNum in requested if idNum in expIds]
    else:
        # sorted so the report order does not depend on dict ordering
        ids = sorted(expIds)

    out = list()
    totalsize = 0
    filecount = 0

    for idNum in ids:
        lines, samplesize, samplefiles = _sampleReport(idNum, expIds[idNum], geoMapping, compositeTrack, args)
        out.extend(lines)
        totalsize = totalsize + samplesize
        filecount = filecount + samplefiles

    strsub = TextStyle.style('[Unsubmitted]', 'blue')
    # series stays None when the metaDb has no composite stanza
    if series is not None and 'geoSeriesAccession' in series:
        strsub = TextStyle.style('[%s]' % series['geoSeriesAccession'], 'green')

    modestr = ','.join(args.expIds)

    size = ''
    if args.size:
        size = '[%s]' % filesize(totalsize)

    out.insert(0, '%s %s%s%s - %s files' % (compositeTrack.name, size, strsub, modestr, str(filecount)))

    for line in out:
        print(line)


if __name__ == '__main__':
    main()