464aa35bb5735dfb8f565621dbde8b3b2ead1581 mmaddren Thu May 10 14:53:09 2012 -0700 fixed ucscGb and some of the more important scripts using it since they broke during the move to the new library structure diff --git python/programs/mkGeoPkg/mkGeoPkg python/programs/mkGeoPkg/mkGeoPkg index fdac241..96b1519 100755 --- python/programs/mkGeoPkg/mkGeoPkg +++ python/programs/mkGeoPkg/mkGeoPkg @@ -1,19 +1,26 @@ -#!/hive/groups/encode/dcc/bin/python +#!/usr/bin/env python2.7 import sys, os, shutil, stat, argparse, datetime, hashlib -from ucscgenomics import track, ra, soft, cv, mdb, geo, encode + +from ucscGb.gbData.ra.raFile import RaFile +from ucscGb.externalData.geo import submission +from ucscGb.externalData.geo import soft +from ucscGb.encode import encodeUtils +from ucscGb.encode.cv import CvFile +from ucscGb.encode.track import CompositeTrack, TrackFile + ''' mkGeoPkg - create a soft file and upload script for a given track, so that it may be sumbitted to GEO. To invoke the script, you must pass the composite and track name: mkGeoPkg hg19 wgEncodeSomeTrack This is typically not enough however; most tracks are not completely covered by their metadata, and it is necessary to supply additional information. The most commonly needed information is: !Series_summary - this is taken from the track's html page description. In most cases it can be copied, one paragraph per line. !Series_overall_design - this is taken from the Methods section on the track's page. As with summary, 1 paragraph per line. @@ -98,34 +105,34 @@ 'labVersion', 'mapAlgorithm', 'obtainedBy', 'phase', 'readType', 'region', 'replicate', 'restrictionEnzyme', 'run', 'softwareVersion', 'spikeInPool', 'strain' ] def isRawFile(file): - return (file.extension == 'fastq' or file.extension == 'fasta') + return (file.extension == 'fastq' or file.extension == 'csfasta' or file.extension == 'csqual') def isSupplimentaryFile(file): - return not isRawFile(file) + return (not isRawFile(file)) and file.extension != 'fasta' and file.extension != 'bam' and file.extension != 'bai' def sampleTitle(stanza, expVars, warn=False): concat = stanza[expVars[0]].replace('-m', '') for expVar in expVars[1:len(expVars)]: if expVar in stanza and stanza[expVar] != 'None': concat += '_' + stanza[expVar] elif warn: print 'warning: %s is None or not in %s' % (expVar, stanza.name) return concat def linkName(file, track): return '%s_%s' % (track.database, file.name) def createMappings(metadb): expIds = dict() @@ -159,31 +166,31 @@ else: # otherwise we keep track of the geo number for partially submitted samples if stanza['expId'] not in geoMapping: geoMapping[stanza['expId']] = stanza['geoSampleAccession'] elif geoMapping[stanza['expId']] != 'Inconsistent' and geoMapping[stanza['expId']] != stanza['geoSampleAccession']: geoMapping[stanza['expId']] = 'Inconsistent' print stanza.name + ': inconsistent geo mapping' if datatype == None and 'dataType' in stanza: datatype = stanza['dataType'] elif datatype != None and 'dataType' in stanza and datatype != stanza['dataType']: raise KeyError(stanza.name + ': inconsistent data type') try: dt = datatype - datatype = encode.dataTypes[dt] + datatype = encodeUtils.dataTypes[dt] datatype.name = dt except KeyError: raise KeyError(datatype) return expIds, expVars, geoMapping, series, datatype def createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace, audit, argseries): if 'geoSeriesAccession' in series: print 'Existing series ' + series['composite'] + ' using geoSeriesAccession ' + series['geoSeriesAccession'] return print 'Writing series ' + series['composite'] @@ -204,31 +211,31 @@ else: print 'no series overall design found. Please include in replace file.' seriesStanza['!Series_overall_design'] = '[REPLACE]' if audit: print seriesStanza.name + ': no overall design' seriesStanza['!Series_web_link'] = [ compositeTrack.url, 'http://www.ncbi.nlm.nih.gov/geo/info/ENCODE.html' ] if '!Series_contributor' in replace: seriesStanza['!Series_contributor'] = replace['!Series_contributor'] else: seriesStanza['!Series_contributor'] = '[REPLACE]' if audit: print seriesStanza.name + ': no contributor' - seriesStanza['!Series_gp_id'] = mdb.gpIds[compositeTrack.organism + ' ' + datatype.source] + seriesStanza['!Series_gp_id'] = encodeUtils.gpIds[compositeTrack.organism + ' ' + datatype.source] # could use !Series_variable_* and !Series_repeats_* if not argseries: seriesStanza['!Series_sample_id'] = list() for idNum in expIds.iterkeys(): if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent': seriesStanza['!Series_sample_id'].append(geoMapping[idNum]) else: seriesStanza['!Series_sample_id'].append(sampleTitle(expIds[idNum][0], expVars)) softfile[series['composite']] = seriesStanza def createHighThroughputSoftFile(compositeTrack, cv, expIds, expVars, geoMapping, series, datatype, replace, audit, tarpath, argseries): @@ -252,105 +259,111 @@ sample['^SAMPLE'] = sampleTitle(firstStanza, expVars, 1) sample['!Sample_type'] = 'SRA' sample['!Sample_title'] = sample['^SAMPLE'] if 'geoSeriesAccession' in series: sample['!Sample_series_id'] = series['geoSeriesAccession'] count = 1 #figure out if the instrument model is consistent across the entire sample instrumentModel = None for stanza in expId: if 'seqPlatform' in stanza: if instrumentModel == None: - instrumentModel = geo.instrumentModels[stanza['seqPlatform']] + instrumentModel = submission.instrumentModels[stanza['seqPlatform']] else: - if instrumentModel != geo.instrumentModels[stanza['seqPlatform']]: + if instrumentModel != submission.instrumentModels[stanza['seqPlatform']]: instrumentModel = None if audit: print 'expId' + str(expId) + ': inconsistent instrument model' break for stanza in expId: for fname in stanza['fileName'].split(','): file = compositeTrack.files[fname] filelist = list() + if file.extension == 'fasta': + print 'WARNING: fastas detected!!!' + if isRawFile(file): if file.name.endswith('.tgz') or file.name.endswith('.tar.gz'): if tarpath == None: raise IOError('this track contains tarred fastqs. Please specify a path through the -z option') dirname = tarpath + file.name.split('.')[0] + '/' if os.path.exists(dirname): - raise IOError(dirname + ' already exists') + print dirname + ' already exists, so not unzipping' else: os.mkdir(dirname) - os.system('tar -xf %s -C %s' % (file.path + file.name, dirname)) for root, dirnames, filenames in os.walk(dirname): for filename in filenames: if filename.endswith('.fastq') or filename.endswith('.txt'): os.system('gzip %s' % (root + '/' + filename)) for root, dirnames, filenames in os.walk(dirname): for filename in filenames: print root + '/' + filename - filelist.append(track.TrackFile(root + '/' + filename)) + filelist.append(TrackFile(root + '/' + filename)) else: filelist.append(file) for f in filelist: sample['!Sample_raw_file_' + str(count)] = linkName(f, compositeTrack) if f.extension == 'txt': sample['!Sample_raw_file_type_' + str(count)] = 'fastq' + elif f.extension == 'csfasta': + sample['!Sample_raw_file_type_' + str(count)] = 'SOLiD_native_csfasta' + elif f.extension == 'csqual': + sample['!Sample_raw_file_type_' + str(count)] = 'SOLiD_native_qual' else: sample['!Sample_raw_file_type_' + str(count)] = f.extension sample['!Sample_raw_file_checksum_' + str(count)] = f.md5sum if instrumentModel == None and 'seqPlatform' in stanza: - sample['!Sample_raw_file_instrument_model_' + str(count)] = geo.instrumentModels[stanza['seqPlatform']] + sample['!Sample_raw_file_instrument_model_' + str(count)] = submission.instrumentModels[stanza['seqPlatform']] fileList.append(f) count = count + 1 count = 1 for stanza in expId: for fname in stanza['fileName'].split(','): file = compositeTrack.files[fname] if isSupplimentaryFile(file): sample['!Sample_supplementary_file_' + str(count)] = linkName(file, compositeTrack) if file.md5sum != None: sample['!Sample_supplementary_file_checksum_' + str(count)] = file.md5sum sample['!Sample_supplementary_file_build_' + str(count)] = compositeTrack.database if instrumentModel == None and 'seqPlatform' in stanza: - sample['!Sample_supplementary_file_instrument_model_' + str(count)] = geo.instrumentModels[stanza['seqPlatform']] + sample['!Sample_supplementary_file_instrument_model_' + str(count)] = submission.instrumentModels[stanza['seqPlatform']] fileList.append(file) count = count + 1 sample['!Sample_source_name'] = firstStanza['cell'] sample['!Sample_organism'] = compositeTrack.organism sample['!Sample_characteristics'] = list() allVars = expVars + mdbWhitelist for var in allVars: if var in firstStanza: foobar = var sample['!Sample_characteristics'].append(var + ': ' + firstStanza[var]) for pretend in cvPretend.iterkeys(): @@ -367,60 +380,60 @@ if firstStanza[var] in cv and cvVar in cv[firstStanza[var]]: sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + cv[firstStanza[var]][cvVar]) sample['!Sample_biomaterial_provider'] = cv[firstStanza['cell']]['vendorName'] if 'treatment' in firstStanza: sample['!Sample_treatment_protocol'] = firstStanza['treatment'] if 'protocol' in cv[firstStanza['cell']]: for protocol in cv[firstStanza['cell']]['protocol'].split(' '): key, val = protocol.split(':') if key == 'ENCODE' or key == cv[firstStanza['lab']]['labPi']: sample['!Sample_growth_protocol'] = val if datatype.molecule == 'RNA': - if firstStanza['rnaExtract'] in geo.rnaExtractMapping: - sample['!Sample_molecule'] = geo.rnaExtractMapping[firstStanza['rnaExtract']] - elif firstStanza['localization'] in geo.localizationMapping: - sample['!Sample_molecule'] = geo.localizationMapping[firstStanza['localization']] + if firstStanza['rnaExtract'] in submission.rnaExtractMapping: + sample['!Sample_molecule'] = submission.rnaExtractMapping[firstStanza['rnaExtract']] + elif firstStanza['localization'] in submission.localizationMapping: + sample['!Sample_molecule'] = submission.localizationMapping[firstStanza['localization']] else: sample['!Sample_molecule'] = datatype.molecule if '!Sample_instrument_model' in replace and replace['!Sample_instrument_model'][0] == 'Unknown': - sample['!Sample_extract_protocol'] = 'Instrument model unknown. ("%s" specified by default). For more information, see %s' % (geo.instrumentModels[replace['!Sample_instrument_model'][0]], compositeTrack.url) + sample['!Sample_extract_protocol'] = 'Instrument model unknown. ("%s" specified by default). For more information, see %s' % (submission.instrumentModels[replace['!Sample_instrument_model'][0]], compositeTrack.url) else: sample['!Sample_extract_protocol'] = compositeTrack.url sample['!Sample_library_strategy'] = datatype.strategy sample['!Sample_library_source'] = datatype.source sample['!Sample_library_selection'] = datatype.selection # if the instrumentModel is consistent, just use that # otherwise take the first seqPlatform value from metadata # if that still fails, check the replacement file # finally just make it say [REPLACE] if instrumentModel != None: sample['!Sample_instrument_model'] = instrumentModel else: for stanza in expId: if 'seqPlatform' in stanza: - sample['!Sample_instrument_model'] = geo.instrumentModels[stanza['seqPlatform']] + sample['!Sample_instrument_model'] = submission.instrumentModels[stanza['seqPlatform']] break if '!Sample_instrument_model' not in sample: if '!Sample_instrument_model' in replace: - sample['!Sample_instrument_model'] = geo.instrumentModels[replace['!Sample_instrument_model'][0]] + sample['!Sample_instrument_model'] = submission.instrumentModels[replace['!Sample_instrument_model'][0]] if '!Sample_instrument_model' not in sample: sample['!Sample_instrument_model'] = '[REPLACE]' if audit: print stanza.name + ': no instrument' sample['!Sample_data_processing'] = compositeTrack.url if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent': sample['!Sample_geo_accession'] = geoMapping[idNum] softfile[firstStanza['metaObject']] = sample return softfile, fileList @@ -557,34 +570,34 @@ parser.add_argument('-t', '--trackPath', help='Overrides the default track path ~/kent/src/hg/makeDb/trackDb/') parser.add_argument('-r', '--replace', help='Give the name of a file that has contents to be used to replace unspecified tags in metadata (description, contributers, etc) and instrument model') parser.add_argument('-a', '--audit', action='store_true', default=False, help='Instead of building the files, will just give you a list of errors') parser.add_argument('-z', '--zip', help='Specifies a directory path to unzip tarred fastqs to, only applicable for tracks with tarred fastqs') parser.add_argument('-s', '--series', action='store_true', default=False, help='Only generates the series stanza, instead of generating the entire soft file') parser.add_argument('database', help='The database, typically hg19 or mm9') parser.add_argument('composite', help='The composite name, wgEncodeCshlLongRnaSeq for instance') parser.add_argument('expIds', nargs='*', help='Any number of expIds separated by spaces, you can also specify a range by using a hyphen, "140 150 160-170" for instance, or leave blank to specify the entire file') if len(sys.argv) == 1: parser.print_usage() return args = parser.parse_args(sys.argv[1:]) - compositeTrack = track.CompositeTrack(args.database, args.composite, args.trackPath) + compositeTrack = CompositeTrack(args.database, args.composite, args.trackPath) cvPath = compositeTrack.trackPath + 'cv/alpha/cv.ra' - controlledVocab = cv.CvFile(cvPath) + controlledVocab = CvFile(cvPath) if args.zip != None and not args.zip.endswith('/'): args.zip += '/' replace = dict() if args.replace != None: for line in open(args.replace): if line == '': continue key, val = map(str.strip, line.split('=', 1)) if key not in replace: replace[key] = list() replace[key].append(val) tempids = list()