3d3123fbf82fccd2e122c98eb19ebb8aea7d0968 mmaddren Mon Aug 15 14:49:46 2011 -0700 mkGeoPkg/trackInfo incremental update. cvValidate added duplicate key detection. diff --git python/programs/mkGeoPkg/mkGeoPkg python/programs/mkGeoPkg/mkGeoPkg index a028712..5dd0ec7 100755 --- python/programs/mkGeoPkg/mkGeoPkg +++ python/programs/mkGeoPkg/mkGeoPkg @@ -1,514 +1,575 @@ #!/hive/groups/encode/dcc/bin/python import sys, os, shutil, stat, argparse, datetime from ucscgenomics.compositetrack.CompositeTrack import * from ucscgenomics.rafile.RaFile import * from ucscgenomics.softfile.SoftFile import * from ucscgenomics.cvfile.CvFile import * class DataType(object): def __init__(self, molecule, strategy, source, selection, soft): self.molecule = molecule self.strategy = strategy self.source = source self.selection = selection self.soft = soft datatypes = { 'Cage': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'CAGE', HighThroughputSoftFile), 'ChipSeq': DataType('genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP', HighThroughputSoftFile), 'DnaPet': DataType('genomic DNA', 'OTHER', 'genomic', 'size fractionation', HighThroughputSoftFile), 'DnaseDgf': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', HighThroughputSoftFile), 'DnaseSeq': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', HighThroughputSoftFile), 'FaireSeq': DataType('genomic DNA', 'OTHER', 'genomic', 'other', HighThroughputSoftFile), 'MethylSeq': DataType('genomic DNA', 'MRE-Seq', 'genomic', 'Restriction Digest', HighThroughputSoftFile), 'MethylRrbs': DataType('genomic DNA', 'Bisulfite-Seq', 'genomic', 'Reduced Representation', HighThroughputSoftFile), 'Orchid': DataType('genomic DNA', 'OTHER', 'genomic', 'other', HighThroughputSoftFile), 'Proteogenomics': DataType('protein', 'mass spectrometry-based proteogenomic mapping', 'protein', 'chromatographically fractionated peptides', HighThroughputSoftFile), 'RnaPet': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'other', HighThroughputSoftFile), 'RnaSeq': DataType('OVERRIDE RNA', 'RNA-Seq', 'transcriptomic', 'cDNA', HighThroughputSoftFile), #these need to be curated '5C': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'AffyExonArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', MicroArraySoftFile), 'Bip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Cluster': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Cnv': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Combined': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Genotype': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Gencode': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'ChiaPet': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Mapability': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'MethylArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'NRE': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Nucleosome': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RnaChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RipGeneSt': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RipTiling': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RipChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RipSeq': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Switchgear': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'TfbsValid': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None) } + +#compare this to the source in datatype, give GP ids depending on the type +gpIds = { + 'genomic': '63443', + 'transcriptomic': '30709', + 'protein': '63447' +} + cvDetails = { 'cell': [ 'organism', 'description', 'karyotype', 'lineage', 'sex' ], 'antibody': [ 'antibodyDescription', 'targetDescription', 'vendorName', 'vendorId' ] } #if the term appears in the mdb and must overriding the value in the cv cvOverride = [ 'sex' ] #talk to Venkat lol cvPretend = { 'antibody Input': 'control' } #if its not in cvDetails, which things should we check by default cvDefaults = [ 'description' ] mdbWhitelist = [ 'age', 'bioRep', 'control', 'controlId', 'fragSize', 'labExpId', 'labVersion', 'mapAlgorithm', 'obtainedBy', 'phase', 'readType', 'region', 'replicate', 'restrictionEnzyme', 'run', 'softwareVersion', 'spikeInPool', 'strain' ] # if the molecule is RNA, we need to map our data into !Sample_molecule, which only takes certain fields # first we check rnaExtractMapping. If its not there, we use the localization. This is because (at current) # polyA is the most important trait, otherwise its going to be nonPolyA which GEO doesn't accept that. rnaExtractMapping = { 'shortPolyA': 'polyA RNA', 'longPolyA': 'polyA RNA', 'polyA': 'polyA RNA' } localizationMapping = { 'cytosol': 'cytoplasmic RNA', 'polysome': 'cytoplasmic RNA', 'membraneFraction': 'cytoplasmic RNA', 'mitochondria': 'cytoplasmic RNA', 'nucleus': 'nuclear RNA', 'nucleolus': 'nuclear RNA', 'nucleoplasm': 'nuclear RNA', 'nuclearMatrix': 'nuclear RNA', 'chromatin': 'nuclear RNA', 'cell': 'total RNA' } # map our instrument names to GEO's names instrumentModels = { 'Illumina_GA2x': 'Illumina Genome Analyzer II', 'Illumina_GA2': 'Illumina Genome Analyzer II', 'Illumina_HiSeq_2000': 'Illumina HiSeq 2000' } def isRawFile(file): return (file.extension == 'fastq' or file.extension == 'fasta') def isSupplimentaryFile(file): return not isRawFile(file) def sampleTitle(stanza, expVars): concat = stanza[expVars[0]].replace('-m', '') for expVar in expVars[1:len(expVars)]: + if expVar in stanza: concat += '_' + stanza[expVar] + else: + print 'warning: %s not in %s' % (expVar, stanza.name) return concat def createMappings(mdb): expIds = dict() geoMapping = dict() expVars = None series = None datatype = None for stanza in mdb.itervalues(): if 'objType' in stanza and stanza['objType'] == 'composite': series = stanza expVars = stanza['expVars'].split(',') continue if 'expId' not in stanza: print stanza.name + ': no expId' continue if 'geoSampleAccession' not in stanza: # if this hasn't been submitted to GEO yet, we'll add it to the submission list if stanza['expId'] not in expIds: expIds[stanza['expId']] = list() expIds[stanza['expId']].append(stanza) else: # otherwise we keep track of the geo number for partially submitted samples if stanza['expId'] not in geoMapping: geoMapping[stanza['expId']] = stanza['geoSampleAccession'] elif geoMapping[stanza['expId']] != 'Inconsistent' and geoMapping[stanza['expId']] != stanza['geoSampleAccession']: geoMapping[stanza['expId']] = 'Inconsistent' print stanza.name + ': inconsistent geo mapping' if datatype == None and 'dataType' in stanza: datatype = stanza['dataType'] elif datatype != None and 'dataType' in stanza and datatype != stanza['dataType']: raise KeyError(stanza.name + ': inconsistent data type') datatype = datatypes[datatype] return expIds, expVars, geoMapping, series, datatype -def createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series): +def createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace): if 'geoSeriesAccession' in series: print 'Existing series ' + series['composite'] + ' using geoSeriesAccession ' + series['geoSeriesAccession'] return print 'Writing series ' + series['composite'] seriesStanza = SeriesStanza() seriesStanza['^SERIES'] = series['composite'] seriesStanza['!Series_title'] = compositeTrack.trackDb[compositeTrack.name]['longLabel'] #STILL INCORRECT + + if '!Series_summary' in replace: + seriesStanza['!Series_summary'] = replace['!Series_summary'] + else: + raise Error('no series summary found. Please include in replace file.') seriesStanza['!Series_summary'] = '[REPLACE]' + + if '!Series_overall_design' in replace: + seriesStanza['!Series_overall_design'] = replace['!Series_overall_design'] + else: + raise Error('no series overall design found. Please include in replace file.') seriesStanza['!Series_overall_design'] = '[REPLACE]' + seriesStanza['!Series_web_link'] = [ compositeTrack.url, 'http://www.ncbi.nlm.nih.gov/geo/info/ENCODE.html' ] + + if '!Series_contributor' in replace: + seriesStanza['!Series_contributor'] = replace['!Series_contributor'] + else: seriesStanza['!Series_contributor'] = '[REPLACE]' - seriesStanza['!Series_gp_id'] = '[REPLACE]' + + seriesStanza['!Series_gp_id'] = gpIds[datatype.source] # could use !Series_variable_* and !Series_repeats_* seriesStanza['!Series_sample_id'] = list() for idNum in expIds.iterkeys(): if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent': seriesStanza['!Series_sample_id'].append(geoMapping[idNum]) else: seriesStanza['!Series_sample_id'].append(sampleTitle(expIds[idNum][0], expVars)) softfile[series['composite']] = seriesStanza -def createHighThroughputSoftFile(compositeTrack, cv, expIds, expVars, geoMapping, series, datatype): +def createHighThroughputSoftFile(compositeTrack, cv, expIds, expVars, geoMapping, series, datatype, instrument, replace): print 'Creating HighThroughput soft file' - softfile = SoftFile() + softfile = HighThroughputSoftFile() fileList = list() - createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series) + createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace) for idNum in expIds.iterkeys(): expId = expIds[idNum] firstStanza = expId[0] print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')' sample = HighThroughputSampleStanza() sample['^SAMPLE'] = sampleTitle(firstStanza, expVars) sample['!Sample_type'] = 'SRA' sample['!Sample_title'] = sample['^SAMPLE'] if 'geoSeriesAccession' in series: sample['!Sample_series_id'] = series['geoSeriesAccession'] count = 1 for stanza in expId: file = compositeTrack.files[stanza['fileName']] if isRawFile(file): sample['!Sample_raw_file_' + str(count)] = file.name sample['!Sample_raw_file_type_' + str(count)] = file.extension if file.md5sum != None: sample['!Sample_raw_file_checksum_' + str(count)] = file.md5sum fileList.append(file) count = count + 1 count = 1 for stanza in expId: file = compositeTrack.files[stanza['fileName']] if isSupplimentaryFile(file): sample['!Sample_supplementary_file_' + str(count)] = file.name if file.md5sum != None: sample['!Sample_supplementary_file_checksum_' + str(count)] = file.md5sum sample['!Sample_supplementary_file_build_' + str(count)] = compositeTrack.database fileList.append(file) count = count + 1 sample['!Sample_source_name'] = firstStanza['cell'] sample['!Sample_organism'] = compositeTrack.organism sample['!Sample_characteristics'] = list() allVars = expVars + mdbWhitelist for var in allVars: if var in firstStanza: foobar = var sample['!Sample_characteristics'].append(var + ': ' + firstStanza[var]) for pretend in cvPretend.iterkeys(): if var + ' ' + firstStanza[var] == pretend: foobar = cvPretend[pretend] if foobar in cvDetails: for cvVar in cvDetails[foobar]: if cvVar in cvOverride and cvVar in firstStanza: sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + firstStanza[cvVar]) elif cvVar in cv[firstStanza[var]]: sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + cv[firstStanza[var]][cvVar]) else: for cvVar in cvDefaults: if firstStanza[var] in cv and cvVar in cv[firstStanza[var]]: sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + cv[firstStanza[var]][cvVar]) sample['!Sample_biomaterial_provider'] = cv[firstStanza['cell']]['vendorName'] if 'treatment' in firstStanza: sample['!Sample_treatment_protocol'] = firstStanza['treatment'] if 'protocol' in cv[firstStanza['cell']]: for protocol in cv[firstStanza['cell']]['protocol'].split(' '): key, val = protocol.split(':') if key == 'ENCODE' or key == cv[firstStanza['lab']]['labPi']: sample['!Sample_growth_protocol'] = val if datatype.molecule == 'OVERRIDE RNA': if firstStanza['rnaExtract'] in rnaExtractMapping: sample['!Sample_molecule'] = rnaExtractMapping[firstStanza['rnaExtract']] elif firstStanza['localization'] in localizationMapping: sample['!Sample_molecule'] = localizationMapping[firstStanza['localization']] else: sample['!Sample_molecule'] = datatype.molecule sample['!Sample_extract_protocol'] = compositeTrack.url sample['!Sample_library_strategy'] = datatype.strategy sample['!Sample_library_source'] = datatype.source sample['!Sample_library_selection'] = datatype.selection - # set to replace for if nothing has a seqPlatform + # set to replace for if nothing has a seqPlatform and no instrument model is specified. sample['!Sample_instrument_model'] = '[REPLACE]' for stanza in expId: if 'seqPlatform' in stanza: sample['!Sample_instrument_model'] = instrumentModels[stanza['seqPlatform']] + break + elif instrument != None: + sample['!Sample_instrument_model'] = instrumentModels[instrument] + break + if sample['!Sample_instrument_model'] == '[REPLACE]': + if '!Sample_instrument_model' in replace: + sample['!Sample_instrument_model'] = replace['!Sample_instrument_model'] sample['!Sample_data_processing'] = compositeTrack.url if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent': sample['!Sample_geo_accession'] = geoMapping[idNum] softfile[firstStanza['metaObject']] = sample return softfile, fileList def createMicroArraySoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype): + raise KeyError('microarray') + print 'Creating MicroArray soft file' softfile = SoftFile() fileList = list() createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series) for idNum in expIds.iterkeys(): expId = expIds[idNum] firstStanza = expId[0] print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')' sample = MicroArraySampleStanza() sample['^SAMPLE'] = sampleTitle(firstStanza, expVars) if 'geoSeriesAccession' in series: sample['!Sample_series_id'] = series['geoSeriesAccession'] sample['!Sample_title'] = concat count = 1 for stanza in expId: if isSupplimentaryFile(stanza['fileName']): sample['!Sample_supplementary_file_' + str(count)] = stanza['fileName'] if 'checksum' in stanza: sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum'] elif md5sums != None and stanza['fileName'] in md5sums: sample['!Sample_supplementary_file_checksum_' + str(count)] = md5sums[stanza['fileName']] fileList.append(stanza['fileName']) count = count + 1 # sample['!Sample_table'] = KeyOptional # CEL file sample['!Sample_source_name_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered - sample['!Sample_organism_ch'] = '[REPLACE]' #KeyOnePlusNumbered - sample['!Sample_characteristics_ch'] = '[REPLACE]' #KeyOnePlusNumbered + sample['!Sample_organism_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered + sample['!Sample_characteristics_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered # sample['!Sample_biomaterial_provider_ch'] = KeyZeroPlusNumbered # sample['!Sample_treatment_protocol_ch'] = KeyZeroPlusNumbered # sample['!Sample_growth_protocol_ch'] = KeyZeroPlusNumbered sample['!Sample_molecule_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered sample['!Sample_extract_protocol_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered sample['!Sample_label_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered sample['!Sample_label_protocol_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered sample['!Sample_hyb_protocol'] = '[REPLACE]' #KeyOnePlus sample['!Sample_scan_protocol'] = '[REPLACE]' #KeyOnePlus sample['!Sample_data_processing'] = '[REPLACE]' #KeyOnePlus sample['!Sample_description'] = '[REPLACE]' #KeyZeroPlus sample['!Sample_platform_id'] = '[REPLACE]' # sample['!Sample_geo_accession'] = KeyOptional # sample['!Sample_anchor'] = KeyRequired SAGE ONLY # sample['!Sample_type'] = KeyRequired SAGE ONLY # sample['!Sample_tag_count'] = KeyRequired SAGE ONLY # sample['!Sample_tag_length'] = KeyRequired SAGE ONLY sample['!Sample_table_begin'] = '' sample['!Sample_table_end'] = '' softfile[firstStanza['accession']] = sample return softfile, fileList def createSpecialSoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype): softfile = SoftFile() fileList = list() createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series) for idNum in expIds.iterkeys(): expId = expIds[idNum] firstStanza = expId[0] print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')' sample = HighThroughputSampleStanza() hasbigwig = 0 for stanza in expId: if getFileType(stanza['fileName']) == 'bigWig': hasbigwig = 1 if hasbigwig == 0: continue sample['^SAMPLE'] = firstStanza['geoSampleAccession'] if 'geoSeriesAccession' in series: sample['!Sample_series_id'] = series['geoSeriesAccession'] sample['!Sample_geo_accession'] = firstStanza['geoSampleAccession'] count = 1 for stanza in expId: if getFileType(stanza['fileName']) == 'bigWig': sample['!Sample_supplementary_file_' + str(count)] = stanza['fileName'] if 'checksum' in stanza: sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum'] elif md5sums != None and stanza['fileName'] in md5sums: sample['!Sample_supplementary_file_checksum_' + str(count)] = md5sums[stanza['fileName']] # sample['!Sample_supplementary_file_build_' + str(count)] = database fileList.append(stanza['fileName']) count = count + 1 softfile[firstStanza['geoSampleAccession']] = sample return softfile, fileList def main(): parser = argparse.ArgumentParser(description = 'Prepares a submission to GEO. Creates a soft file and shell script with the correct call to aspera.') parser.add_argument('-t', '--trackPath', help='Overrides the default track path ~/kent/src/hg/makeDb/trackDb/') + parser.add_argument('-i', '--instrument', help='If specified, expIds without instruments listed will default to this value. Use the no-spacing name eg Illumina_GA2') + parser.add_argument('-r', '--replace', help='Give the name of a file that has contents to be used to replace unspecified tags in metadata (description, contributers, etc)') parser.add_argument('database', help='The database, typically hg19 or mm9') parser.add_argument('composite', help='The composite name, wgEncodeCshlLongRnaSeq for instance') parser.add_argument('expIds', nargs='*', help='Any number of expIds separated by spaces, you can also specify a range by using a hyphen, "140 150 160-170" for instance, or leave blank to specify the entire file') if len(sys.argv) == 1: parser.print_usage() return args = parser.parse_args(sys.argv[1:]) - compositeTrack = CompositeTrack(args.database, args.composite) + compositeTrack = CompositeTrack(args.database, args.composite, args.trackPath) cvPath = compositeTrack.trackPath + 'cv/alpha/cv.ra' cv = CvFile(cvPath) + replace = dict() + if args.replace != None: + for line in open(args.replace): + if line == '': + continue + key, val = map(str.strip, line.split('=', 1)) + if key not in replace: + replace[key] = list() + replace[key].append(val) + + ids = list() for id in args.expIds: if '-' in id: start, end = id.split('-', 1) ids.extend(range(int(start), int(end) + 1)) else: ids.append(int(id)) - expIds, expVars, geoMapping, series, datatype = createMappings(compositeTrack.metaDb) + expIds, expVars, geoMapping, series, datatype = createMappings(compositeTrack.alphaMetaDb) submission = dict() if len(ids) == 0: submission = expIds else: for expId in ids: submission[expId] = expIds[expId] expIdStr = ' ' for id in args.expIds: expIdStr = expIdStr + id + ',' expIdStr = expIdStr[:len(expIdStr) - 1] print 'Generating soft using expIds ' + expIdStr if datatype.soft == HighThroughputSoftFile: - softfile, fileList = createHighThroughputSoftFile(compositeTrack, cv, submission, expVars, geoMapping, series, datatype) + softfile, fileList = createHighThroughputSoftFile(compositeTrack, cv, submission, expVars, geoMapping, series, datatype, args.instrument, replace) elif datatype.soft == MicroArraySoftFile: softfile, fileList = createMicroArraySoftFile(compositeTrack, cv, submission, expVars, geoMapping, series, datatype) else: raise KeyError('unsupported type') - print 'Writing soft file' + print 'Creating directory' + d = datetime.datetime.today() - datestring = '%4d%02d%02d' % (d.year, d.month, d.day) - outfileName = '%s%s.soft' % (compositeTrack.name, datestring) + datestring = '%4d-%02d-%02d' % (d.year, d.month, d.day) + + dirname = '%s_%s/' % (compositeTrack.name, datestring) + os.mkdir(dirname) + + print 'Writing file' + + outfileName = '%s%s.soft' % (dirname, compositeTrack.name) outfile = open(outfileName, 'w') outfile.write(str(softfile)) - outscript = open(compositeTrack.name + datestring + '.sh', 'w') + fileslistname = '%sfiles.txt' % dirname + fileslist = open(fileslistname, 'w') + scriptname = '%supload.sh' % dirname + outscript = open(scriptname, 'w') outscript.write('#!/bin/sh\n\n') outscript.write('/opt/aspera/connect/bin/ascp -i ~/encode_geo_key/encode_geo_key.ppk -QTr -l300m \\\n') for file in fileList: if not os.path.exists(file.path): print IOError(file.path + ' does not exist') else: outscript.write(file.path + ' \\\n') + fileslist.write(file.name + '\n') outscript.write('asp-geo@upload.ncbi.nlm.nih.gov:ENCODE\n') outscript.close() + fileslist.close() - os.system('chmod +x ' + compositeTrack.name + datestring + '.sh') + os.system('chmod +x ' + scriptname) print 'Finished!' if __name__ == '__main__': main() \ No newline at end of file