64f127cb71252486ce096a832ac7fad3deb324a5 mmaddren Wed Sep 14 17:04:53 2011 -0700 large-scale renaming change to allow python to be built into cluster/bin, also mkGeoPkg now renames files diff --git python/programs/mkGeoPkg/mkGeoPkg python/programs/mkGeoPkg/mkGeoPkg index a83a9bb..e85c252 100755 --- python/programs/mkGeoPkg/mkGeoPkg +++ python/programs/mkGeoPkg/mkGeoPkg @@ -1,48 +1,45 @@ #!/hive/groups/encode/dcc/bin/python import sys, os, shutil, stat, argparse, datetime -from ucscgenomics.compositetrack.CompositeTrack import * -from ucscgenomics.rafile.RaFile import * -from ucscgenomics.softfile.SoftFile import * -from ucscgenomics.cvfile.CvFile import * +from ucscgenomics import track, ra, soft, cv class DataType(object): def __init__(self, molecule, strategy, source, selection, soft): self.molecule = molecule self.strategy = strategy self.source = source self.selection = selection self.soft = soft datatypes = { - 'Cage': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'CAGE', HighThroughputSoftFile), - 'ChipSeq': DataType('genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP', HighThroughputSoftFile), - 'DnaPet': DataType('genomic DNA', 'OTHER', 'genomic', 'size fractionation', HighThroughputSoftFile), - 'DnaseDgf': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', HighThroughputSoftFile), - 'DnaseSeq': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', HighThroughputSoftFile), - 'FaireSeq': DataType('genomic DNA', 'OTHER', 'genomic', 'other', HighThroughputSoftFile), - 'MethylSeq': DataType('genomic DNA', 'MRE-Seq', 'genomic', 'Restriction Digest', HighThroughputSoftFile), - 'MethylRrbs': DataType('genomic DNA', 'Bisulfite-Seq', 'genomic', 'Reduced Representation', HighThroughputSoftFile), - 'Orchid': DataType('genomic DNA', 'OTHER', 'genomic', 'other', HighThroughputSoftFile), - 'Proteogenomics': DataType('protein', 'mass spectrometry-based proteogenomic mapping', 'protein', 'chromatographically fractionated peptides', HighThroughputSoftFile), - 'RnaPet': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'other', HighThroughputSoftFile), - 'RnaSeq': DataType('OVERRIDE RNA', 'RNA-Seq', 'transcriptomic', 'cDNA', HighThroughputSoftFile), + 'Cage': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'CAGE', soft.HighThroughputSoftFile), + 'ChipSeq': DataType('genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP', soft.HighThroughputSoftFile), + 'DnaPet': DataType('genomic DNA', 'OTHER', 'genomic', 'size fractionation', soft.HighThroughputSoftFile), + 'DnaseDgf': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', soft.HighThroughputSoftFile), + 'DnaseSeq': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', soft.HighThroughputSoftFile), + 'FaireSeq': DataType('genomic DNA', 'OTHER', 'genomic', 'other', soft.HighThroughputSoftFile), + 'MethylSeq': DataType('genomic DNA', 'MRE-Seq', 'genomic', 'Restriction Digest', soft.HighThroughputSoftFile), + 'MethylRrbs': DataType('genomic DNA', 'Bisulfite-Seq', 'genomic', 'Reduced Representation', soft.HighThroughputSoftFile), + 'Orchid': DataType('genomic DNA', 'OTHER', 'genomic', 'other', soft.HighThroughputSoftFile), + 'Proteogenomics': DataType('protein', 'mass spectrometry-based proteogenomic mapping', 'protein', 'chromatographically fractionated peptides', soft.HighThroughputSoftFile), + 'RnaPet': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'other', soft.HighThroughputSoftFile), + 'RnaSeq': DataType('OVERRIDE RNA', 'RNA-Seq', 'transcriptomic', 'cDNA', soft.HighThroughputSoftFile), #these need to be curated '5C': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'AffyExonArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', MicroArraySoftFile), + 'AffyExonArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', soft.MicroArraySoftFile), 'Bip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Cluster': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Cnv': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Combined': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Genotype': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Gencode': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'ChiaPet': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Mapability': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'MethylArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'NRE': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Nucleosome': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RnaChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RipGeneSt': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RipTiling': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RipChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), @@ -173,128 +170,153 @@ geoMapping[stanza['expId']] = stanza['geoSampleAccession'] elif geoMapping[stanza['expId']] != 'Inconsistent' and geoMapping[stanza['expId']] != stanza['geoSampleAccession']: geoMapping[stanza['expId']] = 'Inconsistent' print stanza.name + ': inconsistent geo mapping' if datatype == None and 'dataType' in stanza: datatype = stanza['dataType'] elif datatype != None and 'dataType' in stanza and datatype != stanza['dataType']: raise KeyError(stanza.name + ': inconsistent data type') datatype = datatypes[datatype] return expIds, expVars, geoMapping, series, datatype -def createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace): +def createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace, audit): if 'geoSeriesAccession' in series: print 'Existing series ' + series['composite'] + ' using geoSeriesAccession ' + series['geoSeriesAccession'] return print 'Writing series ' + series['composite'] - seriesStanza = SeriesStanza() + seriesStanza = soft.SeriesStanza() seriesStanza['^SERIES'] = series['composite'] seriesStanza['!Series_title'] = compositeTrack.trackDb[compositeTrack.name]['longLabel'] #STILL INCORRECT if '!Series_summary' in replace: seriesStanza['!Series_summary'] = replace['!Series_summary'] else: print 'warning: no series summary found. Please include in replace file.' seriesStanza['!Series_summary'] = '[REPLACE]' + if audit: + print seriesStanza.name + ': no summary' if '!Series_overall_design' in replace: seriesStanza['!Series_overall_design'] = replace['!Series_overall_design'] else: print 'no series overall design found. Please include in replace file.' seriesStanza['!Series_overall_design'] = '[REPLACE]' + if audit: + print seriesStanza.name + ': no overall design' seriesStanza['!Series_web_link'] = [ compositeTrack.url, 'http://www.ncbi.nlm.nih.gov/geo/info/ENCODE.html' ] if '!Series_contributor' in replace: seriesStanza['!Series_contributor'] = replace['!Series_contributor'] else: seriesStanza['!Series_contributor'] = '[REPLACE]' + if audit: + print seriesStanza.name + ': no contributor' seriesStanza['!Series_gp_id'] = gpIds[compositeTrack.organism + ' ' + datatype.source] # could use !Series_variable_* and !Series_repeats_* seriesStanza['!Series_sample_id'] = list() for idNum in expIds.iterkeys(): if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent': seriesStanza['!Series_sample_id'].append(geoMapping[idNum]) else: seriesStanza['!Series_sample_id'].append(sampleTitle(expIds[idNum][0], expVars)) softfile[series['composite']] = seriesStanza -def createHighThroughputSoftFile(compositeTrack, cv, expIds, expVars, geoMapping, series, datatype, instrument, replace): +def createHighThroughputSoftFile(compositeTrack, cv, expIds, expVars, geoMapping, series, datatype, replace, audit): print 'Creating HighThroughput soft file' - softfile = HighThroughputSoftFile() + softfile = soft.HighThroughputSoftFile() fileList = list() - createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace) + createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace, audit) for idNum in expIds.iterkeys(): expId = expIds[idNum] firstStanza = expId[0] print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')' - sample = HighThroughputSampleStanza() + sample = soft.HighThroughputSampleStanza() sample['^SAMPLE'] = sampleTitle(firstStanza, expVars, 1) sample['!Sample_type'] = 'SRA' sample['!Sample_title'] = sample['^SAMPLE'] if 'geoSeriesAccession' in series: sample['!Sample_series_id'] = series['geoSeriesAccession'] count = 1 + #figure out if the instrument model is consistent across the entire sample + instrumentModel = None + for stanza in expId: + if 'seqPlatform' in stanza: + if instrumentModel == None: + instrumentModel = stanza['seqPlatform'] + else: + if instrumentModel != stanza['seqPlatform']: + instrumentModel = None + if audit: + print 'expId' + str(expId) + ': inconsistent instrument model' + break + for stanza in expId: file = compositeTrack.files[stanza['fileName']] if isRawFile(file): sample['!Sample_raw_file_' + str(count)] = file.name sample['!Sample_raw_file_type_' + str(count)] = file.extension if file.md5sum != None: sample['!Sample_raw_file_checksum_' + str(count)] = file.md5sum + if instrumentModel == None and 'seqPlatform' in stanza: + sample['!Sample_raw_file_instrument_model_' + str(count)] = stanza['seqPlatform'] + fileList.append(file) count = count + 1 count = 1 for stanza in expId: file = compositeTrack.files[stanza['fileName']] if isSupplimentaryFile(file): sample['!Sample_supplementary_file_' + str(count)] = file.name if file.md5sum != None: sample['!Sample_supplementary_file_checksum_' + str(count)] = file.md5sum sample['!Sample_supplementary_file_build_' + str(count)] = compositeTrack.database + if instrumentModel == None and 'seqPlatform' in stanza: + sample['!Sample_supplementary_file_instrument_model_' + str(count)] = stanza['seqPlatform'] + fileList.append(file) count = count + 1 sample['!Sample_source_name'] = firstStanza['cell'] sample['!Sample_organism'] = compositeTrack.organism sample['!Sample_characteristics'] = list() allVars = expVars + mdbWhitelist for var in allVars: if var in firstStanza: foobar = var sample['!Sample_characteristics'].append(var + ': ' + firstStanza[var]) for pretend in cvPretend.iterkeys(): if var + ' ' + firstStanza[var] == pretend: @@ -323,42 +345,48 @@ if datatype.molecule == 'OVERRIDE RNA': if firstStanza['rnaExtract'] in rnaExtractMapping: sample['!Sample_molecule'] = rnaExtractMapping[firstStanza['rnaExtract']] elif firstStanza['localization'] in localizationMapping: sample['!Sample_molecule'] = localizationMapping[firstStanza['localization']] else: sample['!Sample_molecule'] = datatype.molecule sample['!Sample_extract_protocol'] = compositeTrack.url sample['!Sample_library_strategy'] = datatype.strategy sample['!Sample_library_source'] = datatype.source sample['!Sample_library_selection'] = datatype.selection - # set to replace for if nothing has a seqPlatform and no instrument model is specified. - sample['!Sample_instrument_model'] = '[REPLACE]' + # if the instrumentModel is consistent, just use that + # otherwise take the first seqPlatform value from metadata + # if that still fails, check the replacement file + # finally just make it say [REPLACE] + if instrumentModel != None: + sample['!Sample_instrument_model'] = instrumentModel + else: for stanza in expId: if 'seqPlatform' in stanza: sample['!Sample_instrument_model'] = instrumentModels[stanza['seqPlatform']] break - elif instrument != None: - sample['!Sample_instrument_model'] = instrumentModels[instrument] - break - if sample['!Sample_instrument_model'] == '[REPLACE]': + if '!Sample_instrument_model' not in sample: if '!Sample_instrument_model' in replace: - sample['!Sample_instrument_model'] = replace['!Sample_instrument_model'] + sample['!Sample_instrument_model'] = instrumentModels[replace['!Sample_instrument_model'][0]] + if '!Sample_instrument_model' not in sample: + sample['!Sample_instrument_model'] = '[REPLACE]' + if audit: + print stanza.name + ': no instrument' sample['!Sample_data_processing'] = compositeTrack.url if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent': sample['!Sample_geo_accession'] = geoMapping[idNum] softfile[firstStanza['metaObject']] = sample return softfile, fileList def createMicroArraySoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype): raise KeyError('microarray') @@ -469,46 +497,46 @@ sample['!Sample_supplementary_file_checksum_' + str(count)] = md5sums[stanza['fileName']] # sample['!Sample_supplementary_file_build_' + str(count)] = database fileList.append(stanza['fileName']) count = count + 1 softfile[firstStanza['geoSampleAccession']] = sample return softfile, fileList def main(): parser = argparse.ArgumentParser(description = 'Prepares a submission to GEO. Creates a soft file and shell script with the correct call to aspera.') parser.add_argument('-t', '--trackPath', help='Overrides the default track path ~/kent/src/hg/makeDb/trackDb/') - parser.add_argument('-i', '--instrument', help='If specified, expIds without instruments listed will default to this value. Use the no-spacing name eg Illumina_GA2') - parser.add_argument('-r', '--replace', help='Give the name of a file that has contents to be used to replace unspecified tags in metadata (description, contributers, etc)') + parser.add_argument('-r', '--replace', help='Give the name of a file that has contents to be used to replace unspecified tags in metadata (description, contributers, etc) and instrument model') + parser.add_argument('-a', '--audit', action='store_true', default=False, help='Instead of building the files, will just give you a list of errors') parser.add_argument('database', help='The database, typically hg19 or mm9') parser.add_argument('composite', help='The composite name, wgEncodeCshlLongRnaSeq for instance') parser.add_argument('expIds', nargs='*', help='Any number of expIds separated by spaces, you can also specify a range by using a hyphen, "140 150 160-170" for instance, or leave blank to specify the entire file') if len(sys.argv) == 1: parser.print_usage() return args = parser.parse_args(sys.argv[1:]) - compositeTrack = CompositeTrack(args.database, args.composite, args.trackPath) + compositeTrack = track.CompositeTrack(args.database, args.composite, args.trackPath) cvPath = compositeTrack.trackPath + 'cv/alpha/cv.ra' - cv = CvFile(cvPath) + controlledVocab = cv.CvFile(cvPath) replace = dict() if args.replace != None: for line in open(args.replace): if line == '': continue key, val = map(str.strip, line.split('=', 1)) if key not in replace: replace[key] = list() replace[key].append(val) ids = list() for id in args.expIds: @@ -521,59 +549,71 @@ expIds, expVars, geoMapping, series, datatype = createMappings(compositeTrack.alphaMetaDb) submission = dict() if len(ids) == 0: submission = expIds else: for expId in ids: submission[str(expId)] = expIds[str(expId)] expIdStr = ' ' for id in args.expIds: expIdStr = expIdStr + id + ',' expIdStr = expIdStr[:len(expIdStr) - 1] print 'Generating soft using expIds ' + expIdStr - if datatype.soft == HighThroughputSoftFile: - softfile, fileList = createHighThroughputSoftFile(compositeTrack, cv, submission, expVars, geoMapping, series, datatype, args.instrument, replace) - elif datatype.soft == MicroArraySoftFile: - softfile, fileList = createMicroArraySoftFile(compositeTrack, cv, submission, expVars, geoMapping, series, datatype) + if datatype.soft == soft.HighThroughputSoftFile: + softfile, fileList = createHighThroughputSoftFile(compositeTrack, controlledVocab, submission, expVars, geoMapping, series, datatype, replace, args.audit) + elif datatype.soft == soft.MicroArraySoftFile: + softfile, fileList = createMicroArraySoftFile(compositeTrack, controlledVocab, submission, expVars, geoMapping, series, datatype) else: raise KeyError('unsupported type') + if not args.audit: print 'Creating directory' d = datetime.datetime.today() datestring = '%4d-%02d-%02d' % (d.year, d.month, d.day) - dirname = '%s_%s/' % (compositeTrack.name, datestring) + dirname = '%s_%s_%s/' % (compositeTrack.database, compositeTrack.name, datestring) + asperadirname = '%s_%s/' % (compositeTrack.database, compositeTrack.name) + linkdirname = '%s_%s/' % (compositeTrack.database, compositeTrack.name) + os.mkdir(dirname) + os.mkdir(dirname + linkdirname) print 'Writing file' - outfileName = '%s%s.soft' % (dirname, compositeTrack.name) + outfileName = '%s%s_%s.soft' % (dirname, compositeTrack.database, compositeTrack.name) outfile = open(outfileName, 'w') outfile.write(str(softfile)) fileslistname = '%sfiles.txt' % dirname fileslist = open(fileslistname, 'w') scriptname = '%supload.sh' % dirname outscript = open(scriptname, 'w') + outscript.write('#!/bin/sh\n\n') - outscript.write('/opt/aspera/connect/bin/ascp -i ~/encode_geo_key/encode_geo_key.ppk -QTr -l300m \\\n') + outscript.write('/opt/aspera/connect/bin/ascp -i ~/encode_geo_key/encode_geo_key.ppk --symbolic-links=follow -QTdr -l300m %s asp-geo@upload.ncbi.nlm.nih.gov:ENCODE\n' % linkdirname) + outscript.close() for file in fileList: if not os.path.exists(file.path): print IOError(file.path + ' does not exist') - else: - outscript.write(file.path + ' \\\n') - fileslist.write(file.name + '\n') + elif not args.audit: + linkname = '%s_%s' % (compositeTrack.database, file.name) + linkpath = linkdirname + linkname + os.symlink(file.fullname, dirname + linkpath) + + #outscript.write(linkpath + ' \\\n') + fileslist.write(linkname + '\n') + + if not args.audit: + #outscript.write() - outscript.write('asp-geo@upload.ncbi.nlm.nih.gov:ENCODE\n') - outscript.close() fileslist.close() os.system('chmod +x ' + scriptname) print 'Finished!' if __name__ == '__main__': main() \ No newline at end of file