f1be2cca63ee3217c5337b55dca6ba0248370cc1 mmaddren Mon Oct 31 15:33:45 2011 -0700 minor updates to mkGeoPkg diff --git python/programs/mkGeoPkg/mkGeoPkg python/programs/mkGeoPkg/mkGeoPkg index e85c252..22cef27 100755 --- python/programs/mkGeoPkg/mkGeoPkg +++ python/programs/mkGeoPkg/mkGeoPkg @@ -1,58 +1,130 @@ #!/hive/groups/encode/dcc/bin/python import sys, os, shutil, stat, argparse, datetime from ucscgenomics import track, ra, soft, cv +""" +mkGeoPkg - create a soft file and upload script for a given track, so that it +may be sumbitted to GEO. + +To invoke the script, you must pass the composite and track name: + mkGeoPkg hg19 wgEncodeSomeTrack + +This is typically not enough however; most tracks are not completely covered +by their metadata, and it is necessary to supply additional information. The +most commonly needed information is: + !Series_summary - this is taken from the track's html page description. + In most cases it can be copied, one paragraph per line. + !Series_overall_design - this is taken from the Methods section on the + track's page. As with summary, 1 paragraph per line. + !Series_contributor - this is taken from the contributors list on the + track's page. It has a special format: "Last,First" where each person + is listed on a separate line. + !Sample_instrument_model - this is a bit more difficult, as it technically + supposed to be a per-sample field. Occasionally this appears in the + metaDb for newer tracks, if so, it's automatically added in. Otherwise + it must be either specified on a whole-series basis, or added to the + metadata. In many cases, we don't actually know all of them. This is + okay. GEO will allow us to submit whatever information we do have, and + they can take care of the rest. The instrumentModels dictionary below + gives a list of the allowable entered values (the keys). The values + map to the "human readable" version used by GEO. + +You can supply all of the above information in an additional file and specify +it using the r (replace) option: + mkGeoPkg -r somefile hg19 wgEncodeSomeTrack + +The replace file must be organized with each of the above key value pairs as +in a soft file: + !Series_summary = some summary stuff ... + !Series_summary = another paragraph of summary ... + !Series_overall_design = ... + !Series_contributor = Researcher,Some + !Series_contributor = Researcher,Another + !Sample_instrument_model = Illumina_GA2 + +There is a template for this, named replaceTemplate in the directory. + +You may need to only submit a part of the track, in this case you can specify +experiment ID numbers to include: + +One problem you may run into while running the script is that the DataType is +not valid. This means that the huge dictionary called datatypes below isn't +filled in for that entry. If you can get ahold of the information, modify the +script and push the changes back out. + +You may also get an error when trying to submit MicroArray data. This is to be +expected: MicroArray is currently not functional at all. We have no way as of +current to map our data to the format expected by GEO, so we've punted on this +issue for the time being. + +Once you've successfully run the script, you'll have generated a soft file and +a script that will handle the uploading process. All of this will be put into +a directory named 'database_wgEncodeSomeTrack_year-month-day'. To begin the +upload, simply cd into the directoy and run upload.sh. This will start the +aspera copy program, copying files, a files list, and the soft file itself. + +For each submission, you need to email GEO. Our current contact is: + Steve Wilhite, wilhite@ncbi.nlm.nih.gov + +You must specify which track you're submitting. GEO will only allow us 1TB of +space dedicated to ENCODE, so you must break down submissions larger than 1TB +and only submit as many submissions as they have room to process at any given +time. In a few days, GEO will get back to you with a list of accession numbers +which need to be put back into our metadata (see encodeAddGeoAccessions). + +""" + class DataType(object): def __init__(self, molecule, strategy, source, selection, soft): self.molecule = molecule self.strategy = strategy self.source = source self.selection = selection self.soft = soft datatypes = { 'Cage': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'CAGE', soft.HighThroughputSoftFile), 'ChipSeq': DataType('genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP', soft.HighThroughputSoftFile), 'DnaPet': DataType('genomic DNA', 'OTHER', 'genomic', 'size fractionation', soft.HighThroughputSoftFile), 'DnaseDgf': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', soft.HighThroughputSoftFile), 'DnaseSeq': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', soft.HighThroughputSoftFile), 'FaireSeq': DataType('genomic DNA', 'OTHER', 'genomic', 'other', soft.HighThroughputSoftFile), 'MethylSeq': DataType('genomic DNA', 'MRE-Seq', 'genomic', 'Restriction Digest', soft.HighThroughputSoftFile), 'MethylRrbs': DataType('genomic DNA', 'Bisulfite-Seq', 'genomic', 'Reduced Representation', soft.HighThroughputSoftFile), 'Orchid': DataType('genomic DNA', 'OTHER', 'genomic', 'other', soft.HighThroughputSoftFile), 'Proteogenomics': DataType('protein', 'mass spectrometry-based proteogenomic mapping', 'protein', 'chromatographically fractionated peptides', soft.HighThroughputSoftFile), 'RnaPet': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'other', soft.HighThroughputSoftFile), 'RnaSeq': DataType('OVERRIDE RNA', 'RNA-Seq', 'transcriptomic', 'cDNA', soft.HighThroughputSoftFile), #these need to be curated '5C': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'AffyExonArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', soft.MicroArraySoftFile), 'Bip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Cluster': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Cnv': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Combined': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'Genotype': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'Genotype': DataType('genomic DNA', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Gencode': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'ChiaPet': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Mapability': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'MethylArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'NRE': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Nucleosome': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RnaChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'RipGeneSt': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'RipGeneSt': DataType('OVERRIDE RNA', 'REPLACE', 'transcriptomic', 'RNA binding protein antibody', soft.HighThroughputSoftFile), #this isn't correct 'RipTiling': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RipChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RipSeq': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Switchgear': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'TfbsValid': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None) } #compare this to the source in datatype, give GP ids depending on the type gpIds = { 'human genomic': '63443', 'human transcriptomic': '30709', 'human protein': '63447', 'mouse genomic': '63471', 'mouse transcriptomic': '66167', @@ -108,88 +180,100 @@ 'polysome': 'cytoplasmic RNA', 'membraneFraction': 'cytoplasmic RNA', 'mitochondria': 'cytoplasmic RNA', 'nucleus': 'nuclear RNA', 'nucleolus': 'nuclear RNA', 'nucleoplasm': 'nuclear RNA', 'nuclearMatrix': 'nuclear RNA', 'chromatin': 'nuclear RNA', 'cell': 'total RNA' } # map our instrument names to GEO's names instrumentModels = { 'Illumina_GA2x': 'Illumina Genome Analyzer II', 'Illumina_GA2': 'Illumina Genome Analyzer II', - 'Illumina_HiSeq_2000': 'Illumina HiSeq 2000' + 'Illumina_HiSeq_2000': 'Illumina HiSeq 2000', + 'Illumina_GA1': 'Illumina Genome Analyzer', + 'Illumina_GA1_or_GA2': 'Illumina Genome Analyzer, Illumina Genome Analyzer II', + 'SOLiD_Unknown': 'SOLiD', + 'Unknown': 'Illumina Genome Analyzer' } def isRawFile(file): return (file.extension == 'fastq' or file.extension == 'fasta') def isSupplimentaryFile(file): return not isRawFile(file) def sampleTitle(stanza, expVars, warn=False): concat = stanza[expVars[0]].replace('-m', '') for expVar in expVars[1:len(expVars)]: if expVar in stanza and stanza[expVar] != 'None': concat += '_' + stanza[expVar] elif warn: print 'warning: %s is None or not in %s' % (expVar, stanza.name) return concat +def linkName(file, track): + return '%s_%s' % (track.database, file.name) + def createMappings(mdb): expIds = dict() geoMapping = dict() expVars = None series = None datatype = None for stanza in mdb.itervalues(): if 'objType' in stanza and stanza['objType'] == 'composite': series = stanza expVars = stanza['expVars'].split(',') continue if 'expId' not in stanza: print stanza.name + ': no expId' continue - if 'geoSampleAccession' not in stanza: + if 'geoSampleAccession' not in stanza or 1: # if this hasn't been submitted to GEO yet, we'll add it to the submission list if stanza['expId'] not in expIds: expIds[stanza['expId']] = list() expIds[stanza['expId']].append(stanza) else: # otherwise we keep track of the geo number for partially submitted samples if stanza['expId'] not in geoMapping: geoMapping[stanza['expId']] = stanza['geoSampleAccession'] elif geoMapping[stanza['expId']] != 'Inconsistent' and geoMapping[stanza['expId']] != stanza['geoSampleAccession']: geoMapping[stanza['expId']] = 'Inconsistent' print stanza.name + ': inconsistent geo mapping' if datatype == None and 'dataType' in stanza: datatype = stanza['dataType'] elif datatype != None and 'dataType' in stanza and datatype != stanza['dataType']: raise KeyError(stanza.name + ': inconsistent data type') - datatype = datatypes[datatype] + try: + dt = datatype + datatype = datatypes[dt] + datatype.name = dt + except KeyError: + raise KeyError(datatype) return expIds, expVars, geoMapping, series, datatype def createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace, audit): if 'geoSeriesAccession' in series: print 'Existing series ' + series['composite'] + ' using geoSeriesAccession ' + series['geoSeriesAccession'] return print 'Writing series ' + series['composite'] seriesStanza = soft.SeriesStanza() seriesStanza['^SERIES'] = series['composite'] seriesStanza['!Series_title'] = compositeTrack.trackDb[compositeTrack.name]['longLabel'] #STILL INCORRECT @@ -261,53 +345,55 @@ #figure out if the instrument model is consistent across the entire sample instrumentModel = None for stanza in expId: if 'seqPlatform' in stanza: if instrumentModel == None: instrumentModel = stanza['seqPlatform'] else: if instrumentModel != stanza['seqPlatform']: instrumentModel = None if audit: print 'expId' + str(expId) + ': inconsistent instrument model' break for stanza in expId: - file = compositeTrack.files[stanza['fileName']] + for fname in stanza['fileName'].split(','): + file = compositeTrack.files[fname] if isRawFile(file): - sample['!Sample_raw_file_' + str(count)] = file.name + sample['!Sample_raw_file_' + str(count)] = linkName(file, compositeTrack) sample['!Sample_raw_file_type_' + str(count)] = file.extension if file.md5sum != None: sample['!Sample_raw_file_checksum_' + str(count)] = file.md5sum if instrumentModel == None and 'seqPlatform' in stanza: sample['!Sample_raw_file_instrument_model_' + str(count)] = stanza['seqPlatform'] fileList.append(file) count = count + 1 count = 1 for stanza in expId: - file = compositeTrack.files[stanza['fileName']] + for fname in stanza['fileName'].split(','): + file = compositeTrack.files[fname] if isSupplimentaryFile(file): - sample['!Sample_supplementary_file_' + str(count)] = file.name + sample['!Sample_supplementary_file_' + str(count)] = linkName(file, compositeTrack) if file.md5sum != None: sample['!Sample_supplementary_file_checksum_' + str(count)] = file.md5sum sample['!Sample_supplementary_file_build_' + str(count)] = compositeTrack.database if instrumentModel == None and 'seqPlatform' in stanza: sample['!Sample_supplementary_file_instrument_model_' + str(count)] = stanza['seqPlatform'] fileList.append(file) count = count + 1 sample['!Sample_source_name'] = firstStanza['cell'] sample['!Sample_organism'] = compositeTrack.organism @@ -340,30 +426,33 @@ if 'protocol' in cv[firstStanza['cell']]: for protocol in cv[firstStanza['cell']]['protocol'].split(' '): key, val = protocol.split(':') if key == 'ENCODE' or key == cv[firstStanza['lab']]['labPi']: sample['!Sample_growth_protocol'] = val if datatype.molecule == 'OVERRIDE RNA': if firstStanza['rnaExtract'] in rnaExtractMapping: sample['!Sample_molecule'] = rnaExtractMapping[firstStanza['rnaExtract']] elif firstStanza['localization'] in localizationMapping: sample['!Sample_molecule'] = localizationMapping[firstStanza['localization']] else: sample['!Sample_molecule'] = datatype.molecule + if '!Sample_instrument_model' in replace and replace['!Sample_instrument_model'][0] == 'Unknown': + sample['!Sample_extract_protocol'] = 'Instrument model unknown. ("%s" specified by default). For more information, see %s' % (instrumentModels[replace['!Sample_instrument_model'][0]], compositeTrack.url) + else: sample['!Sample_extract_protocol'] = compositeTrack.url sample['!Sample_library_strategy'] = datatype.strategy sample['!Sample_library_source'] = datatype.source sample['!Sample_library_selection'] = datatype.selection # if the instrumentModel is consistent, just use that # otherwise take the first seqPlatform value from metadata # if that still fails, check the replacement file # finally just make it say [REPLACE] if instrumentModel != None: sample['!Sample_instrument_model'] = instrumentModel else: for stanza in expId: if 'seqPlatform' in stanza: sample['!Sample_instrument_model'] = instrumentModels[stanza['seqPlatform']] @@ -554,64 +643,63 @@ else: for expId in ids: submission[str(expId)] = expIds[str(expId)] expIdStr = ' ' for id in args.expIds: expIdStr = expIdStr + id + ',' expIdStr = expIdStr[:len(expIdStr) - 1] print 'Generating soft using expIds ' + expIdStr if datatype.soft == soft.HighThroughputSoftFile: softfile, fileList = createHighThroughputSoftFile(compositeTrack, controlledVocab, submission, expVars, geoMapping, series, datatype, replace, args.audit) elif datatype.soft == soft.MicroArraySoftFile: softfile, fileList = createMicroArraySoftFile(compositeTrack, controlledVocab, submission, expVars, geoMapping, series, datatype) else: - raise KeyError('unsupported type') + raise KeyError('unsupported type ' + datatype.name) if not args.audit: print 'Creating directory' d = datetime.datetime.today() datestring = '%4d-%02d-%02d' % (d.year, d.month, d.day) dirname = '%s_%s_%s/' % (compositeTrack.database, compositeTrack.name, datestring) - asperadirname = '%s_%s/' % (compositeTrack.database, compositeTrack.name) linkdirname = '%s_%s/' % (compositeTrack.database, compositeTrack.name) os.mkdir(dirname) os.mkdir(dirname + linkdirname) print 'Writing file' - outfileName = '%s%s_%s.soft' % (dirname, compositeTrack.database, compositeTrack.name) + outfileName = '%s%s_%s.soft' % (dirname + linkdirname, compositeTrack.database, compositeTrack.name) outfile = open(outfileName, 'w') outfile.write(str(softfile)) - fileslistname = '%sfiles.txt' % dirname + fileslistname = '%sfiles.txt' % (dirname + linkdirname) fileslist = open(fileslistname, 'w') scriptname = '%supload.sh' % dirname outscript = open(scriptname, 'w') outscript.write('#!/bin/sh\n\n') - outscript.write('/opt/aspera/connect/bin/ascp -i ~/encode_geo_key/encode_geo_key.ppk --symbolic-links=follow -QTdr -l300m %s asp-geo@upload.ncbi.nlm.nih.gov:ENCODE\n' % linkdirname) + outscript.write('/cluster/home/mmaddren/.aspera/connect/bin/ascp -i ~/encode_geo_key/encode_geo_key.ppk --symbolic-links=follow -QTdr -l300m %s asp-geo@upload.ncbi.nlm.nih.gov:ENCODE\n' % linkdirname) outscript.close() for file in fileList: if not os.path.exists(file.path): print IOError(file.path + ' does not exist') elif not args.audit: - linkname = '%s_%s' % (compositeTrack.database, file.name) + linkname = linkName(file, compositeTrack) linkpath = linkdirname + linkname os.symlink(file.fullname, dirname + linkpath) #outscript.write(linkpath + ' \\\n') fileslist.write(linkname + '\n') if not args.audit: #outscript.write() fileslist.close() os.system('chmod +x ' + scriptname) print 'Finished!'