dbf1ed2fb55f85e23ce20f7ea1781dfe35f162e9 mmaddren Mon Oct 31 16:40:50 2011 -0700 somehow these got removed when I pulled??? I put them back
diff --git python/programs/mkGeoPkg/mkGeoPkg python/programs/mkGeoPkg/mkGeoPkg
new file mode 100755
index 0000000..22cef27
--- /dev/null
+++ python/programs/mkGeoPkg/mkGeoPkg
@@ -0,0 +1,707 @@
+#!/hive/groups/encode/dcc/bin/python
+import sys, os, shutil, stat, argparse, datetime
+from ucscgenomics import track, ra, soft, cv
+
+"""
+mkGeoPkg - create a soft file and upload script for a given track, so that it
+may be submitted to GEO.
+
+To invoke the script, you must pass the database and the composite (track) name:
+    mkGeoPkg hg19 wgEncodeSomeTrack
+
+This is typically not enough, however; most tracks are not completely covered
+by their metadata, and it is necessary to supply additional information. The
+most commonly needed information is:
+    !Series_summary - this is taken from the track's html page description.
+        In most cases it can be copied, one paragraph per line.
+    !Series_overall_design - this is taken from the Methods section on the
+        track's page. As with the summary, one paragraph per line.
+    !Series_contributor - this is taken from the contributors list on the
+        track's page. It has a special format: "Last,First" where each person
+        is listed on a separate line.
+    !Sample_instrument_model - this is a bit more difficult, as it is
+        technically supposed to be a per-sample field. Occasionally this
+        appears in the metaDb for newer tracks; if so, it's automatically
+        added in. Otherwise it must either be specified on a whole-series
+        basis or added to the metadata. In many cases, we don't actually know
+        all of them. This is okay: GEO will allow us to submit whatever
+        information we do have, and they can take care of the rest. The
+        instrumentModels dictionary below lists the allowable input values
+        (the keys); the values map to the "human readable" names used by GEO.
+
+You can supply all of the above information in an additional file and specify
+it using the -r (replace) option:
+    mkGeoPkg -r somefile hg19 wgEncodeSomeTrack
+
+The replace file must be organized with each of the above key-value pairs as
+in a soft file:
+    !Series_summary = some summary stuff ...
+    !Series_summary = another paragraph of summary ...
+    !Series_overall_design = ...
+    !Series_contributor = Researcher,Some
+    !Series_contributor = Researcher,Another
+    !Sample_instrument_model = Illumina_GA2
+
+There is a template for this, named replaceTemplate in the directory.
+
+You may need to submit only part of the track; in this case you can specify
+experiment ID numbers to include, either singly or as hyphenated ranges:
+    mkGeoPkg hg19 wgEncodeSomeTrack 140 150 160-170
+
+One problem you may run into while running the script is that the DataType is
+not valid. This means that the huge dictionary called datatypes below isn't
+filled in for that entry. If you can get hold of the information, modify the
+script and push the changes back out.
+
+You may also get an error when trying to submit MicroArray data. This is to
+be expected: MicroArray is currently not functional at all. We currently have
+no way to map our data to the format expected by GEO, so we've punted on this
+issue for the time being.
+
+Once you've successfully run the script, you'll have generated a soft file and
+a script that will handle the uploading process. All of this will be put into
+a directory named 'database_wgEncodeSomeTrack_year-month-day'. To begin the
+upload, simply cd into the directory and run upload.sh.
This will start the +aspera copy program, copying files, a files list, and the soft file itself. + +For each submission, you need to email GEO. Our current contact is: + Steve Wilhite, wilhite@ncbi.nlm.nih.gov + +You must specify which track you're submitting. GEO will only allow us 1TB of +space dedicated to ENCODE, so you must break down submissions larger than 1TB +and only submit as many submissions as they have room to process at any given +time. In a few days, GEO will get back to you with a list of accession numbers +which need to be put back into our metadata (see encodeAddGeoAccessions). + +""" + +class DataType(object): + + def __init__(self, molecule, strategy, source, selection, soft): + self.molecule = molecule + self.strategy = strategy + self.source = source + self.selection = selection + self.soft = soft + +datatypes = { + 'Cage': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'CAGE', soft.HighThroughputSoftFile), + 'ChipSeq': DataType('genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP', soft.HighThroughputSoftFile), + 'DnaPet': DataType('genomic DNA', 'OTHER', 'genomic', 'size fractionation', soft.HighThroughputSoftFile), + 'DnaseDgf': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', soft.HighThroughputSoftFile), + 'DnaseSeq': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', soft.HighThroughputSoftFile), + 'FaireSeq': DataType('genomic DNA', 'OTHER', 'genomic', 'other', soft.HighThroughputSoftFile), + 'MethylSeq': DataType('genomic DNA', 'MRE-Seq', 'genomic', 'Restriction Digest', soft.HighThroughputSoftFile), + 'MethylRrbs': DataType('genomic DNA', 'Bisulfite-Seq', 'genomic', 'Reduced Representation', soft.HighThroughputSoftFile), + 'Orchid': DataType('genomic DNA', 'OTHER', 'genomic', 'other', soft.HighThroughputSoftFile), + 'Proteogenomics': DataType('protein', 'mass spectrometry-based proteogenomic mapping', 'protein', 'chromatographically fractionated peptides', soft.HighThroughputSoftFile), + 'RnaPet': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'other', soft.HighThroughputSoftFile), + 'RnaSeq': DataType('OVERRIDE RNA', 'RNA-Seq', 'transcriptomic', 'cDNA', soft.HighThroughputSoftFile), + + #these need to be curated + '5C': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'AffyExonArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', soft.MicroArraySoftFile), + 'Bip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'Cluster': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'Cnv': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'Combined': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'Genotype': DataType('genomic DNA', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'Gencode': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'ChiaPet': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'Mapability': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'MethylArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'NRE': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'Nucleosome': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'RnaChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'RipGeneSt': DataType('OVERRIDE RNA', 'REPLACE', 'transcriptomic', 'RNA binding protein antibody', soft.HighThroughputSoftFile), #this isn't correct + 'RipTiling': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'RipChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 
'REPLACE', None),
+    'RipSeq': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'Switchgear': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'TfbsValid': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None)
+}
+
+#compare this to the source in datatype to give GP ids depending on the type
+gpIds = {
+    'human genomic': '63443',
+    'human transcriptomic': '30709',
+    'human protein': '63447',
+
+    'mouse genomic': '63471',
+    'mouse transcriptomic': '66167',
+    'mouse protein': '63475'
+}
+
+cvDetails = {
+    'cell': [ 'organism', 'description', 'karyotype', 'lineage', 'sex' ],
+    'antibody': [ 'antibodyDescription', 'targetDescription', 'vendorName', 'vendorId' ]
+}
+
+#if the term appears in the mdb, it must override the value in the cv
+cvOverride = [ 'sex' ]
+
+#talk to Venkat lol
+cvPretend = { 'antibody Input': 'control' }
+
+#if it's not in cvDetails, which things should we check by default
+cvDefaults = [ 'description' ]
+
+mdbWhitelist = [
+    'age',
+    'bioRep',
+    'control',
+    'controlId',
+    'fragSize',
+    'labExpId',
+    'labVersion',
+    'mapAlgorithm',
+    'obtainedBy',
+    'phase',
+    'readType',
+    'region',
+    'replicate',
+    'restrictionEnzyme',
+    'run',
+    'softwareVersion',
+    'spikeInPool',
+    'strain'
+]
+
+# if the molecule is RNA, we need to map our data into !Sample_molecule, which only takes certain fields
+# first we check rnaExtractMapping. If it's not there, we use the localization. This is because (at present)
+# polyA is the most important trait; otherwise it's going to be nonPolyA, which GEO doesn't accept.
+rnaExtractMapping = {
+    'shortPolyA': 'polyA RNA',
+    'longPolyA': 'polyA RNA',
+    'polyA': 'polyA RNA'
+}
+
+localizationMapping = {
+    'cytosol': 'cytoplasmic RNA',
+    'polysome': 'cytoplasmic RNA',
+    'membraneFraction': 'cytoplasmic RNA',
+    'mitochondria': 'cytoplasmic RNA',
+    'nucleus': 'nuclear RNA',
+    'nucleolus': 'nuclear RNA',
+    'nucleoplasm': 'nuclear RNA',
+    'nuclearMatrix': 'nuclear RNA',
+    'chromatin': 'nuclear RNA',
+    'cell': 'total RNA'
+}
+
+# map our instrument names to GEO's names
+instrumentModels = {
+    'Illumina_GA2x': 'Illumina Genome Analyzer II',
+    'Illumina_GA2': 'Illumina Genome Analyzer II',
+    'Illumina_HiSeq_2000': 'Illumina HiSeq 2000',
+    'Illumina_GA1': 'Illumina Genome Analyzer',
+    'Illumina_GA1_or_GA2': 'Illumina Genome Analyzer, Illumina Genome Analyzer II',
+    'SOLiD_Unknown': 'SOLiD',
+    'Unknown': 'Illumina Genome Analyzer'
+}
+
+
+def isRawFile(file):
+    return (file.extension == 'fastq' or file.extension == 'fasta')
+
+def isSupplimentaryFile(file):
+    return not isRawFile(file)
+
+def sampleTitle(stanza, expVars, warn=False):
+    concat = stanza[expVars[0]].replace('-m', '')
+    for expVar in expVars[1:len(expVars)]:
+        if expVar in stanza and stanza[expVar] != 'None':
+            concat += '_' + stanza[expVar]
+        elif warn:
+            print 'warning: %s is None or not in %s' % (expVar, stanza.name)
+    return concat
+
+def linkName(file, track):
+    return '%s_%s' % (track.database, file.name)
+
+def createMappings(mdb):
+    expIds = dict()
+    geoMapping = dict()
+    expVars = None
+    series = None
+    datatype = None
+
+    for stanza in mdb.itervalues():
+
+        if 'objType' in stanza and stanza['objType'] == 'composite':
+            series = stanza
+            expVars = stanza['expVars'].split(',')
+            continue
+
+        if 'expId' not in stanza:
+            print stanza.name + ': no expId'
+            continue
+
+        if 'geoSampleAccession' not in stanza or 1:
+            # if this hasn't been submitted to GEO yet, we'll add it to the submission list
+            if stanza['expId'] not in expIds:
+                expIds[stanza['expId']] = list()
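+            # collect every metaDb stanza that shares this expId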
+ + expIds[stanza['expId']].append(stanza) + + else: + # otherwise we keep track of the geo number for partially submitted samples + if stanza['expId'] not in geoMapping: + geoMapping[stanza['expId']] = stanza['geoSampleAccession'] + elif geoMapping[stanza['expId']] != 'Inconsistent' and geoMapping[stanza['expId']] != stanza['geoSampleAccession']: + geoMapping[stanza['expId']] = 'Inconsistent' + print stanza.name + ': inconsistent geo mapping' + + if datatype == None and 'dataType' in stanza: + datatype = stanza['dataType'] + elif datatype != None and 'dataType' in stanza and datatype != stanza['dataType']: + raise KeyError(stanza.name + ': inconsistent data type') + + try: + dt = datatype + datatype = datatypes[dt] + datatype.name = dt + except KeyError: + raise KeyError(datatype) + + return expIds, expVars, geoMapping, series, datatype + + +def createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace, audit): + + if 'geoSeriesAccession' in series: + print 'Existing series ' + series['composite'] + ' using geoSeriesAccession ' + series['geoSeriesAccession'] + return + + print 'Writing series ' + series['composite'] + + seriesStanza = soft.SeriesStanza() + seriesStanza['^SERIES'] = series['composite'] + seriesStanza['!Series_title'] = compositeTrack.trackDb[compositeTrack.name]['longLabel'] #STILL INCORRECT + + if '!Series_summary' in replace: + seriesStanza['!Series_summary'] = replace['!Series_summary'] + else: + print 'warning: no series summary found. Please include in replace file.' + seriesStanza['!Series_summary'] = '[REPLACE]' + if audit: + print seriesStanza.name + ': no summary' + + if '!Series_overall_design' in replace: + seriesStanza['!Series_overall_design'] = replace['!Series_overall_design'] + else: + print 'no series overall design found. Please include in replace file.' 
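+        # fall back to a [REPLACE] placeholder so the soft file can still be generated and edited by hand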
+ seriesStanza['!Series_overall_design'] = '[REPLACE]' + if audit: + print seriesStanza.name + ': no overall design' + + seriesStanza['!Series_web_link'] = [ compositeTrack.url, 'http://www.ncbi.nlm.nih.gov/geo/info/ENCODE.html' ] + + if '!Series_contributor' in replace: + seriesStanza['!Series_contributor'] = replace['!Series_contributor'] + else: + seriesStanza['!Series_contributor'] = '[REPLACE]' + if audit: + print seriesStanza.name + ': no contributor' + + seriesStanza['!Series_gp_id'] = gpIds[compositeTrack.organism + ' ' + datatype.source] + + # could use !Series_variable_* and !Series_repeats_* + + seriesStanza['!Series_sample_id'] = list() + + for idNum in expIds.iterkeys(): + if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent': + seriesStanza['!Series_sample_id'].append(geoMapping[idNum]) + else: + seriesStanza['!Series_sample_id'].append(sampleTitle(expIds[idNum][0], expVars)) + + softfile[series['composite']] = seriesStanza + +def createHighThroughputSoftFile(compositeTrack, cv, expIds, expVars, geoMapping, series, datatype, replace, audit): + + print 'Creating HighThroughput soft file' + + softfile = soft.HighThroughputSoftFile() + fileList = list() + + createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace, audit) + + for idNum in expIds.iterkeys(): + + expId = expIds[idNum] + firstStanza = expId[0] + print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')' + sample = soft.HighThroughputSampleStanza() + + sample['^SAMPLE'] = sampleTitle(firstStanza, expVars, 1) + sample['!Sample_type'] = 'SRA' + sample['!Sample_title'] = sample['^SAMPLE'] + + if 'geoSeriesAccession' in series: + sample['!Sample_series_id'] = series['geoSeriesAccession'] + + count = 1 + + #figure out if the instrument model is consistent across the entire sample + instrumentModel = None + for stanza in expId: + if 'seqPlatform' in stanza: + if instrumentModel == None: + instrumentModel = stanza['seqPlatform'] + else: + if instrumentModel != stanza['seqPlatform']: + instrumentModel = None + if audit: + print 'expId' + str(expId) + ': inconsistent instrument model' + break + + for stanza in expId: + + for fname in stanza['fileName'].split(','): + file = compositeTrack.files[fname] + + if isRawFile(file): + sample['!Sample_raw_file_' + str(count)] = linkName(file, compositeTrack) + sample['!Sample_raw_file_type_' + str(count)] = file.extension + + if file.md5sum != None: + sample['!Sample_raw_file_checksum_' + str(count)] = file.md5sum + + if instrumentModel == None and 'seqPlatform' in stanza: + sample['!Sample_raw_file_instrument_model_' + str(count)] = stanza['seqPlatform'] + + fileList.append(file) + count = count + 1 + + count = 1 + + for stanza in expId: + + for fname in stanza['fileName'].split(','): + file = compositeTrack.files[fname] + + if isSupplimentaryFile(file): + sample['!Sample_supplementary_file_' + str(count)] = linkName(file, compositeTrack) + + if file.md5sum != None: + sample['!Sample_supplementary_file_checksum_' + str(count)] = file.md5sum + + sample['!Sample_supplementary_file_build_' + str(count)] = compositeTrack.database + + if instrumentModel == None and 'seqPlatform' in stanza: + sample['!Sample_supplementary_file_instrument_model_' + str(count)] = stanza['seqPlatform'] + + fileList.append(file) + count = count + 1 + + sample['!Sample_source_name'] = firstStanza['cell'] + sample['!Sample_organism'] = compositeTrack.organism + + sample['!Sample_characteristics'] = list() + allVars = expVars + mdbWhitelist + + for var in 
allVars: + if var in firstStanza: + foobar = var + sample['!Sample_characteristics'].append(var + ': ' + firstStanza[var]) + for pretend in cvPretend.iterkeys(): + if var + ' ' + firstStanza[var] == pretend: + foobar = cvPretend[pretend] + if foobar in cvDetails: + for cvVar in cvDetails[foobar]: + if cvVar in cvOverride and cvVar in firstStanza: + sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + firstStanza[cvVar]) + elif cvVar in cv[firstStanza[var]]: + sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + cv[firstStanza[var]][cvVar]) + else: + for cvVar in cvDefaults: + if firstStanza[var] in cv and cvVar in cv[firstStanza[var]]: + sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + cv[firstStanza[var]][cvVar]) + + sample['!Sample_biomaterial_provider'] = cv[firstStanza['cell']]['vendorName'] + + if 'treatment' in firstStanza: + sample['!Sample_treatment_protocol'] = firstStanza['treatment'] + + if 'protocol' in cv[firstStanza['cell']]: + for protocol in cv[firstStanza['cell']]['protocol'].split(' '): + key, val = protocol.split(':') + if key == 'ENCODE' or key == cv[firstStanza['lab']]['labPi']: + sample['!Sample_growth_protocol'] = val + + if datatype.molecule == 'OVERRIDE RNA': + if firstStanza['rnaExtract'] in rnaExtractMapping: + sample['!Sample_molecule'] = rnaExtractMapping[firstStanza['rnaExtract']] + elif firstStanza['localization'] in localizationMapping: + sample['!Sample_molecule'] = localizationMapping[firstStanza['localization']] + + else: + sample['!Sample_molecule'] = datatype.molecule + + if '!Sample_instrument_model' in replace and replace['!Sample_instrument_model'][0] == 'Unknown': + sample['!Sample_extract_protocol'] = 'Instrument model unknown. ("%s" specified by default). For more information, see %s' % (instrumentModels[replace['!Sample_instrument_model'][0]], compositeTrack.url) + else: + sample['!Sample_extract_protocol'] = compositeTrack.url + sample['!Sample_library_strategy'] = datatype.strategy + sample['!Sample_library_source'] = datatype.source + sample['!Sample_library_selection'] = datatype.selection + + # if the instrumentModel is consistent, just use that + # otherwise take the first seqPlatform value from metadata + # if that still fails, check the replacement file + # finally just make it say [REPLACE] + if instrumentModel != None: + sample['!Sample_instrument_model'] = instrumentModel + else: + for stanza in expId: + if 'seqPlatform' in stanza: + sample['!Sample_instrument_model'] = instrumentModels[stanza['seqPlatform']] + break + if '!Sample_instrument_model' not in sample: + if '!Sample_instrument_model' in replace: + sample['!Sample_instrument_model'] = instrumentModels[replace['!Sample_instrument_model'][0]] + if '!Sample_instrument_model' not in sample: + sample['!Sample_instrument_model'] = '[REPLACE]' + if audit: + print stanza.name + ': no instrument' + + sample['!Sample_data_processing'] = compositeTrack.url + + if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent': + sample['!Sample_geo_accession'] = geoMapping[idNum] + + softfile[firstStanza['metaObject']] = sample + + return softfile, fileList + + +def createMicroArraySoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype): + + raise KeyError('microarray') + + print 'Creating MicroArray soft file' + + softfile = SoftFile() + fileList = list() + + createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series) + + for idNum in 
expIds.iterkeys(): + + expId = expIds[idNum] + firstStanza = expId[0] + print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')' + sample = MicroArraySampleStanza() + + sample['^SAMPLE'] = sampleTitle(firstStanza, expVars) + + if 'geoSeriesAccession' in series: + sample['!Sample_series_id'] = series['geoSeriesAccession'] + + sample['!Sample_title'] = concat + + count = 1 + + for stanza in expId: + + if isSupplimentaryFile(stanza['fileName']): + sample['!Sample_supplementary_file_' + str(count)] = stanza['fileName'] + + if 'checksum' in stanza: + sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum'] + elif md5sums != None and stanza['fileName'] in md5sums: + sample['!Sample_supplementary_file_checksum_' + str(count)] = md5sums[stanza['fileName']] + + fileList.append(stanza['fileName']) + count = count + 1 + + # sample['!Sample_table'] = KeyOptional # CEL file + sample['!Sample_source_name_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered + sample['!Sample_organism_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered + sample['!Sample_characteristics_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered + # sample['!Sample_biomaterial_provider_ch'] = KeyZeroPlusNumbered + # sample['!Sample_treatment_protocol_ch'] = KeyZeroPlusNumbered + # sample['!Sample_growth_protocol_ch'] = KeyZeroPlusNumbered + sample['!Sample_molecule_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered + sample['!Sample_extract_protocol_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered + sample['!Sample_label_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered + sample['!Sample_label_protocol_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered + sample['!Sample_hyb_protocol'] = '[REPLACE]' #KeyOnePlus + sample['!Sample_scan_protocol'] = '[REPLACE]' #KeyOnePlus + sample['!Sample_data_processing'] = '[REPLACE]' #KeyOnePlus + sample['!Sample_description'] = '[REPLACE]' #KeyZeroPlus + sample['!Sample_platform_id'] = '[REPLACE]' + # sample['!Sample_geo_accession'] = KeyOptional + # sample['!Sample_anchor'] = KeyRequired SAGE ONLY + # sample['!Sample_type'] = KeyRequired SAGE ONLY + # sample['!Sample_tag_count'] = KeyRequired SAGE ONLY + # sample['!Sample_tag_length'] = KeyRequired SAGE ONLY + sample['!Sample_table_begin'] = '' + sample['!Sample_table_end'] = '' + + softfile[firstStanza['accession']] = sample + + return softfile, fileList + + +def createSpecialSoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype): + softfile = SoftFile() + fileList = list() + + createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series) + + for idNum in expIds.iterkeys(): + + expId = expIds[idNum] + firstStanza = expId[0] + print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')' + sample = HighThroughputSampleStanza() + + hasbigwig = 0 + for stanza in expId: + + if getFileType(stanza['fileName']) == 'bigWig': + hasbigwig = 1 + + if hasbigwig == 0: + continue + + sample['^SAMPLE'] = firstStanza['geoSampleAccession'] + + if 'geoSeriesAccession' in series: + sample['!Sample_series_id'] = series['geoSeriesAccession'] + + sample['!Sample_geo_accession'] = firstStanza['geoSampleAccession'] + + count = 1 + + for stanza in expId: + + if getFileType(stanza['fileName']) == 'bigWig': + sample['!Sample_supplementary_file_' + str(count)] = stanza['fileName'] + + if 'checksum' in stanza: + sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum'] + elif md5sums != None and stanza['fileName'] in md5sums: + sample['!Sample_supplementary_file_checksum_' + 
str(count)] = md5sums[stanza['fileName']] + + # sample['!Sample_supplementary_file_build_' + str(count)] = database + + fileList.append(stanza['fileName']) + count = count + 1 + + softfile[firstStanza['geoSampleAccession']] = sample + + return softfile, fileList + +def main(): + + parser = argparse.ArgumentParser(description = 'Prepares a submission to GEO. Creates a soft file and shell script with the correct call to aspera.') + parser.add_argument('-t', '--trackPath', help='Overrides the default track path ~/kent/src/hg/makeDb/trackDb/') + parser.add_argument('-r', '--replace', help='Give the name of a file that has contents to be used to replace unspecified tags in metadata (description, contributers, etc) and instrument model') + parser.add_argument('-a', '--audit', action='store_true', default=False, help='Instead of building the files, will just give you a list of errors') + parser.add_argument('database', help='The database, typically hg19 or mm9') + parser.add_argument('composite', help='The composite name, wgEncodeCshlLongRnaSeq for instance') + parser.add_argument('expIds', nargs='*', help='Any number of expIds separated by spaces, you can also specify a range by using a hyphen, "140 150 160-170" for instance, or leave blank to specify the entire file') + + if len(sys.argv) == 1: + parser.print_usage() + return + + args = parser.parse_args(sys.argv[1:]) + + compositeTrack = track.CompositeTrack(args.database, args.composite, args.trackPath) + + cvPath = compositeTrack.trackPath + 'cv/alpha/cv.ra' + controlledVocab = cv.CvFile(cvPath) + + replace = dict() + if args.replace != None: + for line in open(args.replace): + if line == '': + continue + key, val = map(str.strip, line.split('=', 1)) + if key not in replace: + replace[key] = list() + replace[key].append(val) + + + ids = list() + + for id in args.expIds: + if '-' in id: + start, end = id.split('-', 1) + ids.extend(range(int(start), int(end) + 1)) + else: + ids.append(int(id)) + + expIds, expVars, geoMapping, series, datatype = createMappings(compositeTrack.alphaMetaDb) + + submission = dict() + if len(ids) == 0: + submission = expIds + else: + for expId in ids: + submission[str(expId)] = expIds[str(expId)] + + expIdStr = ' ' + for id in args.expIds: + expIdStr = expIdStr + id + ',' + expIdStr = expIdStr[:len(expIdStr) - 1] + print 'Generating soft using expIds ' + expIdStr + + if datatype.soft == soft.HighThroughputSoftFile: + softfile, fileList = createHighThroughputSoftFile(compositeTrack, controlledVocab, submission, expVars, geoMapping, series, datatype, replace, args.audit) + elif datatype.soft == soft.MicroArraySoftFile: + softfile, fileList = createMicroArraySoftFile(compositeTrack, controlledVocab, submission, expVars, geoMapping, series, datatype) + else: + raise KeyError('unsupported type ' + datatype.name) + + if not args.audit: + print 'Creating directory' + + d = datetime.datetime.today() + datestring = '%4d-%02d-%02d' % (d.year, d.month, d.day) + + dirname = '%s_%s_%s/' % (compositeTrack.database, compositeTrack.name, datestring) + linkdirname = '%s_%s/' % (compositeTrack.database, compositeTrack.name) + + os.mkdir(dirname) + os.mkdir(dirname + linkdirname) + + print 'Writing file' + + outfileName = '%s%s_%s.soft' % (dirname + linkdirname, compositeTrack.database, compositeTrack.name) + outfile = open(outfileName, 'w') + outfile.write(str(softfile)) + fileslistname = '%sfiles.txt' % (dirname + linkdirname) + fileslist = open(fileslistname, 'w') + scriptname = '%supload.sh' % dirname + outscript = 
open(scriptname, 'w') + + outscript.write('#!/bin/sh\n\n') + outscript.write('/cluster/home/mmaddren/.aspera/connect/bin/ascp -i ~/encode_geo_key/encode_geo_key.ppk --symbolic-links=follow -QTdr -l300m %s asp-geo@upload.ncbi.nlm.nih.gov:ENCODE\n' % linkdirname) + outscript.close() + + for file in fileList: + if not os.path.exists(file.path): + print IOError(file.path + ' does not exist') + elif not args.audit: + linkname = linkName(file, compositeTrack) + linkpath = linkdirname + linkname + os.symlink(file.fullname, dirname + linkpath) + + #outscript.write(linkpath + ' \\\n') + fileslist.write(linkname + '\n') + + if not args.audit: + #outscript.write() + + fileslist.close() + + os.system('chmod +x ' + scriptname) + + print 'Finished!' + +if __name__ == '__main__': + main() \ No newline at end of file
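Beyond the command-line use shown in the docstring, the same pipeline can be driven from Python. A minimal sketch, assuming a host where the ucscgenomics package and the trackDb checkout are available; the database and composite names are illustrative, and createMappings/createHighThroughputSoftFile are the functions defined in the script above:

    from ucscgenomics import track, cv

    # build the composite track and controlled vocabulary the same way main() does
    composite = track.CompositeTrack('hg19', 'wgEncodeSomeTrack', None)
    vocab = cv.CvFile(composite.trackPath + 'cv/alpha/cv.ra')

    # group the metaDb stanzas by expId and pick up the series/datatype information
    expIds, expVars, geoMapping, series, datatype = createMappings(composite.alphaMetaDb)

    # the final True is the audit flag: the builder prints warnings for missing
    # summary, contributor, and instrument fields while assembling the soft file
    softfile, files = createHighThroughputSoftFile(composite, vocab, expIds, expVars,
                                                   geoMapping, series, datatype, dict(), True)
    print str(softfile)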