fa1fd1e6225697d08803d99446cab7d4e0d549e9 vsmalladi Fri Oct 21 14:38:24 2011 -0700 Moved all python scripts from /python/programs to live in sr/hg/encode as per Kate's request for all encode programs in a common place. diff --git python/programs/mkGeoPkg/mkGeoPkg python/programs/mkGeoPkg/mkGeoPkg deleted file mode 100755 index e85c252..0000000 --- python/programs/mkGeoPkg/mkGeoPkg +++ /dev/null @@ -1,619 +0,0 @@ -#!/hive/groups/encode/dcc/bin/python -import sys, os, shutil, stat, argparse, datetime -from ucscgenomics import track, ra, soft, cv - -class DataType(object): - - def __init__(self, molecule, strategy, source, selection, soft): - self.molecule = molecule - self.strategy = strategy - self.source = source - self.selection = selection - self.soft = soft - -datatypes = { - 'Cage': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'CAGE', soft.HighThroughputSoftFile), - 'ChipSeq': DataType('genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP', soft.HighThroughputSoftFile), - 'DnaPet': DataType('genomic DNA', 'OTHER', 'genomic', 'size fractionation', soft.HighThroughputSoftFile), - 'DnaseDgf': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', soft.HighThroughputSoftFile), - 'DnaseSeq': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', soft.HighThroughputSoftFile), - 'FaireSeq': DataType('genomic DNA', 'OTHER', 'genomic', 'other', soft.HighThroughputSoftFile), - 'MethylSeq': DataType('genomic DNA', 'MRE-Seq', 'genomic', 'Restriction Digest', soft.HighThroughputSoftFile), - 'MethylRrbs': DataType('genomic DNA', 'Bisulfite-Seq', 'genomic', 'Reduced Representation', soft.HighThroughputSoftFile), - 'Orchid': DataType('genomic DNA', 'OTHER', 'genomic', 'other', soft.HighThroughputSoftFile), - 'Proteogenomics': DataType('protein', 'mass spectrometry-based proteogenomic mapping', 'protein', 'chromatographically fractionated peptides', soft.HighThroughputSoftFile), - 'RnaPet': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'other', soft.HighThroughputSoftFile), - 'RnaSeq': DataType('OVERRIDE RNA', 'RNA-Seq', 'transcriptomic', 'cDNA', soft.HighThroughputSoftFile), - - #these need to be curated - '5C': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'AffyExonArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', soft.MicroArraySoftFile), - 'Bip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'Cluster': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'Cnv': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'Combined': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'Genotype': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'Gencode': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'ChiaPet': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'Mapability': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'MethylArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'NRE': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'Nucleosome': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'RnaChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'RipGeneSt': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'RipTiling': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'RipChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'RipSeq': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'Switchgear': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'TfbsValid': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None) -} - -#compare this to the source in datatype, give GP ids depending on the type -gpIds = { - 'human genomic': '63443', - 'human transcriptomic': '30709', - 'human protein': '63447', - - 'mouse genomic': '63471', - 'mouse transcriptomic': '66167', - 'mouse protein': '63475' -} - -cvDetails = { - 'cell': [ 'organism', 'description', 'karyotype', 'lineage', 'sex' ], - 'antibody': [ 'antibodyDescription', 'targetDescription', 'vendorName', 'vendorId' ] -} - -#if the term appears in the mdb and must overriding the value in the cv -cvOverride = [ 'sex' ] - -#talk to Venkat lol -cvPretend = { 'antibody Input': 'control' } - -#if its not in cvDetails, which things should we check by default -cvDefaults = [ 'description' ] - -mdbWhitelist = [ - 'age', - 'bioRep', - 'control', - 'controlId', - 'fragSize', - 'labExpId', - 'labVersion', - 'mapAlgorithm', - 'obtainedBy', - 'phase', - 'readType', - 'region', - 'replicate', - 'restrictionEnzyme', - 'run', - 'softwareVersion', - 'spikeInPool', - 'strain' -] - -# if the molecule is RNA, we need to map our data into !Sample_molecule, which only takes certain fields -# first we check rnaExtractMapping. If its not there, we use the localization. This is because (at current) -# polyA is the most important trait, otherwise its going to be nonPolyA which GEO doesn't accept that. -rnaExtractMapping = { - 'shortPolyA': 'polyA RNA', - 'longPolyA': 'polyA RNA', - 'polyA': 'polyA RNA' -} - -localizationMapping = { - 'cytosol': 'cytoplasmic RNA', - 'polysome': 'cytoplasmic RNA', - 'membraneFraction': 'cytoplasmic RNA', - 'mitochondria': 'cytoplasmic RNA', - 'nucleus': 'nuclear RNA', - 'nucleolus': 'nuclear RNA', - 'nucleoplasm': 'nuclear RNA', - 'nuclearMatrix': 'nuclear RNA', - 'chromatin': 'nuclear RNA', - 'cell': 'total RNA' -} - -# map our instrument names to GEO's names -instrumentModels = { - 'Illumina_GA2x': 'Illumina Genome Analyzer II', - 'Illumina_GA2': 'Illumina Genome Analyzer II', - 'Illumina_HiSeq_2000': 'Illumina HiSeq 2000' -} - - -def isRawFile(file): - return (file.extension == 'fastq' or file.extension == 'fasta') - -def isSupplimentaryFile(file): - return not isRawFile(file) - -def sampleTitle(stanza, expVars, warn=False): - concat = stanza[expVars[0]].replace('-m', '') - for expVar in expVars[1:len(expVars)]: - if expVar in stanza and stanza[expVar] != 'None': - concat += '_' + stanza[expVar] - elif warn: - print 'warning: %s is None or not in %s' % (expVar, stanza.name) - return concat - -def createMappings(mdb): - expIds = dict() - geoMapping = dict() - expVars = None - series = None - datatype = None - - for stanza in mdb.itervalues(): - - if 'objType' in stanza and stanza['objType'] == 'composite': - series = stanza - expVars = stanza['expVars'].split(',') - continue - - if 'expId' not in stanza: - print stanza.name + ': no expId' - continue - - if 'geoSampleAccession' not in stanza: - # if this hasn't been submitted to GEO yet, we'll add it to the submission list - if stanza['expId'] not in expIds: - expIds[stanza['expId']] = list() - - expIds[stanza['expId']].append(stanza) - - else: - # otherwise we keep track of the geo number for partially submitted samples - if stanza['expId'] not in geoMapping: - geoMapping[stanza['expId']] = stanza['geoSampleAccession'] - elif geoMapping[stanza['expId']] != 'Inconsistent' and geoMapping[stanza['expId']] != stanza['geoSampleAccession']: - geoMapping[stanza['expId']] = 'Inconsistent' - print stanza.name + ': inconsistent geo mapping' - - if datatype == None and 'dataType' in stanza: - datatype = stanza['dataType'] - elif datatype != None and 'dataType' in stanza and datatype != stanza['dataType']: - raise KeyError(stanza.name + ': inconsistent data type') - - datatype = datatypes[datatype] - - return expIds, expVars, geoMapping, series, datatype - - -def createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace, audit): - - if 'geoSeriesAccession' in series: - print 'Existing series ' + series['composite'] + ' using geoSeriesAccession ' + series['geoSeriesAccession'] - return - - print 'Writing series ' + series['composite'] - - seriesStanza = soft.SeriesStanza() - seriesStanza['^SERIES'] = series['composite'] - seriesStanza['!Series_title'] = compositeTrack.trackDb[compositeTrack.name]['longLabel'] #STILL INCORRECT - - if '!Series_summary' in replace: - seriesStanza['!Series_summary'] = replace['!Series_summary'] - else: - print 'warning: no series summary found. Please include in replace file.' - seriesStanza['!Series_summary'] = '[REPLACE]' - if audit: - print seriesStanza.name + ': no summary' - - if '!Series_overall_design' in replace: - seriesStanza['!Series_overall_design'] = replace['!Series_overall_design'] - else: - print 'no series overall design found. Please include in replace file.' - seriesStanza['!Series_overall_design'] = '[REPLACE]' - if audit: - print seriesStanza.name + ': no overall design' - - seriesStanza['!Series_web_link'] = [ compositeTrack.url, 'http://www.ncbi.nlm.nih.gov/geo/info/ENCODE.html' ] - - if '!Series_contributor' in replace: - seriesStanza['!Series_contributor'] = replace['!Series_contributor'] - else: - seriesStanza['!Series_contributor'] = '[REPLACE]' - if audit: - print seriesStanza.name + ': no contributor' - - seriesStanza['!Series_gp_id'] = gpIds[compositeTrack.organism + ' ' + datatype.source] - - # could use !Series_variable_* and !Series_repeats_* - - seriesStanza['!Series_sample_id'] = list() - - for idNum in expIds.iterkeys(): - if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent': - seriesStanza['!Series_sample_id'].append(geoMapping[idNum]) - else: - seriesStanza['!Series_sample_id'].append(sampleTitle(expIds[idNum][0], expVars)) - - softfile[series['composite']] = seriesStanza - -def createHighThroughputSoftFile(compositeTrack, cv, expIds, expVars, geoMapping, series, datatype, replace, audit): - - print 'Creating HighThroughput soft file' - - softfile = soft.HighThroughputSoftFile() - fileList = list() - - createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace, audit) - - for idNum in expIds.iterkeys(): - - expId = expIds[idNum] - firstStanza = expId[0] - print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')' - sample = soft.HighThroughputSampleStanza() - - sample['^SAMPLE'] = sampleTitle(firstStanza, expVars, 1) - sample['!Sample_type'] = 'SRA' - sample['!Sample_title'] = sample['^SAMPLE'] - - if 'geoSeriesAccession' in series: - sample['!Sample_series_id'] = series['geoSeriesAccession'] - - count = 1 - - #figure out if the instrument model is consistent across the entire sample - instrumentModel = None - for stanza in expId: - if 'seqPlatform' in stanza: - if instrumentModel == None: - instrumentModel = stanza['seqPlatform'] - else: - if instrumentModel != stanza['seqPlatform']: - instrumentModel = None - if audit: - print 'expId' + str(expId) + ': inconsistent instrument model' - break - - for stanza in expId: - - file = compositeTrack.files[stanza['fileName']] - - if isRawFile(file): - sample['!Sample_raw_file_' + str(count)] = file.name - sample['!Sample_raw_file_type_' + str(count)] = file.extension - - if file.md5sum != None: - sample['!Sample_raw_file_checksum_' + str(count)] = file.md5sum - - if instrumentModel == None and 'seqPlatform' in stanza: - sample['!Sample_raw_file_instrument_model_' + str(count)] = stanza['seqPlatform'] - - fileList.append(file) - count = count + 1 - - count = 1 - - for stanza in expId: - - file = compositeTrack.files[stanza['fileName']] - - if isSupplimentaryFile(file): - sample['!Sample_supplementary_file_' + str(count)] = file.name - - if file.md5sum != None: - sample['!Sample_supplementary_file_checksum_' + str(count)] = file.md5sum - - sample['!Sample_supplementary_file_build_' + str(count)] = compositeTrack.database - - if instrumentModel == None and 'seqPlatform' in stanza: - sample['!Sample_supplementary_file_instrument_model_' + str(count)] = stanza['seqPlatform'] - - fileList.append(file) - count = count + 1 - - sample['!Sample_source_name'] = firstStanza['cell'] - sample['!Sample_organism'] = compositeTrack.organism - - sample['!Sample_characteristics'] = list() - allVars = expVars + mdbWhitelist - - for var in allVars: - if var in firstStanza: - foobar = var - sample['!Sample_characteristics'].append(var + ': ' + firstStanza[var]) - for pretend in cvPretend.iterkeys(): - if var + ' ' + firstStanza[var] == pretend: - foobar = cvPretend[pretend] - if foobar in cvDetails: - for cvVar in cvDetails[foobar]: - if cvVar in cvOverride and cvVar in firstStanza: - sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + firstStanza[cvVar]) - elif cvVar in cv[firstStanza[var]]: - sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + cv[firstStanza[var]][cvVar]) - else: - for cvVar in cvDefaults: - if firstStanza[var] in cv and cvVar in cv[firstStanza[var]]: - sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + cv[firstStanza[var]][cvVar]) - - sample['!Sample_biomaterial_provider'] = cv[firstStanza['cell']]['vendorName'] - - if 'treatment' in firstStanza: - sample['!Sample_treatment_protocol'] = firstStanza['treatment'] - - if 'protocol' in cv[firstStanza['cell']]: - for protocol in cv[firstStanza['cell']]['protocol'].split(' '): - key, val = protocol.split(':') - if key == 'ENCODE' or key == cv[firstStanza['lab']]['labPi']: - sample['!Sample_growth_protocol'] = val - - if datatype.molecule == 'OVERRIDE RNA': - if firstStanza['rnaExtract'] in rnaExtractMapping: - sample['!Sample_molecule'] = rnaExtractMapping[firstStanza['rnaExtract']] - elif firstStanza['localization'] in localizationMapping: - sample['!Sample_molecule'] = localizationMapping[firstStanza['localization']] - - else: - sample['!Sample_molecule'] = datatype.molecule - - sample['!Sample_extract_protocol'] = compositeTrack.url - sample['!Sample_library_strategy'] = datatype.strategy - sample['!Sample_library_source'] = datatype.source - sample['!Sample_library_selection'] = datatype.selection - - # if the instrumentModel is consistent, just use that - # otherwise take the first seqPlatform value from metadata - # if that still fails, check the replacement file - # finally just make it say [REPLACE] - if instrumentModel != None: - sample['!Sample_instrument_model'] = instrumentModel - else: - for stanza in expId: - if 'seqPlatform' in stanza: - sample['!Sample_instrument_model'] = instrumentModels[stanza['seqPlatform']] - break - if '!Sample_instrument_model' not in sample: - if '!Sample_instrument_model' in replace: - sample['!Sample_instrument_model'] = instrumentModels[replace['!Sample_instrument_model'][0]] - if '!Sample_instrument_model' not in sample: - sample['!Sample_instrument_model'] = '[REPLACE]' - if audit: - print stanza.name + ': no instrument' - - sample['!Sample_data_processing'] = compositeTrack.url - - if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent': - sample['!Sample_geo_accession'] = geoMapping[idNum] - - softfile[firstStanza['metaObject']] = sample - - return softfile, fileList - - -def createMicroArraySoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype): - - raise KeyError('microarray') - - print 'Creating MicroArray soft file' - - softfile = SoftFile() - fileList = list() - - createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series) - - for idNum in expIds.iterkeys(): - - expId = expIds[idNum] - firstStanza = expId[0] - print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')' - sample = MicroArraySampleStanza() - - sample['^SAMPLE'] = sampleTitle(firstStanza, expVars) - - if 'geoSeriesAccession' in series: - sample['!Sample_series_id'] = series['geoSeriesAccession'] - - sample['!Sample_title'] = concat - - count = 1 - - for stanza in expId: - - if isSupplimentaryFile(stanza['fileName']): - sample['!Sample_supplementary_file_' + str(count)] = stanza['fileName'] - - if 'checksum' in stanza: - sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum'] - elif md5sums != None and stanza['fileName'] in md5sums: - sample['!Sample_supplementary_file_checksum_' + str(count)] = md5sums[stanza['fileName']] - - fileList.append(stanza['fileName']) - count = count + 1 - - # sample['!Sample_table'] = KeyOptional # CEL file - sample['!Sample_source_name_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered - sample['!Sample_organism_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered - sample['!Sample_characteristics_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered - # sample['!Sample_biomaterial_provider_ch'] = KeyZeroPlusNumbered - # sample['!Sample_treatment_protocol_ch'] = KeyZeroPlusNumbered - # sample['!Sample_growth_protocol_ch'] = KeyZeroPlusNumbered - sample['!Sample_molecule_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered - sample['!Sample_extract_protocol_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered - sample['!Sample_label_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered - sample['!Sample_label_protocol_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered - sample['!Sample_hyb_protocol'] = '[REPLACE]' #KeyOnePlus - sample['!Sample_scan_protocol'] = '[REPLACE]' #KeyOnePlus - sample['!Sample_data_processing'] = '[REPLACE]' #KeyOnePlus - sample['!Sample_description'] = '[REPLACE]' #KeyZeroPlus - sample['!Sample_platform_id'] = '[REPLACE]' - # sample['!Sample_geo_accession'] = KeyOptional - # sample['!Sample_anchor'] = KeyRequired SAGE ONLY - # sample['!Sample_type'] = KeyRequired SAGE ONLY - # sample['!Sample_tag_count'] = KeyRequired SAGE ONLY - # sample['!Sample_tag_length'] = KeyRequired SAGE ONLY - sample['!Sample_table_begin'] = '' - sample['!Sample_table_end'] = '' - - softfile[firstStanza['accession']] = sample - - return softfile, fileList - - -def createSpecialSoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype): - softfile = SoftFile() - fileList = list() - - createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series) - - for idNum in expIds.iterkeys(): - - expId = expIds[idNum] - firstStanza = expId[0] - print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')' - sample = HighThroughputSampleStanza() - - hasbigwig = 0 - for stanza in expId: - - if getFileType(stanza['fileName']) == 'bigWig': - hasbigwig = 1 - - if hasbigwig == 0: - continue - - sample['^SAMPLE'] = firstStanza['geoSampleAccession'] - - if 'geoSeriesAccession' in series: - sample['!Sample_series_id'] = series['geoSeriesAccession'] - - sample['!Sample_geo_accession'] = firstStanza['geoSampleAccession'] - - count = 1 - - for stanza in expId: - - if getFileType(stanza['fileName']) == 'bigWig': - sample['!Sample_supplementary_file_' + str(count)] = stanza['fileName'] - - if 'checksum' in stanza: - sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum'] - elif md5sums != None and stanza['fileName'] in md5sums: - sample['!Sample_supplementary_file_checksum_' + str(count)] = md5sums[stanza['fileName']] - - # sample['!Sample_supplementary_file_build_' + str(count)] = database - - fileList.append(stanza['fileName']) - count = count + 1 - - softfile[firstStanza['geoSampleAccession']] = sample - - return softfile, fileList - -def main(): - - parser = argparse.ArgumentParser(description = 'Prepares a submission to GEO. Creates a soft file and shell script with the correct call to aspera.') - parser.add_argument('-t', '--trackPath', help='Overrides the default track path ~/kent/src/hg/makeDb/trackDb/') - parser.add_argument('-r', '--replace', help='Give the name of a file that has contents to be used to replace unspecified tags in metadata (description, contributers, etc) and instrument model') - parser.add_argument('-a', '--audit', action='store_true', default=False, help='Instead of building the files, will just give you a list of errors') - parser.add_argument('database', help='The database, typically hg19 or mm9') - parser.add_argument('composite', help='The composite name, wgEncodeCshlLongRnaSeq for instance') - parser.add_argument('expIds', nargs='*', help='Any number of expIds separated by spaces, you can also specify a range by using a hyphen, "140 150 160-170" for instance, or leave blank to specify the entire file') - - if len(sys.argv) == 1: - parser.print_usage() - return - - args = parser.parse_args(sys.argv[1:]) - - compositeTrack = track.CompositeTrack(args.database, args.composite, args.trackPath) - - cvPath = compositeTrack.trackPath + 'cv/alpha/cv.ra' - controlledVocab = cv.CvFile(cvPath) - - replace = dict() - if args.replace != None: - for line in open(args.replace): - if line == '': - continue - key, val = map(str.strip, line.split('=', 1)) - if key not in replace: - replace[key] = list() - replace[key].append(val) - - - ids = list() - - for id in args.expIds: - if '-' in id: - start, end = id.split('-', 1) - ids.extend(range(int(start), int(end) + 1)) - else: - ids.append(int(id)) - - expIds, expVars, geoMapping, series, datatype = createMappings(compositeTrack.alphaMetaDb) - - submission = dict() - if len(ids) == 0: - submission = expIds - else: - for expId in ids: - submission[str(expId)] = expIds[str(expId)] - - expIdStr = ' ' - for id in args.expIds: - expIdStr = expIdStr + id + ',' - expIdStr = expIdStr[:len(expIdStr) - 1] - print 'Generating soft using expIds ' + expIdStr - - if datatype.soft == soft.HighThroughputSoftFile: - softfile, fileList = createHighThroughputSoftFile(compositeTrack, controlledVocab, submission, expVars, geoMapping, series, datatype, replace, args.audit) - elif datatype.soft == soft.MicroArraySoftFile: - softfile, fileList = createMicroArraySoftFile(compositeTrack, controlledVocab, submission, expVars, geoMapping, series, datatype) - else: - raise KeyError('unsupported type') - - if not args.audit: - print 'Creating directory' - - d = datetime.datetime.today() - datestring = '%4d-%02d-%02d' % (d.year, d.month, d.day) - - dirname = '%s_%s_%s/' % (compositeTrack.database, compositeTrack.name, datestring) - asperadirname = '%s_%s/' % (compositeTrack.database, compositeTrack.name) - linkdirname = '%s_%s/' % (compositeTrack.database, compositeTrack.name) - - os.mkdir(dirname) - os.mkdir(dirname + linkdirname) - - print 'Writing file' - - outfileName = '%s%s_%s.soft' % (dirname, compositeTrack.database, compositeTrack.name) - outfile = open(outfileName, 'w') - outfile.write(str(softfile)) - fileslistname = '%sfiles.txt' % dirname - fileslist = open(fileslistname, 'w') - scriptname = '%supload.sh' % dirname - outscript = open(scriptname, 'w') - - outscript.write('#!/bin/sh\n\n') - outscript.write('/opt/aspera/connect/bin/ascp -i ~/encode_geo_key/encode_geo_key.ppk --symbolic-links=follow -QTdr -l300m %s asp-geo@upload.ncbi.nlm.nih.gov:ENCODE\n' % linkdirname) - outscript.close() - - for file in fileList: - if not os.path.exists(file.path): - print IOError(file.path + ' does not exist') - elif not args.audit: - linkname = '%s_%s' % (compositeTrack.database, file.name) - linkpath = linkdirname + linkname - os.symlink(file.fullname, dirname + linkpath) - - #outscript.write(linkpath + ' \\\n') - fileslist.write(linkname + '\n') - - if not args.audit: - #outscript.write() - - fileslist.close() - - os.system('chmod +x ' + scriptname) - - print 'Finished!' - -if __name__ == '__main__': - main() \ No newline at end of file