91cb10cf4bacaac09c32b12cbbe30a5f1e47da2b mmaddren Mon Jul 25 14:14:24 2011 -0700 slight reworking in preparation for new module integration diff --git python/programs/mkGeoPkg/mkGeoPkg python/programs/mkGeoPkg/mkGeoPkg index ee808d4..c7732ac 100755 --- python/programs/mkGeoPkg/mkGeoPkg +++ python/programs/mkGeoPkg/mkGeoPkg @@ -146,38 +146,38 @@ expVars = None series = None datatype = None for stanza in mdb.itervalues(): if 'objType' in stanza and stanza['objType'] == 'composite': series = stanza expVars = stanza['expVars'].split(',') continue if 'expId' not in stanza: print stanza.name + ': no expId' continue - #if 'geoSampleAccession' not in stanza: + if 'geoSampleAccession' not in stanza: # if this hasn't been submitted to GEO yet, we'll add it to the submission list if stanza['expId'] not in expIds: expIds[stanza['expId']] = list() expIds[stanza['expId']].append(stanza) - if 'geoSampleAccession' in stanza: + else: # otherwise we keep track of the geo number for partially submitted samples if stanza['expId'] not in geoMapping: geoMapping[stanza['expId']] = stanza['geoSampleAccession'] elif geoMapping[stanza['expId']] != 'Inconsistent' and geoMapping[stanza['expId']] != stanza['geoSampleAccession']: geoMapping[stanza['expId']] = 'Inconsistent' print stanza.name + ': inconsistent geo mapping' if datatype == None and 'dataType' in stanza: datatype = stanza['dataType'] elif datatype != None and 'dataType' in stanza and datatype != stanza['dataType']: raise KeyError(stanza.name + ': inconsistent data type') datatype = datatypes[datatype] @@ -224,36 +224,37 @@ def createHighThroughputSoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype): print 'Creating HighThroughput soft file' softfile = SoftFile() fileList = list() createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series) for idNum in expIds.iterkeys(): expId = expIds[idNum] firstStanza = expId[0] print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')' sample = HighThroughputSampleStanza() - sample['^SAMPLE'] = firstStanza['metaObject'] - sample['!Sample_type'] = 'SRA' concat = expVars[0] for expVar in expVars[1:len(expVars)]: concat += '_' + firstStanza[expVar] + + sample['^SAMPLE'] = concat + sample['!Sample_type'] = 'SRA' sample['!Sample_title'] = concat if 'geoSeriesAccession' in series: sample['!Sample_series_id'] = series['geoSeriesAccession'] count = 1 for stanza in expId: if isRawFile(stanza['fileName']): sample['!Sample_raw_file_' + str(count)] = stanza['fileName'] sample['!Sample_raw_file_type_' + str(count)] = getFileType(stanza['fileName']) if 'checksum' in stanza: sample['!Sample_raw_file_checksum_' + str(count)] = stanza['checksum'] @@ -348,106 +349,82 @@ def createMicroArraySoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype): print 'Creating MicroArray soft file' softfile = SoftFile() fileList = list() createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series) for idNum in expIds.iterkeys(): expId = expIds[idNum] firstStanza = expId[0] print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')' sample = MicroArraySampleStanza() - sample['^SAMPLE'] = firstStanza['accession'] + + concat = expVars[0] + for expVar in expVars[1:len(expVars)]: + concat += '_' + firstStanza[expVar] + + sample['^SAMPLE'] = concat if 'geoSeriesAccession' in series: sample['!Sample_series_id'] = series['geoSeriesAccession'] - #concat = expVars[0] - #for expVar in expVars[1:len(expVars)]: - # concat += '_' + firstStanza[expVar] - #sample['!Sample_title'] = concat - - sample['!Sample_geo_accession'] = firstStanza['accession'] + sample['!Sample_title'] = concat count = 1 for stanza in expId: if isSupplimentaryFile(stanza['fileName']): sample['!Sample_supplementary_file_' + str(count)] = stanza['fileName'] if 'checksum' in stanza: sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum'] elif md5sums != None and stanza['fileName'] in md5sums: sample['!Sample_supplementary_file_checksum_' + str(count)] = md5sums[stanza['fileName']] - # sample['!Sample_supplementary_file_build_' + str(count)] = database - fileList.append(stanza['fileName']) count = count + 1 - softfile[firstStanza['accession']] = sample - # sample['!Sample_table'] = KeyOptional # CEL file - # sample['!Sample_source_name_ch'] = KeyOnePlusNumbered - # sample['!Sample_organism_ch'] = KeyOnePlusNumbered - # sample['!Sample_characteristics_ch'] = KeyOnePlusNumbered + sample['!Sample_source_name_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered + sample['!Sample_organism_ch'] = '[REPLACE]' #KeyOnePlusNumbered + sample['!Sample_characteristics_ch'] = '[REPLACE]' #KeyOnePlusNumbered # sample['!Sample_biomaterial_provider_ch'] = KeyZeroPlusNumbered # sample['!Sample_treatment_protocol_ch'] = KeyZeroPlusNumbered # sample['!Sample_growth_protocol_ch'] = KeyZeroPlusNumbered - # sample['!Sample_molecule_ch'] = KeyOnePlusNumbered - # sample['!Sample_extract_protocol_ch'] = KeyOnePlusNumbered - # sample['!Sample_label_ch'] = KeyOnePlusNumbered - # sample['!Sample_label_protocol_ch'] = KeyOnePlusNumbered - # sample['!Sample_hyb_protocol'] = KeyOnePlus - # sample['!Sample_scan_protocol'] = KeyOnePlus - # sample['!Sample_data_processing'] = KeyOnePlus - # sample['!Sample_description'] = KeyZeroPlus - # sample['!Sample_platform_id'] = KeyRequired + sample['!Sample_molecule_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered + sample['!Sample_extract_protocol_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered + sample['!Sample_label_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered + sample['!Sample_label_protocol_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered + sample['!Sample_hyb_protocol'] = '[REPLACE]' #KeyOnePlus + sample['!Sample_scan_protocol'] = '[REPLACE]' #KeyOnePlus + sample['!Sample_data_processing'] = '[REPLACE]' #KeyOnePlus + sample['!Sample_description'] = '[REPLACE]' #KeyZeroPlus + sample['!Sample_platform_id'] = '[REPLACE]' # sample['!Sample_geo_accession'] = KeyOptional - # sample['!Sample_anchor'] = KeyRequired - # sample['!Sample_type'] = KeyRequired - # sample['!Sample_tag_count'] = KeyRequired - # sample['!Sample_tag_length'] = KeyRequired - # sample['!Sample_table_begin'] = KeyRequired - # sample['!Sample_table_end'] = KeyRequired - - # for idk: - - # platform = PlatformStanza() - - # platform['^PLATFORM'] = KeyRequired - # platform['!Platform_title'] = KeyRequired - # platform['!Platform_distribution'] = KeyRequired - # platform['!Platform_technology'] = KeyRequired - # platform['!Platform_organism'] = KeyOnePlus - # platform['!Platform_manufacturer'] = KeyRequired - # platform['!Platform_manufacture_protocol'] = KeyOnePlus - # platform['!Platform_catalog_number'] = KeyZeroPlus - # platform['!Platform_web_link'] = KeyZeroPlus - # platform['!Platform_support'] = KeyOptional - # platform['!Platform_coating'] = KeyOptional - # platform['!Platform_description'] = KeyZeroPlus - # platform['!Platform_contributor'] = KeyZeroPlus - # platform['!Platform_pubmed_id'] = KeyZeroPlus - # platform['!Platform_geo_accession'] = KeyOptional - # platform['!Platform_table_begin'] = KeyRequired - # platform['!Platform_table_end'] = KeyRequired + # sample['!Sample_anchor'] = KeyRequired SAGE ONLY + # sample['!Sample_type'] = KeyRequired SAGE ONLY + # sample['!Sample_tag_count'] = KeyRequired SAGE ONLY + # sample['!Sample_tag_length'] = KeyRequired SAGE ONLY + sample['!Sample_table_begin'] = '' + sample['!Sample_table_end'] = '' + + softfile[firstStanza['accession']] = sample return softfile, fileList def createSpecialSoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype): softfile = SoftFile() fileList = list() createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series) for idNum in expIds.iterkeys(): expId = expIds[idNum] firstStanza = expId[0] print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')' @@ -520,43 +497,45 @@ submission = dict() sortedIds = expIds.keys() sortedIds.sort() print sortedIds if wholeComposite == 0: sortedIds = sortedIds[sortedIds.index(submitStart):sortedIds.index(submitStart) + submitSize] minId = min(sortedIds) maxId = max(sortedIds) print 'Generating soft using expIds ' + minId + ' to ' + maxId for expId in sortedIds: submission[expId] = expIds[expId] if datatype.soft == HighThroughputSoftFile: - softfile, fileList = createSpecialSoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, submission, expVars, geoMapping, series, datatype) + softfile, fileList = createHighThroughputSoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, submission, expVars, geoMapping, series, datatype) elif datatype.soft == MicroArraySoftFile: softfile, fileList = createMicroArraySoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, submission, expVars, geoMapping, series, datatype) else: raise KeyError('unsupported type') print 'Writing soft file' outfileName = os.path.dirname(sys.argv[0]) + composite + '.soft' outfile = open(outfileName, 'w') outfile.write(str(softfile)) fileString = outfileName for file in fileList: + if not os.path.exists(file): + raise IOError(file) fileString = fileString + ' ' + downloadsDirectory + file fileString.strip() callString = '/opt/aspera/connect/bin/ascp -i ~/encode_geo_key/encode_geo_key.ppk -QTr -l300m ' + fileString + ' asp-geo@upload.ncbi.nlm.nih.gov:ENCODE/' + composite outscript = open(composite + minId + '-' + maxId + '.sh', 'w') outscript.write('#!/bin/sh\n\n') outscript.write('/opt/aspera/connect/bin/ascp -i ~/encode_geo_key/encode_geo_key.ppk -QTr -l300m \\\n') outscript.write(os.path.dirname(sys.argv[0]) + composite + '.soft' + ' \\\n') for file in fileList: if not os.path.exists(downloadsDirectory + file): raise FileError(downloadsDirectory + file + ' does not exist') outscript.write(downloadsDirectory + file + ' \\\n')