8aea88625794a4942add9bf0b2ed9660f98404bf mmaddren Tue Jul 19 15:40:41 2011 -0700 temporary commit for adding additional bigWigs to cshllong diff --git python/programs/mkGeoPkg/mkGeoPkg python/programs/mkGeoPkg/mkGeoPkg index 2930f66..ee808d4 100755 --- python/programs/mkGeoPkg/mkGeoPkg +++ python/programs/mkGeoPkg/mkGeoPkg @@ -146,38 +146,38 @@ expVars = None series = None datatype = None for stanza in mdb.itervalues(): if 'objType' in stanza and stanza['objType'] == 'composite': series = stanza expVars = stanza['expVars'].split(',') continue if 'expId' not in stanza: print stanza.name + ': no expId' continue - if 'geoSampleAccession' not in stanza: + #if 'geoSampleAccession' not in stanza: # if this hasn't been submitted to GEO yet, we'll add it to the submission list if stanza['expId'] not in expIds: expIds[stanza['expId']] = list() expIds[stanza['expId']].append(stanza) - else: + if 'geoSampleAccession' in stanza: # otherwise we keep track of the geo number for partially submitted samples if stanza['expId'] not in geoMapping: geoMapping[stanza['expId']] = stanza['geoSampleAccession'] elif geoMapping[stanza['expId']] != 'Inconsistent' and geoMapping[stanza['expId']] != stanza['geoSampleAccession']: geoMapping[stanza['expId']] = 'Inconsistent' print stanza.name + ': inconsistent geo mapping' if datatype == None and 'dataType' in stanza: datatype = stanza['dataType'] elif datatype != None and 'dataType' in stanza and datatype != stanza['dataType']: raise KeyError(stanza.name + ': inconsistent data type') datatype = datatypes[datatype] @@ -209,31 +209,31 @@ #stanza['!Series_repeats_sample_list_1'] = 'replist1' # ^ seriesStanza['!Series_sample_id'] = list() for idNum in expIds.iterkeys(): if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent': seriesStanza['!Series_sample_id'].append(geoMapping[idNum]) else: seriesStanza['!Series_sample_id'].append(expIds[idNum][0]['metaObject']) if 'geoAccession' in series: seriesStanza['!Series_geo_accession'] = series['geoAccession'] softfile[series['composite']] = seriesStanza -def createHighThroughputSoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype, copyDirectory): +def createHighThroughputSoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype): print 'Creating HighThroughput soft file' softfile = SoftFile() fileList = list() createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series) for idNum in expIds.iterkeys(): expId = expIds[idNum] firstStanza = expId[0] print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')' sample = HighThroughputSampleStanza() sample['^SAMPLE'] = firstStanza['metaObject'] @@ -333,31 +333,31 @@ sample['!Sample_instrument_model'] = '[REPLACE]' for stanza in expId: if 'seqPlatform' in stanza: sample['!Sample_instrument_model'] = instrumentModels[stanza['seqPlatform']] sample['!Sample_data_processing'] = compositeUrl if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent': sample['!Sample_geo_accession'] = geoMapping[idNum] softfile[firstStanza['metaObject']] = sample return softfile, fileList -def createMicroArraySoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype, copyDirectory): +def createMicroArraySoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype): print 'Creating MicroArray soft file' softfile = SoftFile() fileList = list() createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series) for idNum in expIds.iterkeys(): expId = expIds[idNum] firstStanza = expId[0] print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')' sample = MicroArraySampleStanza() sample['^SAMPLE'] = firstStanza['accession'] @@ -428,75 +428,125 @@ # platform['!Platform_manufacture_protocol'] = KeyOnePlus # platform['!Platform_catalog_number'] = KeyZeroPlus # platform['!Platform_web_link'] = KeyZeroPlus # platform['!Platform_support'] = KeyOptional # platform['!Platform_coating'] = KeyOptional # platform['!Platform_description'] = KeyZeroPlus # platform['!Platform_contributor'] = KeyZeroPlus # platform['!Platform_pubmed_id'] = KeyZeroPlus # platform['!Platform_geo_accession'] = KeyOptional # platform['!Platform_table_begin'] = KeyRequired # platform['!Platform_table_end'] = KeyRequired return softfile, fileList +def createSpecialSoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype): + softfile = SoftFile() + fileList = list() + + createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series) + + for idNum in expIds.iterkeys(): + + expId = expIds[idNum] + firstStanza = expId[0] + print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')' + sample = HighThroughputSampleStanza() + + hasbigwig = 0 + for stanza in expId: + + if getFileType(stanza['fileName']) == 'bigWig': + hasbigwig = 1 + + if hasbigwig == 0: + continue + + sample['^SAMPLE'] = firstStanza['geoSampleAccession'] + + if 'geoSeriesAccession' in series: + sample['!Sample_series_id'] = series['geoSeriesAccession'] + + sample['!Sample_geo_accession'] = firstStanza['geoSampleAccession'] + + count = 1 + + for stanza in expId: + + if getFileType(stanza['fileName']) == 'bigWig': + sample['!Sample_supplementary_file_' + str(count)] = stanza['fileName'] + + if 'checksum' in stanza: + sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum'] + elif md5sums != None and stanza['fileName'] in md5sums: + sample['!Sample_supplementary_file_checksum_' + str(count)] = md5sums[stanza['fileName']] + + # sample['!Sample_supplementary_file_build_' + str(count)] = database + + fileList.append(stanza['fileName']) + count = count + 1 + + softfile[firstStanza['geoSampleAccession']] = sample + + return softfile, fileList + def main(): database = sys.argv[1] composite = sys.argv[2] wholeComposite = 1 if len(sys.argv) == 5: submitStart = sys.argv[3] submitSize = int(sys.argv[4]) wholeComposite = 0 organism = organisms[database] mdbPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/metaDb/alpha/' + composite + '.ra' #CHANGE cvPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/cv/alpha/cv.ra' #CHANGE trackPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/' + composite + '.ra' md5path = '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + composite + '/md5sum.txt' downloadsDirectory = '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + composite + '/' compositeUrl = 'http://genome.ucsc.edu/cgi-bin/hgTrackUi?db=' + database + '&g=' + composite mdb = RaFile(mdbPath) cv = CvFile(cvPath) track = RaFile(trackPath) md5sums = readMd5sums(md5path) - print md5sums expIds, expVars, geoMapping, series, datatype = createMappings(mdb) submission = dict() sortedIds = expIds.keys() sortedIds.sort() + print sortedIds if wholeComposite == 0: sortedIds = sortedIds[sortedIds.index(submitStart):sortedIds.index(submitStart) + submitSize] minId = min(sortedIds) maxId = max(sortedIds) print 'Generating soft using expIds ' + minId + ' to ' + maxId for expId in sortedIds: submission[expId] = expIds[expId] if datatype.soft == HighThroughputSoftFile: - softfile, fileList = createHighThroughputSoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, submission, expVars, geoMapping, series, datatype, copyDirectory) + softfile, fileList = createSpecialSoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, submission, expVars, geoMapping, series, datatype) elif datatype.soft == MicroArraySoftFile: - softfile, fileList = createMicroArraySoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, submission, expVars, geoMapping, series, datatype, copyDirectory) + softfile, fileList = createMicroArraySoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, submission, expVars, geoMapping, series, datatype) else: raise KeyError('unsupported type') print 'Writing soft file' outfileName = os.path.dirname(sys.argv[0]) + composite + '.soft' outfile = open(outfileName, 'w') outfile.write(str(softfile)) fileString = outfileName for file in fileList: fileString = fileString + ' ' + downloadsDirectory + file fileString.strip() callString = '/opt/aspera/connect/bin/ascp -i ~/encode_geo_key/encode_geo_key.ppk -QTr -l300m ' + fileString + ' asp-geo@upload.ncbi.nlm.nih.gov:ENCODE/' + composite outscript = open(composite + minId + '-' + maxId + '.sh', 'w')