20e6eff593096b1ec381de4ee5058c5edafd2bba mmaddren Tue Aug 30 15:47:54 2011 -0700 added protocol checking functionality, partial support for deprecated, and revised the error messages to be more consistent diff --git python/programs/mkGeoPkg/mkGeoPkg python/programs/mkGeoPkg/mkGeoPkg index 5dd0ec7..a83a9bb 100755 --- python/programs/mkGeoPkg/mkGeoPkg +++ python/programs/mkGeoPkg/mkGeoPkg @@ -41,33 +41,37 @@ 'Mapability': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'MethylArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'NRE': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Nucleosome': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RnaChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RipGeneSt': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RipTiling': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RipChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RipSeq': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Switchgear': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'TfbsValid': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None) } #compare this to the source in datatype, give GP ids depending on the type gpIds = { - 'genomic': '63443', - 'transcriptomic': '30709', - 'protein': '63447' + 'human genomic': '63443', + 'human transcriptomic': '30709', + 'human protein': '63447', + + 'mouse genomic': '63471', + 'mouse transcriptomic': '66167', + 'mouse protein': '63475' } cvDetails = { 'cell': [ 'organism', 'description', 'karyotype', 'lineage', 'sex' ], 'antibody': [ 'antibodyDescription', 'targetDescription', 'vendorName', 'vendorId' ] } #if the term appears in the mdb and must overriding the value in the cv cvOverride = [ 'sex' ] #talk to Venkat lol cvPretend = { 'antibody Input': 'control' } #if its not in cvDetails, which things should we check by default cvDefaults = [ 'description' ] @@ -117,37 +121,37 @@ # map our instrument names to GEO's names instrumentModels = { 'Illumina_GA2x': 'Illumina Genome Analyzer II', 'Illumina_GA2': 'Illumina Genome Analyzer II', 'Illumina_HiSeq_2000': 'Illumina HiSeq 2000' } def isRawFile(file): return (file.extension == 'fastq' or file.extension == 'fasta') def isSupplimentaryFile(file): return not isRawFile(file) -def sampleTitle(stanza, expVars): +def sampleTitle(stanza, expVars, warn=False): concat = stanza[expVars[0]].replace('-m', '') for expVar in expVars[1:len(expVars)]: - if expVar in stanza: + if expVar in stanza and stanza[expVar] != 'None': concat += '_' + stanza[expVar] - else: - print 'warning: %s not in %s' % (expVar, stanza.name) + elif warn: + print 'warning: %s is None or not in %s' % (expVar, stanza.name) return concat def createMappings(mdb): expIds = dict() geoMapping = dict() expVars = None series = None datatype = None for stanza in mdb.itervalues(): if 'objType' in stanza and stanza['objType'] == 'composite': series = stanza expVars = stanza['expVars'].split(',') continue @@ -184,77 +188,77 @@ def createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace): if 'geoSeriesAccession' in series: print 'Existing series ' + series['composite'] + ' using geoSeriesAccession ' + series['geoSeriesAccession'] return print 'Writing series ' + series['composite'] seriesStanza = SeriesStanza() seriesStanza['^SERIES'] = series['composite'] seriesStanza['!Series_title'] = compositeTrack.trackDb[compositeTrack.name]['longLabel'] #STILL INCORRECT if '!Series_summary' in replace: seriesStanza['!Series_summary'] = replace['!Series_summary'] else: - raise Error('no series summary found. Please include in replace file.') + print 'warning: no series summary found. Please include in replace file.' seriesStanza['!Series_summary'] = '[REPLACE]' if '!Series_overall_design' in replace: seriesStanza['!Series_overall_design'] = replace['!Series_overall_design'] else: - raise Error('no series overall design found. Please include in replace file.') + print 'no series overall design found. Please include in replace file.' seriesStanza['!Series_overall_design'] = '[REPLACE]' seriesStanza['!Series_web_link'] = [ compositeTrack.url, 'http://www.ncbi.nlm.nih.gov/geo/info/ENCODE.html' ] if '!Series_contributor' in replace: seriesStanza['!Series_contributor'] = replace['!Series_contributor'] else: seriesStanza['!Series_contributor'] = '[REPLACE]' - seriesStanza['!Series_gp_id'] = gpIds[datatype.source] + seriesStanza['!Series_gp_id'] = gpIds[compositeTrack.organism + ' ' + datatype.source] # could use !Series_variable_* and !Series_repeats_* seriesStanza['!Series_sample_id'] = list() for idNum in expIds.iterkeys(): if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent': seriesStanza['!Series_sample_id'].append(geoMapping[idNum]) else: seriesStanza['!Series_sample_id'].append(sampleTitle(expIds[idNum][0], expVars)) softfile[series['composite']] = seriesStanza def createHighThroughputSoftFile(compositeTrack, cv, expIds, expVars, geoMapping, series, datatype, instrument, replace): print 'Creating HighThroughput soft file' softfile = HighThroughputSoftFile() fileList = list() createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace) for idNum in expIds.iterkeys(): expId = expIds[idNum] firstStanza = expId[0] print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')' sample = HighThroughputSampleStanza() - sample['^SAMPLE'] = sampleTitle(firstStanza, expVars) + sample['^SAMPLE'] = sampleTitle(firstStanza, expVars, 1) sample['!Sample_type'] = 'SRA' sample['!Sample_title'] = sample['^SAMPLE'] if 'geoSeriesAccession' in series: sample['!Sample_series_id'] = series['geoSeriesAccession'] count = 1 for stanza in expId: file = compositeTrack.files[stanza['fileName']] if isRawFile(file): sample['!Sample_raw_file_' + str(count)] = file.name sample['!Sample_raw_file_type_' + str(count)] = file.extension @@ -509,31 +513,31 @@ for id in args.expIds: if '-' in id: start, end = id.split('-', 1) ids.extend(range(int(start), int(end) + 1)) else: ids.append(int(id)) expIds, expVars, geoMapping, series, datatype = createMappings(compositeTrack.alphaMetaDb) submission = dict() if len(ids) == 0: submission = expIds else: for expId in ids: - submission[expId] = expIds[expId] + submission[str(expId)] = expIds[str(expId)] expIdStr = ' ' for id in args.expIds: expIdStr = expIdStr + id + ',' expIdStr = expIdStr[:len(expIdStr) - 1] print 'Generating soft using expIds ' + expIdStr if datatype.soft == HighThroughputSoftFile: softfile, fileList = createHighThroughputSoftFile(compositeTrack, cv, submission, expVars, geoMapping, series, datatype, args.instrument, replace) elif datatype.soft == MicroArraySoftFile: softfile, fileList = createMicroArraySoftFile(compositeTrack, cv, submission, expVars, geoMapping, series, datatype) else: raise KeyError('unsupported type') print 'Creating directory'