python/programs/mkGeoPkg/mkGeoPkg 20e6eff593096b1ec381de4ee5058c5edafd2bba

20e6eff593096b1ec381de4ee5058c5edafd2bba
mmaddren
  Tue Aug 30 15:47:54 2011 -0700
Added protocol checking functionality and partial support for deprecated; revised the error messages to be more consistent.
diff --git python/programs/mkGeoPkg/mkGeoPkg python/programs/mkGeoPkg/mkGeoPkg
index 5dd0ec7..a83a9bb 100755
--- python/programs/mkGeoPkg/mkGeoPkg
+++ python/programs/mkGeoPkg/mkGeoPkg
@@ -41,33 +41,37 @@
 	'Mapability': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'MethylArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'NRE': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Nucleosome': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'RnaChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'RipGeneSt': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'RipTiling': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'RipChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'RipSeq': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Switchgear': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'TfbsValid': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None)
 }
 
 #compare this to the source in datatype, give GP ids depending on the type
 gpIds = {
-	'genomic': '63443',
-	'transcriptomic': '30709',
-	'protein': '63447'
+	'human genomic': '63443',
+	'human transcriptomic': '30709',
+	'human protein': '63447',
+	
+	'mouse genomic': '63471',
+	'mouse transcriptomic': '66167',
+	'mouse protein': '63475'
 }
 
 cvDetails = {
 	'cell':	[ 'organism', 'description', 'karyotype', 'lineage', 'sex' ],
 	'antibody': [ 'antibodyDescription', 'targetDescription', 'vendorName', 'vendorId' ]
 }
 
 #if the term appears in the mdb and must overriding the value in the cv
 cvOverride = [ 'sex' ]
 
 #talk to Venkat lol
 cvPretend = { 'antibody Input': 'control' }
 
 #if its not in cvDetails, which things should we check by default
 cvDefaults = [ 'description' ]
@@ -117,37 +121,37 @@
 
 # map our instrument names to GEO's names
 instrumentModels = {
 	'Illumina_GA2x': 'Illumina Genome Analyzer II',
 	'Illumina_GA2': 'Illumina Genome Analyzer II',
 	'Illumina_HiSeq_2000': 'Illumina HiSeq 2000'
 }
 
 	
 def isRawFile(file):
 	return (file.extension == 'fastq' or file.extension == 'fasta')
 	
 def isSupplimentaryFile(file):
 	return not isRawFile(file)
 
-def sampleTitle(stanza, expVars):
+def sampleTitle(stanza, expVars, warn=False):
 	concat = stanza[expVars[0]].replace('-m', '')
 	for expVar in expVars[1:len(expVars)]:
-		if expVar in stanza:
+		if expVar in stanza and stanza[expVar] != 'None':
 			concat += '_' + stanza[expVar]
-		else:
-			print 'warning: %s not in %s' % (expVar, stanza.name)
+		elif warn:
+			print 'warning: %s is None or not in %s' % (expVar, stanza.name)
 	return concat
 	
 def createMappings(mdb):
 	expIds = dict()
 	geoMapping = dict()
 	expVars = None
 	series = None
 	datatype = None
 	
 	for stanza in mdb.itervalues():
 		
 		if 'objType' in stanza and stanza['objType'] == 'composite':
 			series = stanza
 			expVars = stanza['expVars'].split(',')
 			continue
@@ -184,77 +188,77 @@
 def createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace):
 	
 	if 'geoSeriesAccession' in series:
 		print 'Existing series ' + series['composite'] + ' using geoSeriesAccession ' + series['geoSeriesAccession']
 		return
 		
 	print 'Writing series ' + series['composite']
 	
 	seriesStanza = SeriesStanza()
 	seriesStanza['^SERIES'] = series['composite']
 	seriesStanza['!Series_title'] = compositeTrack.trackDb[compositeTrack.name]['longLabel'] #STILL INCORRECT
 	
 	if '!Series_summary' in replace:
 		seriesStanza['!Series_summary'] = replace['!Series_summary']
 	else:
-		raise Error('no series summary found. Please include in replace file.')
+		print 'warning: no series summary found. Please include in replace file.'
 		seriesStanza['!Series_summary'] = '[REPLACE]'
 		
 	if '!Series_overall_design' in replace:
 		seriesStanza['!Series_overall_design'] = replace['!Series_overall_design']
 	else:
-		raise Error('no series overall design found. Please include in replace file.')
+		print 'no series overall design found. Please include in replace file.'
 		seriesStanza['!Series_overall_design'] = '[REPLACE]'
 		
 	seriesStanza['!Series_web_link'] = [ compositeTrack.url, 'http://www.ncbi.nlm.nih.gov/geo/info/ENCODE.html' ]
 	
 	if '!Series_contributor' in replace:
 		seriesStanza['!Series_contributor'] = replace['!Series_contributor']
 	else:
 		seriesStanza['!Series_contributor'] = '[REPLACE]'
 		
-	seriesStanza['!Series_gp_id'] = gpIds[datatype.source]
+	seriesStanza['!Series_gp_id'] = gpIds[compositeTrack.organism + ' ' + datatype.source]
 	
 	# could use !Series_variable_* and !Series_repeats_*
 	
 	seriesStanza['!Series_sample_id'] = list()
 	
 	for idNum in expIds.iterkeys():
 		if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent':
 			seriesStanza['!Series_sample_id'].append(geoMapping[idNum])
 		else:
 			seriesStanza['!Series_sample_id'].append(sampleTitle(expIds[idNum][0], expVars))
 	
 	softfile[series['composite']] = seriesStanza
 	
 def createHighThroughputSoftFile(compositeTrack, cv, expIds, expVars, geoMapping, series, datatype, instrument, replace):
 	
 	print 'Creating HighThroughput soft file'
 
 	softfile = HighThroughputSoftFile()
 	fileList = list()
 	
 	createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace)
 		
 	for idNum in expIds.iterkeys():
 		
 		expId = expIds[idNum]
 		firstStanza = expId[0]
 		print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')'
 		sample = HighThroughputSampleStanza()
 
-		sample['^SAMPLE'] = sampleTitle(firstStanza, expVars)
+		sample['^SAMPLE'] = sampleTitle(firstStanza, expVars, 1)
 		sample['!Sample_type'] = 'SRA'
 		sample['!Sample_title'] = sample['^SAMPLE']
 		
 		if 'geoSeriesAccession' in series:
 			sample['!Sample_series_id'] = series['geoSeriesAccession']
 			
 		count = 1
 		
 		for stanza in expId:
 		
 			file = compositeTrack.files[stanza['fileName']]
 			
 			if isRawFile(file):
 				sample['!Sample_raw_file_' + str(count)] = file.name
 				sample['!Sample_raw_file_type_' + str(count)] = file.extension
@@ -509,31 +513,31 @@
 	
 	for id in args.expIds:
 		if '-' in id:
 			start, end = id.split('-', 1)
 			ids.extend(range(int(start), int(end) + 1))
 		else:
 			ids.append(int(id))
 	
 	expIds, expVars, geoMapping, series, datatype = createMappings(compositeTrack.alphaMetaDb)
 	
 	submission = dict()
 	if len(ids) == 0:
 		submission = expIds
 	else:
 		for expId in ids:
-			submission[expId] = expIds[expId]
+			submission[str(expId)] = expIds[str(expId)]
 	
 	expIdStr = ' '
 	for id in args.expIds:
 		expIdStr = expIdStr + id + ',' 
 	expIdStr = expIdStr[:len(expIdStr) - 1]
 	print 'Generating soft using expIds ' + expIdStr
 	
 	if datatype.soft == HighThroughputSoftFile:
 		softfile, fileList = createHighThroughputSoftFile(compositeTrack, cv, submission, expVars, geoMapping, series, datatype, args.instrument, replace)
 	elif datatype.soft == MicroArraySoftFile:
 		softfile, fileList = createMicroArraySoftFile(compositeTrack, cv, submission, expVars, geoMapping, series, datatype)
 	else:
 		raise KeyError('unsupported type')
 	
 	print 'Creating directory'