python/programs/mkGeoPkg/mkGeoPkg b59be32f2b6db2037cad497ff2627527199e4b93

b59be32f2b6db2037cad497ff2627527199e4b93
mmaddren
  Tue Jul 12 14:57:09 2011 -0700
added new version of soft file to allow for micro array data
diff --git python/programs/mkGeoPkg/mkGeoPkg python/programs/mkGeoPkg/mkGeoPkg
index 23a3e84..d1c6c82 100755
--- python/programs/mkGeoPkg/mkGeoPkg
+++ python/programs/mkGeoPkg/mkGeoPkg
@@ -99,31 +99,35 @@
 	'membraneFraction': 'cytoplasmic RNA',
 	'mitochondria': 'cytoplasmic RNA',
 	'nucleus': 'nuclear RNA', 
 	'nucleolus': 'nuclear RNA', 
 	'nucleoplasm': 'nuclear RNA', 
 	'nuclearMatrix': 'nuclear RNA', 
 	'chromatin': 'nuclear RNA',
 	'cell': 'total RNA'
 }
 
 # Map our instrument names to GEO's names. The keys are the values found
 # under a stanza's 'seqPlatform' setting; the mapped value is what gets
 # written to '!Sample_instrument_model'. The lookup is a plain dict index,
 # so a platform missing from this table raises KeyError.
 instrumentModels = {
 	'Illumina_GA2x': 'Illumina Genome Analyzer II'
 }
 
-organisms = { 'hg19': 'human', 'hg18': 'human', 'mm9': 'mouse' }
+organisms = {
+	'hg19': 'human',
+	'hg18': 'human',
+	'mm9': 'mouse'
+}
 
 
 def getFileType(filename):
 	"""Return the file-type extension of filename, ignoring gzip compression.
 	A trailing '.gz' is dropped first, then the text after the last '.' is
 	returned without the dot, e.g. 'reads.fastq.gz' -> 'fastq'.
 	Returns '' when the name has no extension.
 	"""
 	# str.replace returns a new string, so the result has to be kept; only a
 	# trailing '.gz' is removed so a '.gz' elsewhere in the name is untouched.
 	if filename.endswith('.gz'):
 		filename = filename[:-len('.gz')]
 	# Split on the last dot only, so names with extra dots
 	# ('sample.rep1.fastq') still yield the real extension.
 	base, dot, extension = filename.rpartition('.')
 	return extension if dot else ''
 	
 def isRawFile(filename):
 	"""Return True when getFileType(filename) is 'fastq' or 'fasta'."""
 	return getFileType(filename) in ('fastq', 'fasta')
 	
 def isSupplimentaryFile(filename):
 	"""Return True for every file that isRawFile() does not accept."""
 	if isRawFile(filename):
 		return False
 	return True
 
 
 def readMd5sums(filename):
 	if os.path.isfile(filename):
@@ -168,79 +172,92 @@
 				geoMapping[stanza['expId']] = 'Inconsistent'
 				print stanza.name + ': inconsistent geo mapping'
 		
 		if datatype == None and 'dataType' in stanza:
 			datatype = stanza['dataType']
 		elif datatype != None and 'dataType' in stanza and datatype != stanza['dataType']:
 			raise KeyError(stanza.name + ': inconsistent data type') 
 
 		
 	
 	datatype = datatypes[datatype]
 	
 	return expIds, expVars, geoMapping, series, datatype
 	
 	
-def createHighThroughputSoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype, copyDirectory):
+def createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series):
 	
-	print 'Writing series ' + series['composite']
+	if 'geoSeriesAccession' in series:
+		print 'Existing series ' + series['composite'] + ' using geoSeriesAccession ' + series['geoSeriesAccession']
+		return
 	
-	fileList = list()
+	print 'Writing series ' + series['composite']
 	
-	softfile = SoftFile()
-	seriesStanza = HighThroughputSeriesStanza()
+	seriesStanza = SeriesStanza()
 	seriesStanza['^SERIES'] = series['composite']
 	seriesStanza['!Series_title'] = track[composite]['longLabel'] #STILL INCORRECT
 	seriesStanza['!Series_summary'] = '[REPLACE]'
 	seriesStanza['!Series_overall_design'] = '[REPLACE]'
 	seriesStanza['!Series_web_link'] = [ compositeUrl, 'http://www.ncbi.nlm.nih.gov/geo/info/ENCODE.html' ]
 	seriesStanza['!Series_contributor'] = '[REPLACE]'
 	seriesStanza['!Series_gp_id'] = '[REPLACE]'
 	
 	#stanza['!Series_variable_1'] = 'var1' #dont use for now, follow up for later
 	#stanza['!Series_variable_description_1'] = 'desc1' # ^
 	#stanza['!Series_variable_sample_list_1'] = 'list1' # ^
 	#stanza['!Series_repeats_1'] = 'rep1' #WILL USE BUT DONT KNOW YET
 	#stanza['!Series_repeats_sample_list_1'] = 'replist1' # ^
 	
 	seriesStanza['!Series_sample_id'] = list()
 	
 	for idNum in expIds.iterkeys():
 		if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent':
 			seriesStanza['!Series_sample_id'].append(geoMapping[idNum])
 		else:
 			seriesStanza['!Series_sample_id'].append(expIds[idNum][0]['metaObject'])
 	
 	if 'geoAccession' in series:
 		seriesStanza['!Series_geo_accession'] = series['geoAccession']
 	
 	softfile[series['composite']] = seriesStanza
 		
+def createHighThroughputSoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype, copyDirectory):
+	
+	print 'Creating HighThroughput soft file'
+
+	softfile = SoftFile()
+	fileList = list()
+	
+	createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series)
+		
 	for idNum in expIds.iterkeys():
 		
 		expId = expIds[idNum]
 		firstStanza = expId[0]
 		print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')'
 		sample = HighThroughputSampleStanza()
 		sample['^SAMPLE'] = firstStanza['metaObject']
 		sample['!Sample_type'] = 'SRA'
 		
 		concat = expVars[0]
 		for expVar in expVars[1:len(expVars)]:
 			concat += '_' + firstStanza[expVar]
 		sample['!Sample_title'] = concat
 		
+		if 'geoSeriesAccession' in series:
+			sample['!Sample_series_id'] = series['geoSeriesAccession']
+			
 		count = 1
 		
 		for stanza in expId:
 		
 			if isRawFile(stanza['fileName']):
 				sample['!Sample_raw_file_' + str(count)] = stanza['fileName']
 				sample['!Sample_raw_file_type_' + str(count)] = getFileType(stanza['fileName'])
 				
 				if 'checksum' in stanza:
 					sample['!Sample_raw_file_checksum_' + str(count)] = stanza['checksum']
 				elif md5sums != None and stanza['fileName'] in md5sums:
 					sample['!Sample_raw_file_checksum_' + str(count)] = md5sums[stanza['fileName']]
 
 				fileList.append(stanza['fileName'])	
 				count = count + 1
@@ -316,31 +333,114 @@
 		for stanza in expId:	
 			if 'seqPlatform' in stanza:
 				sample['!Sample_instrument_model'] = instrumentModels[stanza['seqPlatform']]
 			
 		sample['!Sample_data_processing'] = compositeUrl
 
 		if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent':
 			sample['!Sample_geo_accession'] = geoMapping[idNum]
 		
 		softfile[firstStanza['metaObject']] = sample
 		
 	return softfile, fileList
 		
 		
 def createMicroArraySoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype, copyDirectory):
-	pass
+	
+	print 'Creating MicroArray soft file'
+
+	softfile = SoftFile()
+	fileList = list()
+	
+	createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series)
+	
+	for idNum in expIds.iterkeys():
+		
+		expId = expIds[idNum]
+		firstStanza = expId[0]
+		print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')'
+		sample = MicroArraySampleStanza()
+		sample['^SAMPLE'] = firstStanza['metaObject']
+
+		concat = expVars[0]
+		for expVar in expVars[1:len(expVars)]:
+			concat += '_' + firstStanza[expVar]
+		sample['!Sample_title'] = concat
+		
+		count = 1
+			
+		for stanza in expId:
+		
+			if isSupplimentaryFile(stanza['fileName']):
+				sample['!Sample_supplementary_file_' + str(count)] = stanza['fileName']
+				
+				# if 'checksum' in stanza:
+					# sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum']
+				# elif md5sums != None and stanza['fileName'] in md5sums:
+					# sample['!Sample_supplementary_file_checksum_' + str(count)] = md5sums[stanza['fileName']]
+				
+				# sample['!Sample_supplementary_file_build_' + str(count)] = database
+				
+				fileList.append(stanza['fileName'])
+				count = count + 1
+
+		# sample['!Sample_table'] = KeyOptional # CEL file
+		# sample['!Sample_source_name_ch'] = KeyOnePlusNumbered
+		# sample['!Sample_organism_ch'] = KeyOnePlusNumbered
+		# sample['!Sample_characteristics_ch'] = KeyOnePlusNumbered
+		# sample['!Sample_biomaterial_provider_ch'] = KeyZeroPlusNumbered
+		# sample['!Sample_treatment_protocol_ch'] = KeyZeroPlusNumbered
+		# sample['!Sample_growth_protocol_ch'] = KeyZeroPlusNumbered
+		# sample['!Sample_molecule_ch'] = KeyOnePlusNumbered
+		# sample['!Sample_extract_protocol_ch'] = KeyOnePlusNumbered
+		# sample['!Sample_label_ch'] = KeyOnePlusNumbered
+		# sample['!Sample_label_protocol_ch'] = KeyOnePlusNumbered
+		# sample['!Sample_hyb_protocol'] = KeyOnePlus
+		# sample['!Sample_scan_protocol'] = KeyOnePlus
+		# sample['!Sample_data_processing'] = KeyOnePlus
+		# sample['!Sample_description'] = KeyZeroPlus
+		# sample['!Sample_platform_id'] = KeyRequired
+		# sample['!Sample_geo_accession'] = KeyOptional
+		# sample['!Sample_anchor'] = KeyRequired
+		# sample['!Sample_type'] = KeyRequired
+		# sample['!Sample_tag_count'] = KeyRequired
+		# sample['!Sample_tag_length'] = KeyRequired
+		# sample['!Sample_table_begin'] = KeyRequired
+		# sample['!Sample_table_end'] = KeyRequired
+	
+	# for idk:
+	
+		# platform = PlatformStanza()
+		
+		# platform['^PLATFORM'] = KeyRequired
+		# platform['!Platform_title'] = KeyRequired
+		# platform['!Platform_distribution'] = KeyRequired
+		# platform['!Platform_technology'] = KeyRequired
+		# platform['!Platform_organism'] = KeyOnePlus
+		# platform['!Platform_manufacturer'] = KeyRequired
+		# platform['!Platform_manufacture_protocol'] = KeyOnePlus
+		# platform['!Platform_catalog_number'] = KeyZeroPlus
+		# platform['!Platform_web_link'] = KeyZeroPlus
+		# platform['!Platform_support'] = KeyOptional
+		# platform['!Platform_coating'] = KeyOptional
+		# platform['!Platform_description'] = KeyZeroPlus
+		# platform['!Platform_contributor'] = KeyZeroPlus
+		# platform['!Platform_pubmed_id'] = KeyZeroPlus
+		# platform['!Platform_geo_accession'] = KeyOptional
+		# platform['!Platform_table_begin'] = KeyRequired
+		# platform['!Platform_table_end'] = KeyRequired
+	
 		
 def main():
 	database = sys.argv[1]
 	composite = sys.argv[2]
 	submitStart = sys.argv[3]
 	submitSize = int(sys.argv[4])
 	organism = organisms[database]
 
 	mdbPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/metaDb/alpha/' + composite + '.ra' #CHANGE
 	cvPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/cv/alpha/cv.ra' #CHANGE
 	trackPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/' + composite + '.ra'
 	md5path = '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + composite + '/md5sum.txt'
 
 	downloadsDirectory = '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + composite + '/'
 	copyDirectory = '/cluster/home/mmaddren/kent/python/ucscgenomics/mkGeoPkg/' + composite