python/programs/mkGeoPkg/mkGeoPkg 0bb1cef7954937ff2c3d0851ee8db34c07dc5008

0bb1cef7954937ff2c3d0851ee8db34c07dc5008
mmaddren
  Tue Jul 19 14:42:36 2011 -0700
small bug in soft file, work on mkGeoPkg for micro array submissions (still not functional) and added a readme
diff --git python/programs/mkGeoPkg/mkGeoPkg python/programs/mkGeoPkg/mkGeoPkg
index d1c6c82..2930f66 100755
--- python/programs/mkGeoPkg/mkGeoPkg
+++ python/programs/mkGeoPkg/mkGeoPkg
@@ -17,31 +17,31 @@
 	'Cage': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'CAGE', HighThroughputSoftFile),
 	'ChipSeq': DataType('genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP', HighThroughputSoftFile),
 	'DnaPet': DataType('genomic DNA', 'OTHER', 'genomic', 'size fractionation', HighThroughputSoftFile),
 	'DnaseDgf': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', HighThroughputSoftFile),
 	'DnaseSeq': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', HighThroughputSoftFile),
 	'FaireSeq': DataType('genomic DNA', 'OTHER', 'genomic', 'other', HighThroughputSoftFile),
 	'MethylSeq': DataType('genomic DNA', 'MRE-Seq', 'genomic', 'Restriction Digest', HighThroughputSoftFile),
 	'MethylRrbs': DataType('genomic DNA', 'Bisulfite-Seq', 'genomic', 'Reduced Representation', HighThroughputSoftFile),
 	'Orchid': DataType('genomic DNA', 'OTHER', 'genomic', 'other', HighThroughputSoftFile),
 	'Proteogenomics': DataType('protein', 'mass spectrometry-based proteogenomic mapping', 'protein', 'chromatographically fractionated peptides', HighThroughputSoftFile),
 	'RnaPet': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'other', HighThroughputSoftFile),
 	'RnaSeq': DataType('OVERRIDE RNA', 'RNA-Seq', 'transcriptomic', 'cDNA', HighThroughputSoftFile),
 	
 	#these need to be curated
 	'5C': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
-	'AffyExonArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+	'AffyExonArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', MicroArraySoftFile),
 	'Bip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Cluster': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Cnv': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Combined': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Genotype': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Gencode': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'ChiaPet': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Mapability': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'MethylArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'NRE': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Nucleosome': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'RnaChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'RipGeneSt': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'RipTiling': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'RipChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
@@ -124,30 +124,31 @@
 	
 def isRawFile(filename):
 	return (getFileType(filename) == 'fastq' or getFileType(filename) == 'fasta')
 	
 def isSupplimentaryFile(filename):
 	return not isRawFile(filename)
 
 
 def readMd5sums(filename):
 	if os.path.isfile(filename):
 		md5sums = dict()
 		md5file = open(filename, 'r')
 		for line in md5file:
 			val, key = map(str.strip, line.split('  ', 1))
 			md5sums[key] = val
+		return md5sums
 	else:
 		return None
 	
 def createMappings(mdb):
 	expIds = dict()
 	geoMapping = dict()
 	expVars = None
 	series = None
 	datatype = None
 	
 	for stanza in mdb.itervalues():
 		
 		if 'objType' in stanza and stanza['objType'] == 'composite':
 			series = stanza
 			expVars = stanza['expVars'].split(',')
@@ -347,54 +348,61 @@
 def createMicroArraySoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype, copyDirectory):
 	
 	print 'Creating MicroArray soft file'
 
 	softfile = SoftFile()
 	fileList = list()
 	
 	createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series)
 	
 	for idNum in expIds.iterkeys():
 		
 		expId = expIds[idNum]
 		firstStanza = expId[0]
 		print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')'
 		sample = MicroArraySampleStanza()
-		sample['^SAMPLE'] = firstStanza['metaObject']
+		sample['^SAMPLE'] = firstStanza['accession']
 
-		concat = expVars[0]
-		for expVar in expVars[1:len(expVars)]:
-			concat += '_' + firstStanza[expVar]
-		sample['!Sample_title'] = concat
+		if 'geoSeriesAccession' in series:
+			sample['!Sample_series_id'] = series['geoSeriesAccession']
+
+		#concat = expVars[0]
+		#for expVar in expVars[1:len(expVars)]:
+		#	concat += '_' + firstStanza[expVar]
+		#sample['!Sample_title'] = concat
+		
+		sample['!Sample_geo_accession'] = firstStanza['accession']
 		
 		count = 1
 			
 		for stanza in expId:
 		
 			if isSupplimentaryFile(stanza['fileName']):
 				sample['!Sample_supplementary_file_' + str(count)] = stanza['fileName']
 				
-				# if 'checksum' in stanza:
-					# sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum']
-				# elif md5sums != None and stanza['fileName'] in md5sums:
-					# sample['!Sample_supplementary_file_checksum_' + str(count)] = md5sums[stanza['fileName']]
+				if 'checksum' in stanza:
+					sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum']
+				elif md5sums != None and stanza['fileName'] in md5sums:
+					sample['!Sample_supplementary_file_checksum_' + str(count)] = md5sums[stanza['fileName']]
 				
 				# sample['!Sample_supplementary_file_build_' + str(count)] = database
 				
 				fileList.append(stanza['fileName'])
 				count = count + 1
 
+		softfile[firstStanza['accession']] = sample
+
 		# sample['!Sample_table'] = KeyOptional # CEL file
 		# sample['!Sample_source_name_ch'] = KeyOnePlusNumbered
 		# sample['!Sample_organism_ch'] = KeyOnePlusNumbered
 		# sample['!Sample_characteristics_ch'] = KeyOnePlusNumbered
 		# sample['!Sample_biomaterial_provider_ch'] = KeyZeroPlusNumbered
 		# sample['!Sample_treatment_protocol_ch'] = KeyZeroPlusNumbered
 		# sample['!Sample_growth_protocol_ch'] = KeyZeroPlusNumbered
 		# sample['!Sample_molecule_ch'] = KeyOnePlusNumbered
 		# sample['!Sample_extract_protocol_ch'] = KeyOnePlusNumbered
 		# sample['!Sample_label_ch'] = KeyOnePlusNumbered
 		# sample['!Sample_label_protocol_ch'] = KeyOnePlusNumbered
 		# sample['!Sample_hyb_protocol'] = KeyOnePlus
 		# sample['!Sample_scan_protocol'] = KeyOnePlus
 		# sample['!Sample_data_processing'] = KeyOnePlus
 		# sample['!Sample_description'] = KeyZeroPlus
@@ -417,85 +425,97 @@
 		# platform['!Platform_technology'] = KeyRequired
 		# platform['!Platform_organism'] = KeyOnePlus
 		# platform['!Platform_manufacturer'] = KeyRequired
 		# platform['!Platform_manufacture_protocol'] = KeyOnePlus
 		# platform['!Platform_catalog_number'] = KeyZeroPlus
 		# platform['!Platform_web_link'] = KeyZeroPlus
 		# platform['!Platform_support'] = KeyOptional
 		# platform['!Platform_coating'] = KeyOptional
 		# platform['!Platform_description'] = KeyZeroPlus
 		# platform['!Platform_contributor'] = KeyZeroPlus
 		# platform['!Platform_pubmed_id'] = KeyZeroPlus
 		# platform['!Platform_geo_accession'] = KeyOptional
 		# platform['!Platform_table_begin'] = KeyRequired
 		# platform['!Platform_table_end'] = KeyRequired
 	
+	return softfile, fileList
+	
 		
 def main():
 	database = sys.argv[1]
 	composite = sys.argv[2]
+	
+	wholeComposite = 1
+	if len(sys.argv) == 5:
 	submitStart = sys.argv[3]
 	submitSize = int(sys.argv[4])
+		wholeComposite = 0
+		
 	organism = organisms[database]
 
 	mdbPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/metaDb/alpha/' + composite + '.ra' #CHANGE
 	cvPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/cv/alpha/cv.ra' #CHANGE
 	trackPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/' + composite + '.ra'
 	md5path = '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + composite + '/md5sum.txt'
 
 	downloadsDirectory = '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + composite + '/'
-	copyDirectory = '/cluster/home/mmaddren/kent/python/ucscgenomics/mkGeoPkg/' + composite
 	
 	compositeUrl = 'http://genome.ucsc.edu/cgi-bin/hgTrackUi?db=' + database + '&g=' + composite
 	
 	mdb = RaFile(mdbPath)
 	cv = CvFile(cvPath)
 	track = RaFile(trackPath)
 	md5sums = readMd5sums(md5path)
+	print md5sums
 	expIds, expVars, geoMapping, series, datatype = createMappings(mdb)
 	
 	submission = dict()
 	sortedIds = expIds.keys()
 	sortedIds.sort()
+	
+	if wholeComposite == 0:
 	sortedIds = sortedIds[sortedIds.index(submitStart):sortedIds.index(submitStart) + submitSize]
+		
 	minId = min(sortedIds)
 	maxId = max(sortedIds)
 	print 'Generating soft using expIds ' + minId + ' to ' + maxId
 	for expId in sortedIds:
 		submission[expId] = expIds[expId]
 
 	if datatype.soft == HighThroughputSoftFile:
 		softfile, fileList = createHighThroughputSoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, submission, expVars, geoMapping, series, datatype, copyDirectory)
 	elif datatype.soft == MicroArraySoftFile:
 		softfile, fileList = createMicroArraySoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, submission, expVars, geoMapping, series, datatype, copyDirectory)
 	else:
-		raise Error('unsupported type')
+		raise KeyError('unsupported type')
 		
 	print 'Writing soft file'
 	outfileName = os.path.dirname(sys.argv[0]) + composite + '.soft'
 	outfile = open(outfileName, 'w')
 	outfile.write(str(softfile))
 	
 	fileString = outfileName
 	for file in fileList:
 		fileString = fileString + ' ' + downloadsDirectory + file
 		
 	fileString.strip()
 	callString = '/opt/aspera/connect/bin/ascp -i ~/encode_geo_key/encode_geo_key.ppk -QTr -l300m ' + fileString + ' asp-geo@upload.ncbi.nlm.nih.gov:ENCODE/' + composite
 	outscript = open(composite + minId + '-' + maxId + '.sh', 'w')
 	outscript.write('#!/bin/sh\n\n')
 	outscript.write('/opt/aspera/connect/bin/ascp -i ~/encode_geo_key/encode_geo_key.ppk -QTr -l300m \\\n')
 	
 	outscript.write(os.path.dirname(sys.argv[0]) + composite + '.soft' + ' \\\n')
 	
 	for file in fileList:
+		if not os.path.exists(downloadsDirectory + file):
+			raise FileError(downloadsDirectory + file + ' does not exist')
 		outscript.write(downloadsDirectory + file + ' \\\n')
 		
 	outscript.write('asp-geo@upload.ncbi.nlm.nih.gov:ENCODE\n')
 	outscript.close()
 
 	os.system('chmod +x ' + composite + minId + '-' + maxId + '.sh')
 		
 	print 'Finished!'
 	
 if __name__ == '__main__':
 	main()
\ No newline at end of file