python/programs/mkGeoPkg/mkGeoPkg b2085ccb5a8adac84c1bb71292db726f8747827e

b2085ccb5a8adac84c1bb71292db726f8747827e
mmaddren
  Mon Jul 11 13:19:44 2011 -0700
restructred directory again, updated mkGeoPkg, fixed a bug in OrderedDict, and removed extra __init__ file
diff --git python/programs/mkGeoPkg/mkGeoPkg python/programs/mkGeoPkg/mkGeoPkg
new file mode 100755
index 0000000..53a1d40
--- /dev/null
+++ python/programs/mkGeoPkg/mkGeoPkg
@@ -0,0 +1,419 @@
+#!/hive/groups/encode/dcc/bin/python
+import sys, os, shutil, stat
+from rafile.RaFile import *
+from softfile.SoftFile import *
+from cvfile.CvFile import *
+
+class DataType(object):
+
+	def __init__(self, molecule, strategy, source, selection, soft):
+		self.molecule = molecule
+		self.strategy = strategy
+		self.source = source
+		self.selection = selection
+		self.soft = soft
+
+datatypes = {
+	'Cage': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'CAGE', HighThroughputSoftFile),
+	'ChipSeq': DataType('genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP', HighThroughputSoftFile),
+	'DnaPet': DataType('genomic DNA', 'OTHER', 'genomic', 'size fractionation', HighThroughputSoftFile),
+	'DnaseDgf': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', HighThroughputSoftFile),
+	'DnaseSeq': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', HighThroughputSoftFile),
+	'FaireSeq': DataType('genomic DNA', 'OTHER', 'genomic', 'other', HighThroughputSoftFile),
+	'MethylSeq': DataType('genomic DNA', 'MRE-Seq', 'genomic', 'Restriction Digest', HighThroughputSoftFile),
+	'MethylRrbs': DataType('genomic DNA', 'Bisulfite-Seq', 'genomic', 'Reduced Representation', HighThroughputSoftFile),
+	'Orchid': DataType('genomic DNA', 'OTHER', 'genomic', 'other', HighThroughputSoftFile),
+	'Proteogenomics': DataType('protein', 'mass spectrometry-based proteogenomic mapping', 'protein', 'chromatographically fractionated peptides', HighThroughputSoftFile),
+	'RnaPet': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'other', HighThroughputSoftFile),
+	'RnaSeq': DataType('OVERRIDE RNA', 'RNA-Seq', 'transcriptomic', 'cDNA', HighThroughputSoftFile),
+	
+	#these need to be curated
+	'5C': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+	'AffyExonArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+	'Bip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+	'Cluster': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+	'Cnv': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+	'Combined': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+	'Genotype': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+	'Gencode': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+	'ChiaPet': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+	'Mapability': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+	'MethylArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+	'NRE': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+	'Nucleosome': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+	'RnaChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+	'RipGeneSt': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+	'RipTiling': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+	'RipChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+	'RipSeq': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+	'Switchgear': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+	'TfbsValid': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None)
+}
+cvDetails = {
+	'cell':	[ 'organism', 'description', 'karyotype', 'lineage', 'sex' ],
+	'antibody': [ 'antibodyDescription', 'targetDescription', 'vendorName', 'vendorId' ]
+}
+
+#if the term appears in the mdb and must overriding the value in the cv
+cvOverride = [ 'sex' ]
+
+#talk to Venkat lol
+cvPretend = { 'antibody Input': 'control' }
+
+#if its not in cvDetails, which things should we check by default
+cvDefaults = [ 'description' ]
+
+mdbWhitelist = [
+	'age',
+	'bioRep',
+	'control',
+	'controlId',
+	'fragSize',
+	'labExpId',
+	'labVersion',
+	'mapAlgorithm',
+	'obtainedBy',
+	'phase',
+	'readType',
+	'region',
+	'replicate',
+	'restrictionEnzyme',
+	'run',
+	'softwareVersion',
+	'spikeInPool',
+	'strain'
+]
+
+# if the molecule is RNA, we need to map our data into !Sample_molecule, which only takes certain fields
+# first we check rnaExtractMapping. If its not there, we use the localization. This is because (at current)
+# polyA is the most important trait, otherwise its going to be nonPolyA which GEO doesn't accept that. 
+rnaExtractMapping = {
+	'shortPolyA': 'polyA RNA', 
+	'longPolyA': 'polyA RNA', 
+	'polyA': 'polyA RNA'
+}
+
+localizationMapping = {
+	'cytosol': 'cytoplasmic RNA', 
+	'polysome': 'cytoplasmic RNA',
+	'membraneFraction': 'cytoplasmic RNA',
+	'mitochondria': 'cytoplasmic RNA',
+	'nucleus': 'nuclear RNA', 
+	'nucleolus': 'nuclear RNA', 
+	'nucleoplasm': 'nuclear RNA', 
+	'nuclearMatrix': 'nuclear RNA', 
+	'chromatin': 'nuclear RNA',
+	'cell': 'total RNA'
+}
+
+# map our instrument names to GEO's names
+instrumentModels = {
+	'Illumina_GA2x': 'Illumina Genome Analyzer II'
+}
+
+organisms = { 'hg19': 'human', 'hg18': 'human', 'mm9': 'mouse' }
+
+
+def getFileType(filename):
+	filename.replace('.gz', '')
+	return filename.rsplit('.')[1]
+	
+def isRawFile(filename):
+	return (getFileType(filename) == 'fastq' or getFileType(filename) == 'fasta')
+	
+def isSupplimentaryFile(filename):
+	return not isRawFile(filename)
+
+
+def readMd5sums(filename):
+	if os.path.isfile(filename):
+		md5sums = dict()
+		md5file = open(filename, 'r')
+		for line in md5file:
+			val, key = map(str.strip, line.split('  ', 1))
+			md5sums[key] = val
+	else:
+		return None
+	
+def createMappings(mdb):
+	expIds = dict()
+	geoMapping = dict()
+	expVars = None
+	series = None
+	datatype = None
+	
+	for stanza in mdb.itervalues():
+		
+		if 'objType' in stanza and stanza['objType'] == 'composite':
+			series = stanza
+			expVars = stanza['expVars'].split(',')
+			continue
+
+		if 'expId' not in stanza:
+			print stanza.name + ': no expId'
+			continue
+
+		if 'geoSampleAccession' not in stanza:
+			# if this hasn't been submitted to GEO yet, we'll add it to the submission list
+			if stanza['expId'] not in expIds:
+				expIds[stanza['expId']] = list()
+				
+			expIds[stanza['expId']].append(stanza)
+		
+		else:
+			# otherwise we keep track of the geo number for partially submitted samples
+			if stanza['expId'] not in geoMapping:
+				geoMapping[stanza['expId']] = stanza['geoSampleAccession']
+			elif geoMapping[stanza['expId']] != 'Inconsistent' and geoMapping[stanza['expId']] != stanza['geoSampleAccession']:
+				geoMapping[stanza['expId']] = 'Inconsistent'
+				print stanza.name + ': inconsistent geo mapping'
+		
+		if datatype == None and 'dataType' in stanza:
+			datatype = stanza['dataType']
+		elif datatype != None and 'dataType' in stanza and datatype != stanza['dataType']:
+			raise KeyError(stanza.name + ': inconsistent data type') 
+
+		
+	
+	datatype = datatypes[datatype]
+	
+	return expIds, expVars, geoMapping, series, datatype
+	
+	
+def createHighThroughputSoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype, copyDirectory):
+	
+	print 'Writing series ' + series['composite']
+	
+	fileList = list()
+	
+	softfile = SoftFile()
+	seriesStanza = HighThroughputSeriesStanza()
+	seriesStanza['^SERIES'] = series['composite']
+	seriesStanza['!Series_title'] = track[composite]['longLabel'] #STILL INCORRECT
+	seriesStanza['!Series_summary'] = '[REPLACE]'
+	seriesStanza['!Series_overall_design'] = '[REPLACE]'
+	seriesStanza['!Series_web_link'] = [ compositeUrl, 'http://www.ncbi.nlm.nih.gov/geo/info/ENCODE.html' ]
+	seriesStanza['!Series_contributor'] = '[REPLACE]'
+	seriesStanza['!Series_gp_id'] = '[REPLACE]'
+	
+	#stanza['!Series_variable_1'] = 'var1' #dont use for now, follow up for later
+	#stanza['!Series_variable_description_1'] = 'desc1' # ^
+	#stanza['!Series_variable_sample_list_1'] = 'list1' # ^
+	#stanza['!Series_repeats_1'] = 'rep1' #WILL USE BUT DONT KNOW YET
+	#stanza['!Series_repeats_sample_list_1'] = 'replist1' # ^
+	
+	seriesStanza['!Series_sample_id'] = list()
+	
+	for idNum in expIds.iterkeys():
+		if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent':
+			seriesStanza['!Series_sample_id'].append(geoMapping[idNum])
+		else:
+			seriesStanza['!Series_sample_id'].append(expIds[idNum][0]['metaObject'])
+	
+	if 'geoAccession' in series:
+		seriesStanza['!Series_geo_accession'] = series['geoAccession']
+	
+	softfile[series['composite']] = seriesStanza
+		
+	for idNum in expIds.iterkeys():
+		
+		expId = expIds[idNum]
+		firstStanza = expId[0]
+		print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')'
+		sample = HighThroughputSampleStanza()
+		sample['^SAMPLE'] = firstStanza['metaObject']
+		sample['!Sample_type'] = 'SRA'
+		
+		concat = expVars[0]
+		for expVar in expVars[1:len(expVars)]:
+			concat += '_' + firstStanza[expVar]
+		sample['!Sample_title'] = concat
+		
+		count = 1
+		
+		for stanza in expId:
+		
+			if isRawFile(stanza['fileName']):
+				sample['!Sample_raw_file_' + str(count)] = stanza['fileName']
+				sample['!Sample_raw_file_type_' + str(count)] = getFileType(stanza['fileName'])
+				
+				if 'checksum' in stanza:
+					sample['!Sample_raw_file_checksum_' + str(count)] = stanza['checksum']
+				elif md5sums != None and stanza['fileName'] in md5sums:
+					sample['!Sample_raw_file_checksum_' + str(count)] = md5sums[stanza['fileName']]
+
+				fileList.append(stanza['fileName'])	
+				count = count + 1
+			
+		count = 1
+			
+		for stanza in expId:
+		
+			if isSupplimentaryFile(stanza['fileName']):
+				sample['!Sample_supplementary_file_' + str(count)] = stanza['fileName']
+				
+				if 'checksum' in stanza:
+					sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum']
+				elif md5sums != None and stanza['fileName'] in md5sums:
+					sample['!Sample_supplementary_file_checksum_' + str(count)] = md5sums[stanza['fileName']]
+				
+				sample['!Sample_supplementary_file_build_' + str(count)] = database
+				
+				fileList.append(stanza['fileName'])
+				count = count + 1
+			
+		sample['!Sample_source_name'] = firstStanza['cell']
+		sample['!Sample_organism'] = organism
+		
+		sample['!Sample_characteristics'] = list()
+		allVars = expVars + mdbWhitelist
+		
+		for var in allVars:
+			if var in firstStanza:
+				foobar = var
+				sample['!Sample_characteristics'].append(var + ': ' + firstStanza[var])
+				for pretend in cvPretend.iterkeys():
+					if var + ' ' + firstStanza[var] == pretend:
+						foobar = cvPretend[pretend]
+				if foobar in cvDetails:
+					for cvVar in cvDetails[foobar]:
+						if cvVar in cvOverride and cvVar in firstStanza:
+							sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + firstStanza[cvVar])
+						elif cvVar in cv[firstStanza[var]]:
+							sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + cv[firstStanza[var]][cvVar])
+				else:
+					for cvVar in cvDefaults:
+						if firstStanza[var] in cv and cvVar in cv[firstStanza[var]]:
+							sample['!Sample_characteristics'].append(var + ' ' +  cvVar + ': ' + cv[firstStanza[var]][cvVar])
+				
+		sample['!Sample_biomaterial_provider'] = cv[firstStanza['cell']]['vendorName']
+		
+		if 'treatment' in firstStanza:
+			sample['!Sample_treatment_protocol'] = firstStanza['treatment']
+		
+		if 'protocol' in cv[firstStanza['cell']]:
+			for protocol in cv[firstStanza['cell']]['protocol'].split(' '):
+					key, val = protocol.split(':')
+					if key == 'ENCODE' or key == cv[firstStanza['lab']]['labPi']:
+						sample['!Sample_growth_protocol'] = val
+		
+		if datatype.molecule == 'OVERRIDE RNA':
+			if firstStanza['rnaExtract'] in rnaExtractMapping:
+				sample['!Sample_molecule'] = rnaExtractMapping[firstStanza['rnaExtract']]
+			elif firstStanza['localization'] in localizationMapping:
+				sample['!Sample_molecule'] = localizationMapping[firstStanza['localization']]
+				
+		else:
+			sample['!Sample_molecule'] = datatype.molecule
+			
+		sample['!Sample_extract_protocol'] = compositeUrl
+		sample['!Sample_library_strategy'] = datatype.strategy
+		sample['!Sample_library_source'] = datatype.source
+		sample['!Sample_library_selection'] = datatype.selection
+		
+		# set to replace for if nothing has a seqPlatform
+		sample['!Sample_instrument_model'] = '[REPLACE]'
+		for stanza in expId:	
+			if 'seqPlatform' in stanza:
+				sample['!Sample_instrument_model'] = instrumentModels[stanza['seqPlatform']]
+			
+		sample['!Sample_data_processing'] = compositeUrl
+
+		if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent':
+			sample['!Sample_geo_accession'] = geoMapping[idNum]
+		
+		softfile[firstStanza['metaObject']] = sample
+		
+	return softfile, fileList
+		
+		
+def createMicroArraySoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype, copyDirectory):
+	pass
+	
+	
+def copyFiles(fileList, downloadsDirectory, copyDirectory):
+
+	print 'Copying files:'
+	for filename in fileList:
+		print 'Copying file ' + filename + ' (' + count + '/' + len(fileList) + ' ...'
+		shutil.copy2(downloadsDirectory + filename, copyDirectory)
+		
+def main():
+	database = sys.argv[1]
+	composite = sys.argv[2]
+	submitStart = sys.argv[3]
+	submitSize = int(sys.argv[4])
+	organism = organisms[database]
+
+	mdbPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/metaDb/alpha/' + composite + '.ra' #CHANGE
+	cvPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/cv/alpha/cv.ra' #CHANGE
+	trackPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/' + composite + '.ra'
+	md5path = '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + composite + '/md5sum.txt'
+
+	downloadsDirectory = '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + composite + '/'
+	copyDirectory = '/cluster/home/mmaddren/kent/python/ucscgenomics/mkGeoPkg/' + composite
+	
+	compositeUrl = 'http://genome.ucsc.edu/cgi-bin/hgTrackUi?db=' + database + '&g=' + composite
+	
+	mdb = RaFile(mdbPath)
+	cv = CvFile(cvPath)
+	track = RaFile(trackPath)
+	md5sums = readMd5sums(md5path)
+	expIds, expVars, geoMapping, series, datatype = createMappings(mdb)
+	
+	submission = dict()
+	sortedIds = expIds.keys()
+	sortedIds.sort()
+	sortedIds = sortedIds[sortedIds.index(submitStart):sortedIds.index(submitStart) + submitSize]
+	print 'Generating soft using expIds ' + min(sortedIds) + ' to ' + max(sortedIds)
+	for expId in sortedIds:
+		submission[expId] = expIds[expId]
+
+	if datatype.soft == HighThroughputSoftFile:
+		softfile, fileList = createHighThroughputSoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, submission, expVars, geoMapping, series, datatype, copyDirectory)
+	elif datatype.soft == MicroArraySoftFile:
+		softfile, fileList = createMicroArraySoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, submission, expVars, geoMapping, series, datatype, copyDirectory)
+	else:
+		raise Error('unsupported type')
+		
+	print 'Writing soft file'
+	outfileName = os.path.dirname(sys.argv[0]) + composite + '.soft'
+	outfile = open(outfileName, 'w')
+	outfile.write(str(softfile))
+	
+	#print 'Copying files'
+
+	fileString = outfileName
+	for file in fileList:
+		fileString = fileString + ' ' + downloadsDirectory + file
+		
+#callList = [ '/opt/aspera/connect/bin/ascp asp-geo@upload.ncbi.nlm.nih.gov:ENCODE/' + composite, '-i', '~/encode_geo_key/encode_geo_key.ppk', '-QTr', '-l300m' ]
+#callList.extend(fileList)
+#callList.append(')
+    
+	fileString.strip()
+	callString = '/opt/aspera/connect/bin/ascp -i ~/encode_geo_key/encode_geo_key.ppk -QTr -l300m ' + fileString + ' asp-geo@upload.ncbi.nlm.nih.gov:ENCODE/' + composite
+#subprocess.call(callList, shell=True)
+	outscript = open(composite + '_' + submitStart + '-' + str(int(submitStart) + submitSize) + '.sh', 'w')
+	outscript.write('#!/bin/sh\n\n')
+	outscript.write('/opt/aspera/connect/bin/ascp -i ~/encode_geo_key/encode_geo_key.ppk -QTr -l300m \\\n')
+	
+	outscript.write(os.path.dirname(sys.argv[0]) + composite + '.soft' + ' \\\n')
+	
+	for file in fileList:
+		outscript.write(downloadsDirectory + file + ' \\\n')
+		
+	outscript.write('asp-geo@upload.ncbi.nlm.nih.gov:ENCODE\n')
+	outscript.close()
+	#os.fchmod(composite + '_' + submitStart + '-' + str(int(submitStart) + submitSize) + '.sh', stat.S_IXUSR)
+	os.system('chmod +x ' + composite + '_' + submitStart + '-' + str(int(submitStart) + submitSize) + '.sh')
+#	outscript.write(callString)
+	#print callString
+	#os.system(callString)
+	
+	#copyFiles(fileList, downloadsDirectory, copyDirectory)
+		
+	print 'Finished!'
+	
+if __name__ == '__main__':
+	main()
\ No newline at end of file