f1be2cca63ee3217c5337b55dca6ba0248370cc1
mmaddren
  Mon Oct 31 15:33:45 2011 -0700
minor updates to mkGeoPkg
diff --git python/programs/mkGeoPkg/mkGeoPkg python/programs/mkGeoPkg/mkGeoPkg
index e85c252..22cef27 100755
--- python/programs/mkGeoPkg/mkGeoPkg
+++ python/programs/mkGeoPkg/mkGeoPkg
@@ -1,58 +1,130 @@
 #!/hive/groups/encode/dcc/bin/python
 import sys, os, shutil, stat, argparse, datetime
 from ucscgenomics import track, ra, soft, cv
 
+"""
+mkGeoPkg - create a soft file and upload script for a given track, so that it
+may be submitted to GEO.
+
+To invoke the script, you must pass the database and the composite track name:
+    mkGeoPkg hg19 wgEncodeSomeTrack
+    
+This is typically not enough, however; most tracks are not completely covered
+by their metadata, so it is usually necessary to supply additional information.
+The most commonly needed fields are:
+    !Series_summary - this is taken from the track's html page description. 
+        In most cases it can be copied, one paragraph per line.
+    !Series_overall_design - this is taken from the Methods section on the
+        track's page. As with summary, 1 paragraph per line.
+    !Series_contributor - this is taken from the contributors list on the 
+        track's page. It has a special format: "Last,First" where each person
+        is listed on a separate line.
+    !Sample_instrument_model - this is a bit more difficult, as it is
+        technically supposed to be a per-sample field. Occasionally it appears
+        in the metaDb for newer tracks; if so, it's automatically added in.
+        Otherwise it must either be specified on a whole-series basis or added
+        to the metadata. In many cases we don't actually know all of them.
+        This is okay: GEO will allow us to submit whatever information we do
+        have, and they can take care of the rest. The instrumentModels
+        dictionary below lists the allowable values (the keys); the values
+        map to the "human readable" names used by GEO.
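+        For example, two entries from that dictionary:
+            Illumina_GA2 -> Illumina Genome Analyzer II
+            Illumina_HiSeq_2000 -> Illumina HiSeq 2000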
+        
+You can supply all of the above information in an additional file and specify
+it using the -r (replace) option:
+    mkGeoPkg -r somefile hg19 wgEncodeSomeTrack
+    
+The replace file must be organized with each of the above key-value pairs, as
+in a soft file:
+    !Series_summary = some summary stuff ...
+    !Series_summary = another paragraph of summary ...
+    !Series_overall_design = ...
+    !Series_contributor = Researcher,Some
+    !Series_contributor = Researcher,Another
+    !Sample_instrument_model = Illumina_GA2
+    
+There is a template for this, named replaceTemplate in the directory.
+
+You may need to submit only part of the track; in this case you can specify
+the experiment ID numbers to include, for example:
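+    mkGeoPkg hg19 wgEncodeSomeTrack 1 2 3
+(the trailing numbers are illustrative expId values from the metaDb)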
+
+One problem you may run into while running the script is that the DataType is
+not valid. This means that the datatypes dictionary below isn't filled in for
+that entry. If you can get hold of the information, modify the script and push
+the changes back out.
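+For example, a new entry takes the form (the values here are placeholders,
+not curated metadata):
+    'SomeNewType': DataType('genomic DNA', 'OTHER', 'genomic', 'other', soft.HighThroughputSoftFile),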
+
+You may also get an error when trying to submit MicroArray data. This is to be
+expected: MicroArray submission is currently not functional at all. We have no
+way at present to map our data to the format expected by GEO, so we've punted
+on this issue for the time being.
+
+Once you've successfully run the script, you'll have generated a soft file and
+a script that handles the uploading process. All of this is put into a
+directory named 'database_wgEncodeSomeTrack_year-month-day'. To begin the
+upload, simply cd into the directory and run upload.sh. This starts the aspera
+copy program, which copies the data files, a file list, and the soft file
+itself.
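+For example (the directory name varies with database, track, and run date):
+    cd hg19_wgEncodeSomeTrack_2011-10-31/
+    ./upload.sh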
+
+For each submission, you need to email GEO. Our current contact is:
+    Steve Wilhite, wilhite@ncbi.nlm.nih.gov
+    
+In that email, specify which track you're submitting. GEO only allows us 1TB
+of space dedicated to ENCODE, so you must break down submissions larger than
+1TB and submit only as many as they have room to process at any given time.
+In a few days, GEO will get back to you with a list of accession numbers,
+which need to be put back into our metadata (see encodeAddGeoAccessions).
+
+"""
+
 class DataType(object):
 
 	def __init__(self, molecule, strategy, source, selection, soft):
 		self.molecule = molecule
 		self.strategy = strategy
 		self.source = source
 		self.selection = selection
 		self.soft = soft
 
 datatypes = {
 	'Cage': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'CAGE', soft.HighThroughputSoftFile),
 	'ChipSeq': DataType('genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP', soft.HighThroughputSoftFile),
 	'DnaPet': DataType('genomic DNA', 'OTHER', 'genomic', 'size fractionation', soft.HighThroughputSoftFile),
 	'DnaseDgf': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', soft.HighThroughputSoftFile),
 	'DnaseSeq': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', soft.HighThroughputSoftFile),
 	'FaireSeq': DataType('genomic DNA', 'OTHER', 'genomic', 'other', soft.HighThroughputSoftFile),
 	'MethylSeq': DataType('genomic DNA', 'MRE-Seq', 'genomic', 'Restriction Digest', soft.HighThroughputSoftFile),
 	'MethylRrbs': DataType('genomic DNA', 'Bisulfite-Seq', 'genomic', 'Reduced Representation', soft.HighThroughputSoftFile),
 	'Orchid': DataType('genomic DNA', 'OTHER', 'genomic', 'other', soft.HighThroughputSoftFile),
 	'Proteogenomics': DataType('protein', 'mass spectrometry-based proteogenomic mapping', 'protein', 'chromatographically fractionated peptides', soft.HighThroughputSoftFile),
 	'RnaPet': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'other', soft.HighThroughputSoftFile),
 	'RnaSeq': DataType('OVERRIDE RNA', 'RNA-Seq', 'transcriptomic', 'cDNA', soft.HighThroughputSoftFile),
 	
 	#these need to be curated
 	'5C': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'AffyExonArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', soft.MicroArraySoftFile),
 	'Bip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Cluster': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Cnv': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Combined': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
-	'Genotype': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+	'Genotype': DataType('genomic DNA', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Gencode': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'ChiaPet': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Mapability': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'MethylArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'NRE': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Nucleosome': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'RnaChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
-	'RipGeneSt': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+	'RipGeneSt': DataType('OVERRIDE RNA', 'REPLACE', 'transcriptomic', 'RNA binding protein antibody', soft.HighThroughputSoftFile), #this isn't correct
 	'RipTiling': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'RipChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'RipSeq': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Switchgear': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'TfbsValid': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None)
 }
 
 #compare this to the source in datatype, give GP ids depending on the type
 gpIds = {
 	'human genomic': '63443',
 	'human transcriptomic': '30709',
 	'human protein': '63447',
 	
 	'mouse genomic': '63471',
 	'mouse transcriptomic': '66167',
@@ -108,88 +180,100 @@
 	'polysome': 'cytoplasmic RNA',
 	'membraneFraction': 'cytoplasmic RNA',
 	'mitochondria': 'cytoplasmic RNA',
 	'nucleus': 'nuclear RNA', 
 	'nucleolus': 'nuclear RNA', 
 	'nucleoplasm': 'nuclear RNA', 
 	'nuclearMatrix': 'nuclear RNA', 
 	'chromatin': 'nuclear RNA',
 	'cell': 'total RNA'
 }
 
 # map our instrument names to GEO's names
 instrumentModels = {
 	'Illumina_GA2x': 'Illumina Genome Analyzer II',
 	'Illumina_GA2': 'Illumina Genome Analyzer II',
-	'Illumina_HiSeq_2000': 'Illumina HiSeq 2000'
+	'Illumina_HiSeq_2000': 'Illumina HiSeq 2000',
+	'Illumina_GA1': 'Illumina Genome Analyzer',
+	'Illumina_GA1_or_GA2': 'Illumina Genome Analyzer, Illumina Genome Analyzer II',
+	'SOLiD_Unknown': 'SOLiD',
+	'Unknown': 'Illumina Genome Analyzer'
 }
 
 	
 def isRawFile(file):
 	return (file.extension == 'fastq' or file.extension == 'fasta')
 	
 def isSupplimentaryFile(file):
 	return not isRawFile(file)
 	
 def sampleTitle(stanza, expVars, warn=False):
 	concat = stanza[expVars[0]].replace('-m', '')
 	for expVar in expVars[1:len(expVars)]:
 		if expVar in stanza and stanza[expVar] != 'None':
 			concat += '_' + stanza[expVar]
 		elif warn:
 			print 'warning: %s is None or not in %s' % (expVar, stanza.name)
 	return concat
 	
+# name used for the symlink placed in the upload directory (and referenced in
+# the soft file): the file name prefixed with the database
+def linkName(file, track):
+	return '%s_%s' % (track.database, file.name)
+
 def createMappings(mdb):
 	expIds = dict()
 	geoMapping = dict()
 	expVars = None
 	series = None
 	datatype = None
 	
 	for stanza in mdb.itervalues():
 		
 		if 'objType' in stanza and stanza['objType'] == 'composite':
 			series = stanza
 			expVars = stanza['expVars'].split(',')
 			continue
 
 		if 'expId' not in stanza:
 			print stanza.name + ': no expId'
 			continue
 
-		if 'geoSampleAccession' not in stanza:
+		if 'geoSampleAccession' not in stanza or 1:
+			# NOTE: the 'or 1' forces every stanza into the submission list,
+			# even ones that already have a geoSampleAccession
 			# if this hasn't been submitted to GEO yet, we'll add it to the submission list
 			if stanza['expId'] not in expIds:
 				expIds[stanza['expId']] = list()
 				
 			expIds[stanza['expId']].append(stanza)
 		
 		else:
 			# otherwise we keep track of the geo number for partially submitted samples
 			if stanza['expId'] not in geoMapping:
 				geoMapping[stanza['expId']] = stanza['geoSampleAccession']
 			elif geoMapping[stanza['expId']] != 'Inconsistent' and geoMapping[stanza['expId']] != stanza['geoSampleAccession']:
 				geoMapping[stanza['expId']] = 'Inconsistent'
 				print stanza.name + ': inconsistent geo mapping'
 		
 		if datatype == None and 'dataType' in stanza:
 			datatype = stanza['dataType']
 		elif datatype != None and 'dataType' in stanza and datatype != stanza['dataType']:
 			raise KeyError(stanza.name + ': inconsistent data type') 
 
-	datatype = datatypes[datatype]
+	try:
+		dt = datatype
+		datatype = datatypes[dt]
+		datatype.name = dt
+	except KeyError:
+		raise KeyError(datatype)
 	
 	return expIds, expVars, geoMapping, series, datatype
 	
 	
 def createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace, audit):
 	
 	if 'geoSeriesAccession' in series:
 		print 'Existing series ' + series['composite'] + ' using geoSeriesAccession ' + series['geoSeriesAccession']
 		return
 		
 	print 'Writing series ' + series['composite']
 	
 	seriesStanza = soft.SeriesStanza()
 	seriesStanza['^SERIES'] = series['composite']
 	seriesStanza['!Series_title'] = compositeTrack.trackDb[compositeTrack.name]['longLabel'] #STILL INCORRECT
@@ -261,53 +345,55 @@
 		#figure out if the instrument model is consistent across the entire sample
 		instrumentModel = None
 		for stanza in expId:	
 			if 'seqPlatform' in stanza:
 				if instrumentModel == None:
 					instrumentModel = stanza['seqPlatform']
 				else:
 					if instrumentModel != stanza['seqPlatform']:
 						instrumentModel = None
 						if audit:
 							print 'expId' + str(expId) + ': inconsistent instrument model'
 						break
 		
 		for stanza in expId:
 		
-			file = compositeTrack.files[stanza['fileName']]
+			# fileName may be a comma-separated list of files
+			for fname in stanza['fileName'].split(','):
+				file = compositeTrack.files[fname]
 			
 			if isRawFile(file):
-				sample['!Sample_raw_file_' + str(count)] = file.name
+				sample['!Sample_raw_file_' + str(count)] = linkName(file, compositeTrack)
 				sample['!Sample_raw_file_type_' + str(count)] = file.extension
 				
 				if file.md5sum != None:
 					sample['!Sample_raw_file_checksum_' + str(count)] = file.md5sum
 
 				if instrumentModel == None and 'seqPlatform' in stanza:
 					sample['!Sample_raw_file_instrument_model_' + str(count)] = stanza['seqPlatform']
 					
 				fileList.append(file)	
 				count = count + 1
 			
 		count = 1
 			
 		for stanza in expId:
 		
-			file = compositeTrack.files[stanza['fileName']]
+			for fname in stanza['fileName'].split(','):
+				file = compositeTrack.files[fname]
 		
 			if isSupplimentaryFile(file):
-				sample['!Sample_supplementary_file_' + str(count)] = file.name
+				sample['!Sample_supplementary_file_' + str(count)] = linkName(file, compositeTrack)
 				
 				if file.md5sum != None:
 					sample['!Sample_supplementary_file_checksum_' + str(count)] = file.md5sum
 				
 				sample['!Sample_supplementary_file_build_' + str(count)] = compositeTrack.database
 				
 				if instrumentModel == None and 'seqPlatform' in stanza:
 					sample['!Sample_supplementary_file_instrument_model_' + str(count)] = stanza['seqPlatform']
 				
 				fileList.append(file)
 				count = count + 1
 			
 		sample['!Sample_source_name'] = firstStanza['cell']
 		sample['!Sample_organism'] = compositeTrack.organism
 		
@@ -340,30 +426,33 @@
 		if 'protocol' in cv[firstStanza['cell']]:
 			for protocol in cv[firstStanza['cell']]['protocol'].split(' '):
 					key, val = protocol.split(':')
 					if key == 'ENCODE' or key == cv[firstStanza['lab']]['labPi']:
 						sample['!Sample_growth_protocol'] = val
 		
 		if datatype.molecule == 'OVERRIDE RNA':
 			if firstStanza['rnaExtract'] in rnaExtractMapping:
 				sample['!Sample_molecule'] = rnaExtractMapping[firstStanza['rnaExtract']]
 			elif firstStanza['localization'] in localizationMapping:
 				sample['!Sample_molecule'] = localizationMapping[firstStanza['localization']]
 				
 		else:
 			sample['!Sample_molecule'] = datatype.molecule
 			
-		sample['!Sample_extract_protocol'] = compositeTrack.url
+		if '!Sample_instrument_model' in replace and replace['!Sample_instrument_model'][0] == 'Unknown':
+			sample['!Sample_extract_protocol'] = 'Instrument model unknown ("%s" specified by default). For more information, see %s' % (instrumentModels[replace['!Sample_instrument_model'][0]], compositeTrack.url)
+		else:
+			sample['!Sample_extract_protocol'] = compositeTrack.url
 		sample['!Sample_library_strategy'] = datatype.strategy
 		sample['!Sample_library_source'] = datatype.source
 		sample['!Sample_library_selection'] = datatype.selection
 		
 		# if the instrumentModel is consistent, just use that
 		# otherwise take the first seqPlatform value from metadata
 		# if that still fails, check the replacement file
 		# finally just make it say [REPLACE]
 		if instrumentModel != None:
 			sample['!Sample_instrument_model'] = instrumentModel
 		else:
 			for stanza in expId:	
 				if 'seqPlatform' in stanza:
 					sample['!Sample_instrument_model'] = instrumentModels[stanza['seqPlatform']]
@@ -554,64 +643,63 @@
 	else:
 		for expId in ids:
 			submission[str(expId)] = expIds[str(expId)]
 	
 	expIdStr = ' '
 	for id in args.expIds:
 		expIdStr = expIdStr + id + ',' 
 	expIdStr = expIdStr[:len(expIdStr) - 1]
 	print 'Generating soft using expIds ' + expIdStr
 	
 	if datatype.soft == soft.HighThroughputSoftFile:
 		softfile, fileList = createHighThroughputSoftFile(compositeTrack, controlledVocab, submission, expVars, geoMapping, series, datatype, replace, args.audit)
 	elif datatype.soft == soft.MicroArraySoftFile:
 		softfile, fileList = createMicroArraySoftFile(compositeTrack, controlledVocab, submission, expVars, geoMapping, series, datatype)
 	else:
-		raise KeyError('unsupported type')
+		raise KeyError('unsupported type ' + datatype.name)
 	
 	if not args.audit:
 		print 'Creating directory'
 	
 		d = datetime.datetime.today()
 		datestring = '%4d-%02d-%02d' % (d.year, d.month, d.day)
 		
 		dirname = '%s_%s_%s/' % (compositeTrack.database, compositeTrack.name, datestring)
-		asperadirname = '%s_%s/' % (compositeTrack.database, compositeTrack.name)
 		linkdirname = '%s_%s/' % (compositeTrack.database, compositeTrack.name)
 	
 		os.mkdir(dirname)
 		os.mkdir(dirname + linkdirname)
 	
 		print 'Writing file'
 		
-		outfileName = '%s%s_%s.soft' % (dirname, compositeTrack.database, compositeTrack.name)
+		outfileName = '%s%s_%s.soft' % (dirname + linkdirname, compositeTrack.database, compositeTrack.name)
 		outfile = open(outfileName, 'w')
 		outfile.write(str(softfile))
-		fileslistname = '%sfiles.txt' % dirname
+		fileslistname = '%sfiles.txt' % (dirname + linkdirname)
 		fileslist = open(fileslistname, 'w')
 		scriptname = '%supload.sh' % dirname
 		outscript = open(scriptname, 'w')
 		
 		outscript.write('#!/bin/sh\n\n')
-		outscript.write('/opt/aspera/connect/bin/ascp -i ~/encode_geo_key/encode_geo_key.ppk --symbolic-links=follow -QTdr -l300m %s asp-geo@upload.ncbi.nlm.nih.gov:ENCODE\n' % linkdirname)
+		outscript.write('/cluster/home/mmaddren/.aspera/connect/bin/ascp -i ~/encode_geo_key/encode_geo_key.ppk --symbolic-links=follow -QTdr -l300m %s asp-geo@upload.ncbi.nlm.nih.gov:ENCODE\n' % linkdirname)
 		outscript.close()
 		
 	for file in fileList:
 		if not os.path.exists(file.path):
 			print IOError(file.path + ' does not exist')
 		elif not args.audit:
-			linkname = '%s_%s' % (compositeTrack.database, file.name)
+			linkname = linkName(file, compositeTrack)
 			linkpath = linkdirname + linkname
 			os.symlink(file.fullname, dirname + linkpath)
 		
 			#outscript.write(linkpath + ' \\\n')
 			fileslist.write(linkname + '\n')
 	
 	if not args.audit:
 		#outscript.write()
 		
 		fileslist.close()
 
 		os.system('chmod +x ' + scriptname)
 		
 		print 'Finished!'