0bb1cef7954937ff2c3d0851ee8db34c07dc5008 mmaddren Tue Jul 19 14:42:36 2011 -0700 small bug in soft file, work on mkGeoPkg for microarray submissions (still not functional) and added a readme

diff --git python/programs/mkGeoPkg/trackInfo python/programs/mkGeoPkg/trackInfo
new file mode 100755
index 0000000..2247661
--- /dev/null
+++ python/programs/mkGeoPkg/trackInfo
@@ -0,0 +1,286 @@
+#!/hive/groups/encode/dcc/bin/python
+import sys, os, shutil, stat
+from rafile.RaFile import *
+from softfile.SoftFile import *
+from cvfile.CvFile import *
+
+class DataType(object):
+
+    def __init__(self, molecule, strategy, source, selection, soft):
+        self.molecule = molecule
+        self.strategy = strategy
+        self.source = source
+        self.selection = selection
+        self.soft = soft
+
+datatypes = {
+    'Cage': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'CAGE', HighThroughputSoftFile),
+    'ChipSeq': DataType('genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP', HighThroughputSoftFile),
+    'DnaPet': DataType('genomic DNA', 'OTHER', 'genomic', 'size fractionation', HighThroughputSoftFile),
+    'DnaseDgf': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', HighThroughputSoftFile),
+    'DnaseSeq': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', HighThroughputSoftFile),
+    'FaireSeq': DataType('genomic DNA', 'OTHER', 'genomic', 'other', HighThroughputSoftFile),
+    'MethylSeq': DataType('genomic DNA', 'MRE-Seq', 'genomic', 'Restriction Digest', HighThroughputSoftFile),
+    'MethylRrbs': DataType('genomic DNA', 'Bisulfite-Seq', 'genomic', 'Reduced Representation', HighThroughputSoftFile),
+    'Orchid': DataType('genomic DNA', 'OTHER', 'genomic', 'other', HighThroughputSoftFile),
+    'Proteogenomics': DataType('protein', 'mass spectrometry-based proteogenomic mapping', 'protein', 'chromatographically fractionated peptides', HighThroughputSoftFile),
+    'RnaPet': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'other', HighThroughputSoftFile),
+    'RnaSeq': DataType('OVERRIDE RNA', 'RNA-Seq', 'transcriptomic', 'cDNA', HighThroughputSoftFile),
+
+    # these need to be curated
+    '5C': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'AffyExonArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'Bip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'Cluster': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'Cnv': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'Combined': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'Genotype': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'Gencode': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'ChiaPet': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'Mapability': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'MethylArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'NRE': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'Nucleosome': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'RnaChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'RipGeneSt': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'RipTiling': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'RipChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'RipSeq': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'Switchgear': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'TfbsValid': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None)
+}
+cvDetails = {
+    'cell': [ 'organism', 'description', 'karyotype', 'lineage', 'sex' ],
+    'antibody': [ 'antibodyDescription', 'targetDescription', 'vendorName', 'vendorId' ]
+}
+
+# if the term appears in the mdb, it must override the value in the cv
+cvOverride = [ 'sex' ]
+
+# talk to Venkat lol
+cvPretend = { 'antibody Input': 'control' }
+
+# if it's not in cvDetails, these are the fields we check by default
+cvDefaults = [ 'description' ]
+
+mdbWhitelist = [
+    'age',
+    'bioRep',
+    'control',
+    'controlId',
+    'fragSize',
+    'labExpId',
+    'labVersion',
+    'mapAlgorithm',
+    'obtainedBy',
+    'phase',
+    'readType',
+    'region',
+    'replicate',
+    'restrictionEnzyme',
+    'run',
+    'softwareVersion',
+    'spikeInPool',
+    'strain'
+]
+
+# if the molecule is RNA, we need to map our data into !Sample_molecule, which only takes certain values.
+# First we check rnaExtractMapping; if it's not there, we use the localization. This is because (at present)
+# polyA is the most important trait; otherwise it would be nonPolyA, which GEO doesn't accept.
+rnaExtractMapping = {
+    'shortPolyA': 'polyA RNA',
+    'longPolyA': 'polyA RNA',
+    'polyA': 'polyA RNA'
+}
+
+localizationMapping = {
+    'cytosol': 'cytoplasmic RNA',
+    'polysome': 'cytoplasmic RNA',
+    'membraneFraction': 'cytoplasmic RNA',
+    'mitochondria': 'cytoplasmic RNA',
+    'nucleus': 'nuclear RNA',
+    'nucleolus': 'nuclear RNA',
+    'nucleoplasm': 'nuclear RNA',
+    'nuclearMatrix': 'nuclear RNA',
+    'chromatin': 'nuclear RNA',
+    'cell': 'total RNA'
+}
+
+# map our instrument names to GEO's names
+instrumentModels = {
+    'Illumina_GA2x': 'Illumina Genome Analyzer II'
+}
+
+organisms = {
+    'hg19': 'human',
+    'hg18': 'human',
+    'mm9': 'mouse'
+}
+
+def filesize(val):
+    if val > 1099511627776:
+        return str(round(float(val) / 1099511627776, 2)) + 'TB'
+    if val > 1073741824:
+        return str(round(float(val) / 1073741824, 2)) + 'GB'
+    if val > 1048576:
+        return str(round(float(val) / 1048576, 2)) + 'MB'
+    if val > 1024:
+        return str(round(float(val) / 1024, 2)) + 'KB'
+    else:
+        return str(val) + 'B'
+
+
+
+def getFileType(filename):
+    filename = filename.replace('.gz', '')
+    return filename.rsplit('.', 1)[1]
+
+def isRawFile(filename):
+    return (getFileType(filename) == 'fastq' or getFileType(filename) == 'fasta')
+
+def isSupplimentaryFile(filename):
+    return not isRawFile(filename)
+
+
+def createMappings(mdb):
+    expIds = dict()
+    geoMapping = dict()
+    expVars = None
+    series = None
+    datatype = None
+
+    for stanza in mdb.itervalues():
+
+        if 'objType' in stanza and stanza['objType'] == 'composite':
+            series = stanza
+            expVars = stanza['expVars'].split(',')
+            continue
+
+        if 'expId' not in stanza:
+            print stanza.name + ': no expId'
+            continue
+
+        if 'geoSampleAccession' not in stanza:
+            # if this hasn't been submitted to GEO yet, we'll add it to the submission list
+            if stanza['expId'] not in expIds:
+                expIds[stanza['expId']] = list()
+
+            expIds[stanza['expId']].append(stanza)
+
+        else:
+            # otherwise we keep track of the geo number for partially submitted samples
+            if stanza['expId'] not in geoMapping:
+                geoMapping[stanza['expId']] = stanza['geoSampleAccession']
+            elif geoMapping[stanza['expId']] != 'Inconsistent' and geoMapping[stanza['expId']] != stanza['geoSampleAccession']:
+                geoMapping[stanza['expId']] = 'Inconsistent'
+                print stanza.name + ': inconsistent geo mapping'
+
+        if datatype == None and 'dataType' in stanza:
+            datatype = stanza['dataType']
+        elif datatype != None and 'dataType' in stanza and datatype != stanza['dataType']:
+            raise KeyError(stanza.name + ': inconsistent data type')
+
+
+
+    datatype = datatypes[datatype]
+
+    return expIds, expVars, geoMapping, series, datatype
+
+
+def main():
+    database = sys.argv[1]
+    composite = sys.argv[2]
+    organism = organisms[database]
+
+    # list everything
+    mode = 0
+
+    if len(sys.argv) > 3:
+        submitStart = sys.argv[3]
+        # list individual
+        mode = 1
+
+        if len(sys.argv) > 4:
+            submitSize = int(sys.argv[4])
+            # list range
+            mode = 2
+
+
+    mdbPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/metaDb/alpha/' + composite + '.ra' #CHANGE
+    trackPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/' + composite + '.ra'
+
+    downloadsDirectory = '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + composite + '/'
+
+
+    mdb = RaFile(mdbPath)
+    track = RaFile(trackPath)
+    expIds, expVars, geoMapping, series, datatype = createMappings(mdb)
+
+    submission = dict()
+    sortedIds = expIds.keys()
+    sortedIds.sort()
+
+    if mode == 1:
+        sortedIds = [submitStart]
+    elif mode == 2:
+        sortedIds = sortedIds[sortedIds.index(submitStart):sortedIds.index(submitStart) + submitSize]
+
+
+    minId = min(sortedIds)
+    maxId = max(sortedIds)
+    out = list()
+    totalsize = 0
+    filecount = 0
+
+    # 'Generating soft using expIds ' + minId + ' to ' + maxId
+
+    for idNum in sortedIds:
+
+        samplesize = 0
+        expId = expIds[idNum]
+
+        for stanza in expId:
+
+            if os.path.exists(downloadsDirectory + stanza['fileName']):
+
+                st = os.stat(downloadsDirectory + stanza['fileName'])
+                samplesize = samplesize + st.st_size
+                totalsize = totalsize + st.st_size
+                filecount = filecount + 1
+
+        strsub = '[Unsubmitted]'
+        if idNum in geoMapping and geoMapping[idNum] == 'Inconsistent':
+            strsub = '[Inconsistent]'
+        if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent':
+            strsub = '[' + geoMapping[idNum] + ']'
+
+        out.append(' + ' + expId[0]['metaObject'] + ' (' + str(idNum) + ')' + '[' + filesize(samplesize) + ']' + strsub + ' - ' + str(len(expId)) + ' files')
+
+        for stanza in expId:
+
+            if not os.path.exists(downloadsDirectory + stanza['fileName']):
+                out.append(' | ' + stanza['fileName'] + ' MISSING FILE!')
+            else:
+
+                st = os.stat(downloadsDirectory + stanza['fileName'])
+                out.append(' | ' + stanza['fileName'] + ' [' + filesize(st.st_size) + ']')
+
+
+    strsub = '[Unsubmitted]'
+    if 'geoSeriesAccession' in series:
+        strsub = '[' + series['geoSeriesAccession'] + ']'
+
+    modestr = ''
+    if mode == 1:
+        modestr = ' <' + minId + '>'
+    elif mode == 2:
+        modestr = ' <' + minId + '-' + maxId + '>'
+
+    out.insert(0, composite + ' [' + filesize(totalsize) + ']' + strsub + modestr + ' - ' + str(filecount) + ' files')
+
+    for line in out:
+        print line
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
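
Usage note: main() selects its listing behaviour purely from the argument count, so invocation is presumably "trackInfo <database> <composite> [expId [count]]". The minimal sketch below mirrors that argv handling only; the composite name and expId in the example are hypothetical placeholders, not values taken from the commit.

    import sys

    def pick_mode(argv):
        # Mirrors main()'s argv handling: mode 0 lists everything, mode 1 lists a
        # single expId, mode 2 lists a range of expIds starting at submitStart.
        mode, submitStart, submitSize = 0, None, None
        if len(argv) > 3:
            submitStart = argv[3]
            mode = 1
            if len(argv) > 4:
                submitSize = int(argv[4])
                mode = 2
        return mode, submitStart, submitSize

    if __name__ == '__main__':
        # e.g. ['trackInfo', 'hg19', 'wgEncodeSomeComposite', '123', '5'] -> (2, '123', 5)
        print pick_mode(sys.argv)

In mode 2 the range is sliced from the sorted expId list, so <count> counts experiments, not files.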