python/programs/mkGeoPkg/trackInfo d01a44fe987e619d7f14fd81a5d9beed28e433dc

d01a44fe987e619d7f14fd81a5d9beed28e433dc
mmaddren
  Tue Jul 19 16:18:13 2011 -0700
fixed a minor bug in trackInfo
diff --git python/programs/mkGeoPkg/trackInfo python/programs/mkGeoPkg/trackInfo
index 2247661..2946d18 100755
--- python/programs/mkGeoPkg/trackInfo
+++ python/programs/mkGeoPkg/trackInfo
@@ -1,286 +1,285 @@
 #!/hive/groups/encode/dcc/bin/python
 import sys, os, shutil, stat
 from rafile.RaFile import *
 from softfile.SoftFile import *
 from cvfile.CvFile import *
 
 class DataType(object):
 	"""Plain record describing how one data type is presented in a submission.
 
 	Attributes (stored exactly as passed in):
 		molecule, strategy, source, selection -- descriptive strings
 			(presumably the matching GEO sample/library fields; confirm
 			against the code that writes the soft file)
 		soft -- the soft-file class to use for this data type, or None when
 			the data type has not been curated yet
 	"""
 
 	def __init__(self, molecule, strategy, source, selection, soft):
 		# nothing is validated or transformed; the values are only stored
 		self.molecule, self.strategy = molecule, strategy
 		self.source, self.selection = source, selection
 		self.soft = soft
 
 # Lookup table from a dataType name (the 'dataType' value that createMappings
 # reads out of the mdb stanzas) to the DataType record describing it.
 # NOTE(review): the 'OVERRIDE RNA' molecule looks like a marker meaning the real
 # molecule is picked per sample through rnaExtractMapping / localizationMapping;
 # confirm against the code that writes the soft file.
 datatypes = {
 	'Cage': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'CAGE', HighThroughputSoftFile),
 	'ChipSeq': DataType('genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP', HighThroughputSoftFile),
 	'DnaPet': DataType('genomic DNA', 'OTHER', 'genomic', 'size fractionation', HighThroughputSoftFile),
 	'DnaseDgf': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', HighThroughputSoftFile),
 	'DnaseSeq': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', HighThroughputSoftFile),
 	'FaireSeq': DataType('genomic DNA', 'OTHER', 'genomic', 'other', HighThroughputSoftFile),
 	'MethylSeq': DataType('genomic DNA', 'MRE-Seq', 'genomic', 'Restriction Digest', HighThroughputSoftFile),
 	'MethylRrbs': DataType('genomic DNA', 'Bisulfite-Seq', 'genomic', 'Reduced Representation', HighThroughputSoftFile),
 	'Orchid': DataType('genomic DNA', 'OTHER', 'genomic', 'other', HighThroughputSoftFile),
 	'Proteogenomics': DataType('protein', 'mass spectrometry-based proteogenomic mapping', 'protein', 'chromatographically fractionated peptides', HighThroughputSoftFile),
 	'RnaPet': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'other', HighThroughputSoftFile),
 	'RnaSeq': DataType('OVERRIDE RNA', 'RNA-Seq', 'transcriptomic', 'cDNA', HighThroughputSoftFile),
 	
 	# these still need to be curated: every field is a 'REPLACE' placeholder
 	# and no soft-file class has been assigned yet
 	'5C': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'AffyExonArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Bip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Cluster': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Cnv': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Combined': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Genotype': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Gencode': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'ChiaPet': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Mapability': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'MethylArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'NRE': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Nucleosome': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'RnaChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'RipGeneSt': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'RipTiling': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'RipChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'RipSeq': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'Switchgear': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
 	'TfbsValid': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None)
 }
 # for each cv term type listed here, the cv fields that should be checked
 # NOTE(review): nothing in the visible code reads these cv* tables; confirm how they are consumed
 cvDetails = {
 	'cell':	[ 'organism', 'description', 'karyotype', 'lineage', 'sex' ],
 	'antibody': [ 'antibodyDescription', 'targetDescription', 'vendorName', 'vendorId' ]
 }
 
 # terms whose value in the mdb, when present, must override the value in the cv
 cvOverride = [ 'sex' ]
 
 # special case: 'antibody Input' is treated as 'control'
 # NOTE(review): the rationale is not recorded here (original note: "talk to Venkat")
 cvPretend = { 'antibody Input': 'control' }
 
 # if a term type is not in cvDetails, these are the fields checked by default
 cvDefaults = [ 'description' ]
 
 # whitelist of mdb variable names
 # NOTE(review): not referenced anywhere in the visible code; presumably the mdb
 # fields that are allowed through into the submission - confirm before relying on it
 mdbWhitelist = [
 	'age',
 	'bioRep',
 	'control',
 	'controlId',
 	'fragSize',
 	'labExpId',
 	'labVersion',
 	'mapAlgorithm',
 	'obtainedBy',
 	'phase',
 	'readType',
 	'region',
 	'replicate',
 	'restrictionEnzyme',
 	'run',
 	'softwareVersion',
 	'spikeInPool',
 	'strain'
 ]
 
 # If the molecule is RNA, our data has to be mapped into !Sample_molecule, which only
 # accepts certain values. rnaExtractMapping is checked first; if the RNA extract is not
 # listed there, the localization is used instead. This is because (at present) polyA is
 # the most important trait; anything else would be nonPolyA, which GEO does not accept.
 rnaExtractMapping = {
 	'shortPolyA': 'polyA RNA', 
 	'longPolyA': 'polyA RNA', 
 	'polyA': 'polyA RNA'
 }
 
 # fallback for RNA samples: maps a localization term to the molecule value to report
 localizationMapping = {
 	'cytosol': 'cytoplasmic RNA', 
 	'polysome': 'cytoplasmic RNA',
 	'membraneFraction': 'cytoplasmic RNA',
 	'mitochondria': 'cytoplasmic RNA',
 	'nucleus': 'nuclear RNA', 
 	'nucleolus': 'nuclear RNA', 
 	'nucleoplasm': 'nuclear RNA', 
 	'nuclearMatrix': 'nuclear RNA', 
 	'chromatin': 'nuclear RNA',
 	'cell': 'total RNA'
 }
 
 # map our instrument names to GEO's names
 instrumentModels = {
 	'Illumina_GA2x': 'Illumina Genome Analyzer II'
 }
 
 # map an assembly name (the database argument of main) to the organism
 # directory name used when main builds the trackDb paths
 organisms = {
 	'hg19': 'human',
 	'hg18': 'human',
 	'mm9': 'mouse'
 }
 
 def filesize(val):
 	"""Format a byte count as a short human-readable string.
 
 	val is a number of bytes. The largest binary unit (KB, MB, GB, TB) that
 	val reaches is used and the value is rounded to two decimals; anything
 	below 1024 is returned as a plain byte count, e.g. '512B'.
 	"""
 	units = (
 		(1 << 40, 'TB'),
 		(1 << 30, 'GB'),
 		(1 << 20, 'MB'),
 		(1 << 10, 'KB'),
 	)
 	for factor, suffix in units:
 		# >= so that an exact multiple (e.g. 1024) reads '1.0KB' rather than '1024B'
 		if val >= factor:
 			return str(round(float(val) / factor, 2)) + suffix
 	return str(val) + 'B'
 
 
 
 def getFileType(filename):
 	"""Return the extension of filename, ignoring a trailing '.gz'.
 
 	'x.fastq.gz' and 'x.fastq' both give 'fastq'. A name with no extension
 	gives '' instead of raising.
 	"""
 	# str methods return new strings, so the stripped name has to be kept
 	if filename.endswith('.gz'):
 		filename = filename[:-len('.gz')]
 	# only the text after the LAST dot is the type ('a.rep1.bam' -> 'bam')
 	base, dot, extension = filename.rpartition('.')
 	if not dot:
 		return ''
 	return extension
 	
 def isRawFile(filename):
 	"""Return True when the file type of filename is 'fastq' or 'fasta'."""
 	return getFileType(filename) in ('fastq', 'fasta')
 	
 def isSupplimentaryFile(filename):
 	"""Return True for every file that isRawFile does not classify as raw."""
 	if isRawFile(filename):
 		return False
 	return True
 
 	
 def createMappings(mdb):
 	"""Group the stanzas of a metaDb ra file by experiment.
 
 	mdb is iterated with itervalues(); each stanza is used like a dict and
 	also exposes a .name attribute.
 
 	Returns a tuple (expIds, expVars, geoMapping, series, datatype):
 		expIds     -- dict from expId to the list of stanzas carrying it
 		expVars    -- the composite stanza's 'expVars' split on ',', or None
 		geoMapping -- dict from expId to its 'geoSampleAccession', or to
 		              'Inconsistent' when its stanzas disagree
 		series     -- the stanza whose objType is 'composite', or None
 		datatype   -- the datatypes entry for the stanzas' shared 'dataType'
 
 	Raises KeyError when stanzas disagree on 'dataType', or when the data
 	type is not a key of datatypes (including when no stanza declares one).
 	"""
 	expIds = dict()
 	geoMapping = dict()
 	expVars = None
 	series = None
 	datatype = None
 	
 	for stanza in mdb.itervalues():
 		
 		# the composite stanza describes the whole series rather than one file
 		if 'objType' in stanza and stanza['objType'] == 'composite':
 			series = stanza
 			expVars = stanza['expVars'].split(',')
 			continue
 		
 		if 'expId' not in stanza:
 			print(stanza.name + ': no expId')
 			continue
 		
 		# every stanza with an expId is listed, submitted or not. This has to
 		# sit at loop level: nested under the guard above it would follow the
 		# 'continue' and never run, leaving expIds empty.
 		expId = stanza['expId']
 		expIds.setdefault(expId, list()).append(stanza)
 		
 		if 'geoSampleAccession' in stanza:
 			# keep track of the geo number of (partially) submitted samples
 			accession = stanza['geoSampleAccession']
 			if expId not in geoMapping:
 				geoMapping[expId] = accession
 			elif geoMapping[expId] != 'Inconsistent' and geoMapping[expId] != accession:
 				geoMapping[expId] = 'Inconsistent'
 				print(stanza.name + ': inconsistent geo mapping')
 		
 		# all stanzas of one composite must share a single data type
 		if 'dataType' in stanza:
 			if datatype is None:
 				datatype = stanza['dataType']
 			elif datatype != stanza['dataType']:
 				raise KeyError(stanza.name + ': inconsistent data type')
 	
 	datatype = datatypes[datatype]
 	
 	return expIds, expVars, geoMapping, series, datatype
 
 		
 def main():
 	"""Print a per-experiment report of a composite's download files.
 
 	Command line: trackInfo database composite [startExpId [count]]
 		no startExpId        -- list every experiment (mode 0)
 		startExpId only      -- list that single experiment (mode 1)
 		startExpId and count -- list count experiments, in sorted expId
 		                        order, starting at startExpId (mode 2)
 
 	For each experiment the report shows its total size, its GEO sample
 	accession (or [Unsubmitted] / [Inconsistent]) and every file with its
 	size, flagging files missing from the downloads directory. The first
 	line printed summarises the whole composite.
 	"""
 	if len(sys.argv) < 3:
 		sys.stderr.write('usage: trackInfo database composite [startExpId [count]]\n')
 		sys.exit(1)
 	
 	database = sys.argv[1]
 	composite = sys.argv[2]
 	organism = organisms[database]
 	
 	#list everything
 	mode = 0
 	
 	if len(sys.argv) > 3:
 		submitStart = sys.argv[3]
 		#list individual
 		mode = 1
 	
 	if len(sys.argv) > 4:
 		submitSize = int(sys.argv[4])
 		#list range
 		mode = 2
 	
 	mdbPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/metaDb/alpha/' + composite + '.ra' #CHANGE
 	trackPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/' + composite + '.ra'
 	
 	downloadsDirectory = '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + composite + '/'
 	
 	mdb = RaFile(mdbPath)
 	# NOTE(review): track is parsed but never read below; kept in case loading it
 	# is relied on to validate the trackDb file - confirm and remove if not
 	track = RaFile(trackPath)
 	expIds, expVars, geoMapping, series, datatype = createMappings(mdb)
 	
 	if mode != 0 and submitStart not in expIds:
 		sys.stderr.write(composite + ': no experiment with expId ' + submitStart + '\n')
 		sys.exit(1)
 	
 	sortedIds = sorted(expIds.keys())
 	
 	if mode == 1:
 		sortedIds = [submitStart]
 	elif mode == 2:
 		start = sortedIds.index(submitStart)
 		sortedIds = sortedIds[start:start + submitSize]
 	
 	out = list()
 	totalsize = 0
 	filecount = 0
 	
 	for idNum in sortedIds:
 		
 		expId = expIds[idNum]
 		
 		# stat every file exactly once; None marks a file that is not on disk
 		sizes = list()
 		for stanza in expId:
 			try:
 				sizes.append(os.stat(downloadsDirectory + stanza['fileName']).st_size)
 			except OSError:
 				sizes.append(None)
 		
 		present = [size for size in sizes if size is not None]
 		samplesize = sum(present)
 		totalsize = totalsize + samplesize
 		filecount = filecount + len(present)
 		
 		strsub = '[Unsubmitted]'
 		if idNum in geoMapping:
 			if geoMapping[idNum] == 'Inconsistent':
 				strsub = '[Inconsistent]'
 			else:
 				strsub = '[' + geoMapping[idNum] + ']'
 		
 		out.append('  + ' + expId[0]['metaObject'] + ' (' + str(idNum) + ')' + '[' + filesize(samplesize) + ']' + strsub + ' - ' + str(len(expId)) + ' files')
 		
 		for stanza, size in zip(expId, sizes):
 			if size is None:
 				out.append('  |     ' + stanza['fileName'] + ' MISSING FILE!')
 			else:
 				out.append('  |     ' + stanza['fileName'] + ' [' + filesize(size) + ']')
 	
 	# series is None when the mdb has no composite stanza
 	strsub = '[Unsubmitted]'
 	if series is not None and 'geoSeriesAccession' in series:
 		strsub = '[' + series['geoSeriesAccession'] + ']'
 	
 	# min/max are only taken when there is something to show, so an empty
 	# selection no longer raises ValueError
 	modestr = ''
 	if mode == 1:
 		modestr = ' <' + min(sortedIds) + '>'
 	elif mode == 2 and sortedIds:
 		modestr = ' <' + min(sortedIds) + '-' + max(sortedIds) + '>'
 	
 	# the summary line goes first even though its totals are only known now
 	out.insert(0, composite + ' [' + filesize(totalsize) + ']' + strsub + modestr + ' - ' + str(filecount) + ' files')
 	
 	for line in out:
 		print(line)
 		
 	
 # produce the report only when this file is run as a script, not when it is imported
 if __name__ == '__main__':
 	main()
\ No newline at end of file