python/ucscgenomics/mkGeoPkg/mkGeoPkg b633c35e25be8790e5a61c2d80c367f01b1226df

b633c35e25be8790e5a61c2d80c367f01b1226df
mmaddren
  Tue Jun 28 14:53:39 2011 -0700
new version of mkGeoPkg
diff --git python/ucscgenomics/mkGeoPkg/mkGeoPkg python/ucscgenomics/mkGeoPkg/mkGeoPkg
new file mode 100755
index 0000000..6734218
--- /dev/null
+++ python/ucscgenomics/mkGeoPkg/mkGeoPkg
@@ -0,0 +1,231 @@
+#!/hive/groups/encode/dcc/bin/python
+import sys, os
+from rafile.RaFile import *
+from softfile.SoftFile import *
+from cvfile.CvFile import *
+
+class DataType(object):
+
+	def __init__(self, molecule, strategy, source, selection):
+		self.molecule = molecule
+		self.strategy = strategy
+		self.source = source
+		self.selection = selection
+
+datatypes = {
+	'Cage': DataType('Total RNA', 'OTHER', 'transcriptomic', 'CAGE'),
+	'ChipSeq': DataType('genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP'),
+	'DnaPet': DataType('genomic DNA', 'OTHER', 'genomic', 'size fractionation'),
+	'DnaseDgf': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase'),
+	'DnaseSeq': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase'),
+	'FaireSeq': DataType('genomic DNA', 'OTHER', 'genomic', 'other'),
+	'MethylSeq': DataType('genomic DNA', 'MRE-Seq', 'genomic', 'Restriction Digest'),
+	'MethylRrbs': DataType('genomic DNA', 'Bisulfite-Seq', 'genomic', 'Reduced Representation'),
+	'Orchid': DataType('genomic DNA', 'OTHER', 'genomic', 'other'),
+	'Proteogenomics': DataType('protein', 'mass spectrometry-based proteogenomic mapping', 'protein', 'chromatographically fractionated peptides'),
+	'RnaPet': DataType('total RNA', 'OTHER', 'transcriptomic', 'other'),
+	'RnaSeq': DataType('polyA RNA', 'RNA-Seq', 'transcriptomic', 'cDNA'),
+	
+	#these need to be curated
+	'5C': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
+	'AffyExonArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
+	'Bip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
+	'Cluster': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
+	'Cnv': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
+	'Combined': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
+	'Genotype': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
+	'Gencode': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
+	'ChiaPet': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
+	'Mapability': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
+	'MethylArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
+	'NRE': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
+	'Nucleosome': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
+	'RnaChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
+	'RipGeneSt': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
+	'RipTiling': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
+	'RipChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
+	'RipSeq': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
+	'Switchgear': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
+	'TfbsValid': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE')
+}
+	
+cellCharacteristics = [ 'organism', 'description', 'karyotype', 'lineage', 'sex' ]
+	
+organisms = { 'hg19': 'human', 'mm9': 'mouse' }
+
+
+def getFileType(filename):
+	filename.replace('.gz', '')
+	return filename.rsplit('.')[1]
+	
+def isRawFile(filename):
+	return (getFileType(filename) == 'fastq' or getFileType(filename) == 'fasta')
+	
+def isSupplimentaryFile(filename):
+	return not isRawFile(filename)
+
+
+def main():
+
+	database = sys.argv[1]
+	composite = sys.argv[2]
+	organism = organisms[database]
+
+	mdbPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/metaDb/alpha/' + composite + '.ra' #change
+	cvPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/cv/alpha/cv.ra' #change
+	trackPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/' + composite + '.ra'
+	md5path = '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + composite + '/md5sum.txt'
+
+	mdb = RaFile(mdbPath)
+	cv = CvFile(cvPath)
+	track = RaFile(trackPath)
+	
+	if os.path.isfile(md5path):
+		md5sums = dict()
+		md5file = open(md5path, 'r')
+		for line in md5file:
+			val, key = map(str.strip, line.split('  ', 1))
+			md5sums[key] = val
+	else:
+		md5sums = None
+	
+	expIds = dict() #mapping for expId : filelist
+	geomapping = dict() #mapping for expId : geoSampleAccession
+	series = None
+	expVars = None
+	compositeUrl = 'http://genome.ucsc.edu/cgi-bin/hgTrackUi?db=' + database + '&g=' + composite
+
+	for stanza in mdb.itervalues():
+		
+		if 'objType' in stanza and stanza['objType'] == 'composite':
+			series = stanza
+			expVars = stanza['expVars'].split(',')
+			continue
+
+		if 'expId' not in stanza:
+			KeyError('no expId for ' + stanza.name)
+
+		if stanza['expId'] not in expIds:
+			expIds[stanza['expId']] = list()
+			
+		expIds[stanza['expId']].append(stanza)
+
+		if 'geoSampleAccession' in stanza:
+			if stanza['expId'] not in geomapping:
+				geomapping[stanza['expId']] = stanza['geoSampleAccession']
+			elif geomapping[stanza['expId']] != 'Inconsistent' and geomapping[stanza['expId']] != stanza['geoSampleAccession']:
+				geomapping[stanza['expId']] = 'Inconsistent'
+				# should print warning message, but continue execution
+
+	softfile = SoftFile()
+	seriesStanza = HighThroughputSeriesStanza()
+	seriesStanza['^SERIES'] = series['composite']
+	seriesStanza['!Series_title'] = track[composite]['shortLabel']
+	seriesStanza['!Series_summary'] = '[REPLACE]'
+	seriesStanza['!Series_overall_design'] = '[REPLACE]'
+	seriesStanza['!Series_web_link'] = compositeUrl
+	seriesStanza['!Series_contributor'] = '[REPLACE]'
+	
+	#stanza['!Series_variable_1'] = 'var1' #dont use for now, follow up for later
+	#stanza['!Series_variable_description_1'] = 'desc1' # ^
+	#stanza['!Series_variable_sample_list_1'] = 'list1' # ^
+	#stanza['!Series_repeats_1'] = 'rep1' #WILL USE BUT DONT KNOW YET
+	#stanza['!Series_repeats_sample_list_1'] = 'replist1' # ^
+	
+	seriesStanza['!Series_sample_id'] = list()
+	
+	for idNum in expIds.iterkeys():
+		if idNum in geomapping and geomapping[idNum] != 'Inconsistent':
+			seriesStanza['!Series_sample_id'].append(geomapping[idNum])
+		else:
+			seriesStanza['!Series_sample_id'].append(expIds[idNum][0]['metaObject'])
+	
+	if 'geoAccession' in series:
+		seriesStanza['!Series_geo_accession'] = series['geoAccession']
+	
+	softfile[series['composite']] = seriesStanza
+
+	for idNum in expIds.iterkeys():
+	
+		expId = expIds[idNum]
+		firstStanza = expId[0]
+		sample = HighThroughputSampleStanza()
+		sample['^SAMPLE'] = firstStanza['metaObject']
+		sample['!Sample_type'] = 'SRA'
+		
+		concat = expVars[0]
+		for expVar in expVars[1:len(expVars)]:
+			concat += '_' + firstStanza[expVar]
+		sample['!Sample_title'] = concat
+		
+		count = 1
+		
+		for stanza in expId:
+		
+			if isRawFile(stanza['fileName']):
+				sample['!Sample_raw_file_' + str(count)] = stanza['fileName']
+				sample['!Sample_raw_file_type_' + str(count)] = getFileType(stanza['fileName'])
+				
+				if 'checksum' in stanza:
+					sample['!Sample_raw_file_checksum_' + str(count)] = stanza['checksum']
+				elif md5sums != None and stanza['fileName'] in md5sums:
+					sample['!Sample_raw_file_checksum_' + str(count)] = md5sums[stanza['fileName']]
+
+				count = count + 1
+			
+		count = 1
+			
+		for stanza in expId:
+		
+			if isSupplimentaryFile(stanza['fileName']):
+				sample['!Sample_supplementary_file_' + str(count)] = stanza['fileName']
+				
+				if 'checksum' in stanza:
+					sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum']
+				elif md5sums != None and stanza['fileName'] in md5sums:
+					sample['!Sample_supplementary_file_checksum_' + str(count)] = md5sums[stanza['fileName']]
+				
+				sample['!Sample_supplementary_file_build_' + str(count)] = organism
+				count = count + 1
+			
+		sample['!Sample_source_name'] = firstStanza['cell']
+		sample['!Sample_organism'] = organism
+		
+		sample['!Sample_characteristics'] = list()
+		for expVar in expVars:
+			sample['!Sample_characteristics'].append(expVar + ': ' + firstStanza[expVar])
+		for cellChar in cellCharacteristics:
+			if cellChar in cv[firstStanza['cell']]:
+				sample['!Sample_characteristics'].append('cell ' + cellChar + ': ' + cv[firstStanza['cell']][cellChar])
+		
+		sample['!Sample_biomaterial_provider'] = cv[firstStanza['cell']]['vendorName']
+		
+		if 'treatment' in firstStanza:
+			sample['!Sample_treatment_protocol'] = firstStanza['treatment']
+		
+		if 'protocol' in cv[firstStanza['cell']]:
+			sample['!Sample_growth_protocol'] = cv[firstStanza['cell']]['protocol'] # NEED TO FIX
+		
+		sample['!Sample_molecule'] = datatypes[firstStanza['dataType']].molecule
+		sample['!Sample_extract_protocol'] = compositeUrl
+		sample['!Sample_library_strategy'] = datatypes[firstStanza['dataType']].strategy
+		sample['!Sample_library_source'] = datatypes[firstStanza['dataType']].source
+		sample['!Sample_library_selection'] = datatypes[firstStanza['dataType']].selection
+		
+		if 'seqPlatform' in stanza:
+			sample['!Sample_instrument_model'] = stanza['seqPlatform']
+		else:
+			sample['!Sample_instrument_model'] = '[REPLACE]'
+			
+		sample['!Sample_data_processing'] = compositeUrl
+
+		if idNum in geomapping and geomapping[idNum] != 'Inconsistent':
+			sample['!Sample_geo_accession'] = geomapping[idNum]
+		
+		softfile[firstStanza['metaObject']] = sample
+
+	outfile = open(os.path.dirname(sys.argv[0]) + composite + '.soft', 'w')
+	outfile.write(str(softfile))
+	
+if __name__ == '__main__':
+    main()
\ No newline at end of file