python/ucscgenomics/mkGeoPkg/mkGeoPkg b2085ccb5a8adac84c1bb71292db726f8747827e

b2085ccb5a8adac84c1bb71292db726f8747827e
mmaddren
  Mon Jul 11 13:19:44 2011 -0700
restructred directory again, updated mkGeoPkg, fixed a bug in OrderedDict, and removed extra __init__ file
diff --git python/ucscgenomics/mkGeoPkg/mkGeoPkg python/ucscgenomics/mkGeoPkg/mkGeoPkg
deleted file mode 100755
index 6734218..0000000
--- python/ucscgenomics/mkGeoPkg/mkGeoPkg
+++ /dev/null
@@ -1,231 +0,0 @@
-#!/hive/groups/encode/dcc/bin/python
-import sys, os
-from rafile.RaFile import *
-from softfile.SoftFile import *
-from cvfile.CvFile import *
-
-class DataType(object):
-
-	def __init__(self, molecule, strategy, source, selection):
-		self.molecule = molecule
-		self.strategy = strategy
-		self.source = source
-		self.selection = selection
-
-datatypes = {
-	'Cage': DataType('Total RNA', 'OTHER', 'transcriptomic', 'CAGE'),
-	'ChipSeq': DataType('genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP'),
-	'DnaPet': DataType('genomic DNA', 'OTHER', 'genomic', 'size fractionation'),
-	'DnaseDgf': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase'),
-	'DnaseSeq': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase'),
-	'FaireSeq': DataType('genomic DNA', 'OTHER', 'genomic', 'other'),
-	'MethylSeq': DataType('genomic DNA', 'MRE-Seq', 'genomic', 'Restriction Digest'),
-	'MethylRrbs': DataType('genomic DNA', 'Bisulfite-Seq', 'genomic', 'Reduced Representation'),
-	'Orchid': DataType('genomic DNA', 'OTHER', 'genomic', 'other'),
-	'Proteogenomics': DataType('protein', 'mass spectrometry-based proteogenomic mapping', 'protein', 'chromatographically fractionated peptides'),
-	'RnaPet': DataType('total RNA', 'OTHER', 'transcriptomic', 'other'),
-	'RnaSeq': DataType('polyA RNA', 'RNA-Seq', 'transcriptomic', 'cDNA'),
-	
-	#these need to be curated
-	'5C': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
-	'AffyExonArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
-	'Bip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
-	'Cluster': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
-	'Cnv': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
-	'Combined': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
-	'Genotype': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
-	'Gencode': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
-	'ChiaPet': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
-	'Mapability': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
-	'MethylArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
-	'NRE': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
-	'Nucleosome': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
-	'RnaChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
-	'RipGeneSt': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
-	'RipTiling': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
-	'RipChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
-	'RipSeq': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
-	'Switchgear': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'),
-	'TfbsValid': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE')
-}
-	
-cellCharacteristics = [ 'organism', 'description', 'karyotype', 'lineage', 'sex' ]
-	
-organisms = { 'hg19': 'human', 'mm9': 'mouse' }
-
-
-def getFileType(filename):
-	filename.replace('.gz', '')
-	return filename.rsplit('.')[1]
-	
-def isRawFile(filename):
-	return (getFileType(filename) == 'fastq' or getFileType(filename) == 'fasta')
-	
-def isSupplimentaryFile(filename):
-	return not isRawFile(filename)
-
-
-def main():
-
-	database = sys.argv[1]
-	composite = sys.argv[2]
-	organism = organisms[database]
-
-	mdbPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/metaDb/alpha/' + composite + '.ra' #change
-	cvPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/cv/alpha/cv.ra' #change
-	trackPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/' + composite + '.ra'
-	md5path = '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + composite + '/md5sum.txt'
-
-	mdb = RaFile(mdbPath)
-	cv = CvFile(cvPath)
-	track = RaFile(trackPath)
-	
-	if os.path.isfile(md5path):
-		md5sums = dict()
-		md5file = open(md5path, 'r')
-		for line in md5file:
-			val, key = map(str.strip, line.split('  ', 1))
-			md5sums[key] = val
-	else:
-		md5sums = None
-	
-	expIds = dict() #mapping for expId : filelist
-	geomapping = dict() #mapping for expId : geoSampleAccession
-	series = None
-	expVars = None
-	compositeUrl = 'http://genome.ucsc.edu/cgi-bin/hgTrackUi?db=' + database + '&g=' + composite
-
-	for stanza in mdb.itervalues():
-		
-		if 'objType' in stanza and stanza['objType'] == 'composite':
-			series = stanza
-			expVars = stanza['expVars'].split(',')
-			continue
-
-		if 'expId' not in stanza:
-			KeyError('no expId for ' + stanza.name)
-
-		if stanza['expId'] not in expIds:
-			expIds[stanza['expId']] = list()
-			
-		expIds[stanza['expId']].append(stanza)
-
-		if 'geoSampleAccession' in stanza:
-			if stanza['expId'] not in geomapping:
-				geomapping[stanza['expId']] = stanza['geoSampleAccession']
-			elif geomapping[stanza['expId']] != 'Inconsistent' and geomapping[stanza['expId']] != stanza['geoSampleAccession']:
-				geomapping[stanza['expId']] = 'Inconsistent'
-				# should print warning message, but continue execution
-
-	softfile = SoftFile()
-	seriesStanza = HighThroughputSeriesStanza()
-	seriesStanza['^SERIES'] = series['composite']
-	seriesStanza['!Series_title'] = track[composite]['shortLabel']
-	seriesStanza['!Series_summary'] = '[REPLACE]'
-	seriesStanza['!Series_overall_design'] = '[REPLACE]'
-	seriesStanza['!Series_web_link'] = compositeUrl
-	seriesStanza['!Series_contributor'] = '[REPLACE]'
-	
-	#stanza['!Series_variable_1'] = 'var1' #dont use for now, follow up for later
-	#stanza['!Series_variable_description_1'] = 'desc1' # ^
-	#stanza['!Series_variable_sample_list_1'] = 'list1' # ^
-	#stanza['!Series_repeats_1'] = 'rep1' #WILL USE BUT DONT KNOW YET
-	#stanza['!Series_repeats_sample_list_1'] = 'replist1' # ^
-	
-	seriesStanza['!Series_sample_id'] = list()
-	
-	for idNum in expIds.iterkeys():
-		if idNum in geomapping and geomapping[idNum] != 'Inconsistent':
-			seriesStanza['!Series_sample_id'].append(geomapping[idNum])
-		else:
-			seriesStanza['!Series_sample_id'].append(expIds[idNum][0]['metaObject'])
-	
-	if 'geoAccession' in series:
-		seriesStanza['!Series_geo_accession'] = series['geoAccession']
-	
-	softfile[series['composite']] = seriesStanza
-
-	for idNum in expIds.iterkeys():
-	
-		expId = expIds[idNum]
-		firstStanza = expId[0]
-		sample = HighThroughputSampleStanza()
-		sample['^SAMPLE'] = firstStanza['metaObject']
-		sample['!Sample_type'] = 'SRA'
-		
-		concat = expVars[0]
-		for expVar in expVars[1:len(expVars)]:
-			concat += '_' + firstStanza[expVar]
-		sample['!Sample_title'] = concat
-		
-		count = 1
-		
-		for stanza in expId:
-		
-			if isRawFile(stanza['fileName']):
-				sample['!Sample_raw_file_' + str(count)] = stanza['fileName']
-				sample['!Sample_raw_file_type_' + str(count)] = getFileType(stanza['fileName'])
-				
-				if 'checksum' in stanza:
-					sample['!Sample_raw_file_checksum_' + str(count)] = stanza['checksum']
-				elif md5sums != None and stanza['fileName'] in md5sums:
-					sample['!Sample_raw_file_checksum_' + str(count)] = md5sums[stanza['fileName']]
-
-				count = count + 1
-			
-		count = 1
-			
-		for stanza in expId:
-		
-			if isSupplimentaryFile(stanza['fileName']):
-				sample['!Sample_supplementary_file_' + str(count)] = stanza['fileName']
-				
-				if 'checksum' in stanza:
-					sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum']
-				elif md5sums != None and stanza['fileName'] in md5sums:
-					sample['!Sample_supplementary_file_checksum_' + str(count)] = md5sums[stanza['fileName']]
-				
-				sample['!Sample_supplementary_file_build_' + str(count)] = organism
-				count = count + 1
-			
-		sample['!Sample_source_name'] = firstStanza['cell']
-		sample['!Sample_organism'] = organism
-		
-		sample['!Sample_characteristics'] = list()
-		for expVar in expVars:
-			sample['!Sample_characteristics'].append(expVar + ': ' + firstStanza[expVar])
-		for cellChar in cellCharacteristics:
-			if cellChar in cv[firstStanza['cell']]:
-				sample['!Sample_characteristics'].append('cell ' + cellChar + ': ' + cv[firstStanza['cell']][cellChar])
-		
-		sample['!Sample_biomaterial_provider'] = cv[firstStanza['cell']]['vendorName']
-		
-		if 'treatment' in firstStanza:
-			sample['!Sample_treatment_protocol'] = firstStanza['treatment']
-		
-		if 'protocol' in cv[firstStanza['cell']]:
-			sample['!Sample_growth_protocol'] = cv[firstStanza['cell']]['protocol'] # NEED TO FIX
-		
-		sample['!Sample_molecule'] = datatypes[firstStanza['dataType']].molecule
-		sample['!Sample_extract_protocol'] = compositeUrl
-		sample['!Sample_library_strategy'] = datatypes[firstStanza['dataType']].strategy
-		sample['!Sample_library_source'] = datatypes[firstStanza['dataType']].source
-		sample['!Sample_library_selection'] = datatypes[firstStanza['dataType']].selection
-		
-		if 'seqPlatform' in stanza:
-			sample['!Sample_instrument_model'] = stanza['seqPlatform']
-		else:
-			sample['!Sample_instrument_model'] = '[REPLACE]'
-			
-		sample['!Sample_data_processing'] = compositeUrl
-
-		if idNum in geomapping and geomapping[idNum] != 'Inconsistent':
-			sample['!Sample_geo_accession'] = geomapping[idNum]
-		
-		softfile[firstStanza['metaObject']] = sample
-
-	outfile = open(os.path.dirname(sys.argv[0]) + composite + '.soft', 'w')
-	outfile.write(str(softfile))
-	
-if __name__ == '__main__':
-    main()
\ No newline at end of file