b633c35e25be8790e5a61c2d80c367f01b1226df mmaddren Tue Jun 28 14:53:39 2011 -0700 new version of mkGeoPkg diff --git python/ucscgenomics/mkGeoPkg/mkGeoPkg python/ucscgenomics/mkGeoPkg/mkGeoPkg new file mode 100755 index 0000000..6734218 --- /dev/null +++ python/ucscgenomics/mkGeoPkg/mkGeoPkg @@ -0,0 +1,231 @@ +#!/hive/groups/encode/dcc/bin/python +import sys, os +from rafile.RaFile import * +from softfile.SoftFile import * +from cvfile.CvFile import * + +class DataType(object): + + def __init__(self, molecule, strategy, source, selection): + self.molecule = molecule + self.strategy = strategy + self.source = source + self.selection = selection + +datatypes = { + 'Cage': DataType('Total RNA', 'OTHER', 'transcriptomic', 'CAGE'), + 'ChipSeq': DataType('genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP'), + 'DnaPet': DataType('genomic DNA', 'OTHER', 'genomic', 'size fractionation'), + 'DnaseDgf': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase'), + 'DnaseSeq': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase'), + 'FaireSeq': DataType('genomic DNA', 'OTHER', 'genomic', 'other'), + 'MethylSeq': DataType('genomic DNA', 'MRE-Seq', 'genomic', 'Restriction Digest'), + 'MethylRrbs': DataType('genomic DNA', 'Bisulfite-Seq', 'genomic', 'Reduced Representation'), + 'Orchid': DataType('genomic DNA', 'OTHER', 'genomic', 'other'), + 'Proteogenomics': DataType('protein', 'mass spectrometry-based proteogenomic mapping', 'protein', 'chromatographically fractionated peptides'), + 'RnaPet': DataType('total RNA', 'OTHER', 'transcriptomic', 'other'), + 'RnaSeq': DataType('polyA RNA', 'RNA-Seq', 'transcriptomic', 'cDNA'), + + #these need to be curated + '5C': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), + 'AffyExonArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), + 'Bip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), + 'Cluster': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), + 'Cnv': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), + 'Combined': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), + 'Genotype': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), + 'Gencode': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), + 'ChiaPet': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), + 'Mapability': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), + 'MethylArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), + 'NRE': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), + 'Nucleosome': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), + 'RnaChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), + 'RipGeneSt': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), + 'RipTiling': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), + 'RipChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), + 'RipSeq': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), + 'Switchgear': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), + 'TfbsValid': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE') +} + +cellCharacteristics = [ 'organism', 'description', 'karyotype', 'lineage', 'sex' ] + +organisms = { 'hg19': 'human', 'mm9': 'mouse' } + + +def getFileType(filename): + filename.replace('.gz', '') + return filename.rsplit('.')[1] + +def isRawFile(filename): + return (getFileType(filename) == 'fastq' or getFileType(filename) == 'fasta') + +def isSupplimentaryFile(filename): + return not isRawFile(filename) + + +def main(): + + database = sys.argv[1] + composite = sys.argv[2] + organism = organisms[database] + + mdbPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/metaDb/alpha/' + composite + '.ra' #change + cvPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/cv/alpha/cv.ra' #change + trackPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/' + composite + '.ra' + md5path = '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + composite + '/md5sum.txt' + + mdb = RaFile(mdbPath) + cv = CvFile(cvPath) + track = RaFile(trackPath) + + if os.path.isfile(md5path): + md5sums = dict() + md5file = open(md5path, 'r') + for line in md5file: + val, key = map(str.strip, line.split(' ', 1)) + md5sums[key] = val + else: + md5sums = None + + expIds = dict() #mapping for expId : filelist + geomapping = dict() #mapping for expId : geoSampleAccession + series = None + expVars = None + compositeUrl = 'http://genome.ucsc.edu/cgi-bin/hgTrackUi?db=' + database + '&g=' + composite + + for stanza in mdb.itervalues(): + + if 'objType' in stanza and stanza['objType'] == 'composite': + series = stanza + expVars = stanza['expVars'].split(',') + continue + + if 'expId' not in stanza: + KeyError('no expId for ' + stanza.name) + + if stanza['expId'] not in expIds: + expIds[stanza['expId']] = list() + + expIds[stanza['expId']].append(stanza) + + if 'geoSampleAccession' in stanza: + if stanza['expId'] not in geomapping: + geomapping[stanza['expId']] = stanza['geoSampleAccession'] + elif geomapping[stanza['expId']] != 'Inconsistent' and geomapping[stanza['expId']] != stanza['geoSampleAccession']: + geomapping[stanza['expId']] = 'Inconsistent' + # should print warning message, but continue execution + + softfile = SoftFile() + seriesStanza = HighThroughputSeriesStanza() + seriesStanza['^SERIES'] = series['composite'] + seriesStanza['!Series_title'] = track[composite]['shortLabel'] + seriesStanza['!Series_summary'] = '[REPLACE]' + seriesStanza['!Series_overall_design'] = '[REPLACE]' + seriesStanza['!Series_web_link'] = compositeUrl + seriesStanza['!Series_contributor'] = '[REPLACE]' + + #stanza['!Series_variable_1'] = 'var1' #dont use for now, follow up for later + #stanza['!Series_variable_description_1'] = 'desc1' # ^ + #stanza['!Series_variable_sample_list_1'] = 'list1' # ^ + #stanza['!Series_repeats_1'] = 'rep1' #WILL USE BUT DONT KNOW YET + #stanza['!Series_repeats_sample_list_1'] = 'replist1' # ^ + + seriesStanza['!Series_sample_id'] = list() + + for idNum in expIds.iterkeys(): + if idNum in geomapping and geomapping[idNum] != 'Inconsistent': + seriesStanza['!Series_sample_id'].append(geomapping[idNum]) + else: + seriesStanza['!Series_sample_id'].append(expIds[idNum][0]['metaObject']) + + if 'geoAccession' in series: + seriesStanza['!Series_geo_accession'] = series['geoAccession'] + + softfile[series['composite']] = seriesStanza + + for idNum in expIds.iterkeys(): + + expId = expIds[idNum] + firstStanza = expId[0] + sample = HighThroughputSampleStanza() + sample['^SAMPLE'] = firstStanza['metaObject'] + sample['!Sample_type'] = 'SRA' + + concat = expVars[0] + for expVar in expVars[1:len(expVars)]: + concat += '_' + firstStanza[expVar] + sample['!Sample_title'] = concat + + count = 1 + + for stanza in expId: + + if isRawFile(stanza['fileName']): + sample['!Sample_raw_file_' + str(count)] = stanza['fileName'] + sample['!Sample_raw_file_type_' + str(count)] = getFileType(stanza['fileName']) + + if 'checksum' in stanza: + sample['!Sample_raw_file_checksum_' + str(count)] = stanza['checksum'] + elif md5sums != None and stanza['fileName'] in md5sums: + sample['!Sample_raw_file_checksum_' + str(count)] = md5sums[stanza['fileName']] + + count = count + 1 + + count = 1 + + for stanza in expId: + + if isSupplimentaryFile(stanza['fileName']): + sample['!Sample_supplementary_file_' + str(count)] = stanza['fileName'] + + if 'checksum' in stanza: + sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum'] + elif md5sums != None and stanza['fileName'] in md5sums: + sample['!Sample_supplementary_file_checksum_' + str(count)] = md5sums[stanza['fileName']] + + sample['!Sample_supplementary_file_build_' + str(count)] = organism + count = count + 1 + + sample['!Sample_source_name'] = firstStanza['cell'] + sample['!Sample_organism'] = organism + + sample['!Sample_characteristics'] = list() + for expVar in expVars: + sample['!Sample_characteristics'].append(expVar + ': ' + firstStanza[expVar]) + for cellChar in cellCharacteristics: + if cellChar in cv[firstStanza['cell']]: + sample['!Sample_characteristics'].append('cell ' + cellChar + ': ' + cv[firstStanza['cell']][cellChar]) + + sample['!Sample_biomaterial_provider'] = cv[firstStanza['cell']]['vendorName'] + + if 'treatment' in firstStanza: + sample['!Sample_treatment_protocol'] = firstStanza['treatment'] + + if 'protocol' in cv[firstStanza['cell']]: + sample['!Sample_growth_protocol'] = cv[firstStanza['cell']]['protocol'] # NEED TO FIX + + sample['!Sample_molecule'] = datatypes[firstStanza['dataType']].molecule + sample['!Sample_extract_protocol'] = compositeUrl + sample['!Sample_library_strategy'] = datatypes[firstStanza['dataType']].strategy + sample['!Sample_library_source'] = datatypes[firstStanza['dataType']].source + sample['!Sample_library_selection'] = datatypes[firstStanza['dataType']].selection + + if 'seqPlatform' in stanza: + sample['!Sample_instrument_model'] = stanza['seqPlatform'] + else: + sample['!Sample_instrument_model'] = '[REPLACE]' + + sample['!Sample_data_processing'] = compositeUrl + + if idNum in geomapping and geomapping[idNum] != 'Inconsistent': + sample['!Sample_geo_accession'] = geomapping[idNum] + + softfile[firstStanza['metaObject']] = sample + + outfile = open(os.path.dirname(sys.argv[0]) + composite + '.soft', 'w') + outfile.write(str(softfile)) + +if __name__ == '__main__': + main() \ No newline at end of file