b2085ccb5a8adac84c1bb71292db726f8747827e mmaddren Mon Jul 11 13:19:44 2011 -0700 restructred directory again, updated mkGeoPkg, fixed a bug in OrderedDict, and removed extra __init__ file diff --git python/ucscgenomics/mkGeoPkg/mkGeoPkg python/ucscgenomics/mkGeoPkg/mkGeoPkg deleted file mode 100755 index 6734218..0000000 --- python/ucscgenomics/mkGeoPkg/mkGeoPkg +++ /dev/null @@ -1,231 +0,0 @@ -#!/hive/groups/encode/dcc/bin/python -import sys, os -from rafile.RaFile import * -from softfile.SoftFile import * -from cvfile.CvFile import * - -class DataType(object): - - def __init__(self, molecule, strategy, source, selection): - self.molecule = molecule - self.strategy = strategy - self.source = source - self.selection = selection - -datatypes = { - 'Cage': DataType('Total RNA', 'OTHER', 'transcriptomic', 'CAGE'), - 'ChipSeq': DataType('genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP'), - 'DnaPet': DataType('genomic DNA', 'OTHER', 'genomic', 'size fractionation'), - 'DnaseDgf': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase'), - 'DnaseSeq': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase'), - 'FaireSeq': DataType('genomic DNA', 'OTHER', 'genomic', 'other'), - 'MethylSeq': DataType('genomic DNA', 'MRE-Seq', 'genomic', 'Restriction Digest'), - 'MethylRrbs': DataType('genomic DNA', 'Bisulfite-Seq', 'genomic', 'Reduced Representation'), - 'Orchid': DataType('genomic DNA', 'OTHER', 'genomic', 'other'), - 'Proteogenomics': DataType('protein', 'mass spectrometry-based proteogenomic mapping', 'protein', 'chromatographically fractionated peptides'), - 'RnaPet': DataType('total RNA', 'OTHER', 'transcriptomic', 'other'), - 'RnaSeq': DataType('polyA RNA', 'RNA-Seq', 'transcriptomic', 'cDNA'), - - #these need to be curated - '5C': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), - 'AffyExonArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), - 'Bip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), - 'Cluster': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), - 'Cnv': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), - 'Combined': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), - 'Genotype': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), - 'Gencode': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), - 'ChiaPet': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), - 'Mapability': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), - 'MethylArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), - 'NRE': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), - 'Nucleosome': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), - 'RnaChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), - 'RipGeneSt': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), - 'RipTiling': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), - 'RipChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), - 'RipSeq': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), - 'Switchgear': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE'), - 'TfbsValid': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE') -} - -cellCharacteristics = [ 'organism', 'description', 'karyotype', 'lineage', 'sex' ] - -organisms = { 'hg19': 'human', 'mm9': 'mouse' } - - -def getFileType(filename): - filename.replace('.gz', '') - return filename.rsplit('.')[1] - -def isRawFile(filename): - return (getFileType(filename) == 'fastq' or getFileType(filename) == 'fasta') - -def isSupplimentaryFile(filename): - return not isRawFile(filename) - - -def main(): - - database = sys.argv[1] - composite = sys.argv[2] - organism = organisms[database] - - mdbPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/metaDb/alpha/' + composite + '.ra' #change - cvPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/cv/alpha/cv.ra' #change - trackPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/' + composite + '.ra' - md5path = '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + composite + '/md5sum.txt' - - mdb = RaFile(mdbPath) - cv = CvFile(cvPath) - track = RaFile(trackPath) - - if os.path.isfile(md5path): - md5sums = dict() - md5file = open(md5path, 'r') - for line in md5file: - val, key = map(str.strip, line.split(' ', 1)) - md5sums[key] = val - else: - md5sums = None - - expIds = dict() #mapping for expId : filelist - geomapping = dict() #mapping for expId : geoSampleAccession - series = None - expVars = None - compositeUrl = 'http://genome.ucsc.edu/cgi-bin/hgTrackUi?db=' + database + '&g=' + composite - - for stanza in mdb.itervalues(): - - if 'objType' in stanza and stanza['objType'] == 'composite': - series = stanza - expVars = stanza['expVars'].split(',') - continue - - if 'expId' not in stanza: - KeyError('no expId for ' + stanza.name) - - if stanza['expId'] not in expIds: - expIds[stanza['expId']] = list() - - expIds[stanza['expId']].append(stanza) - - if 'geoSampleAccession' in stanza: - if stanza['expId'] not in geomapping: - geomapping[stanza['expId']] = stanza['geoSampleAccession'] - elif geomapping[stanza['expId']] != 'Inconsistent' and geomapping[stanza['expId']] != stanza['geoSampleAccession']: - geomapping[stanza['expId']] = 'Inconsistent' - # should print warning message, but continue execution - - softfile = SoftFile() - seriesStanza = HighThroughputSeriesStanza() - seriesStanza['^SERIES'] = series['composite'] - seriesStanza['!Series_title'] = track[composite]['shortLabel'] - seriesStanza['!Series_summary'] = '[REPLACE]' - seriesStanza['!Series_overall_design'] = '[REPLACE]' - seriesStanza['!Series_web_link'] = compositeUrl - seriesStanza['!Series_contributor'] = '[REPLACE]' - - #stanza['!Series_variable_1'] = 'var1' #dont use for now, follow up for later - #stanza['!Series_variable_description_1'] = 'desc1' # ^ - #stanza['!Series_variable_sample_list_1'] = 'list1' # ^ - #stanza['!Series_repeats_1'] = 'rep1' #WILL USE BUT DONT KNOW YET - #stanza['!Series_repeats_sample_list_1'] = 'replist1' # ^ - - seriesStanza['!Series_sample_id'] = list() - - for idNum in expIds.iterkeys(): - if idNum in geomapping and geomapping[idNum] != 'Inconsistent': - seriesStanza['!Series_sample_id'].append(geomapping[idNum]) - else: - seriesStanza['!Series_sample_id'].append(expIds[idNum][0]['metaObject']) - - if 'geoAccession' in series: - seriesStanza['!Series_geo_accession'] = series['geoAccession'] - - softfile[series['composite']] = seriesStanza - - for idNum in expIds.iterkeys(): - - expId = expIds[idNum] - firstStanza = expId[0] - sample = HighThroughputSampleStanza() - sample['^SAMPLE'] = firstStanza['metaObject'] - sample['!Sample_type'] = 'SRA' - - concat = expVars[0] - for expVar in expVars[1:len(expVars)]: - concat += '_' + firstStanza[expVar] - sample['!Sample_title'] = concat - - count = 1 - - for stanza in expId: - - if isRawFile(stanza['fileName']): - sample['!Sample_raw_file_' + str(count)] = stanza['fileName'] - sample['!Sample_raw_file_type_' + str(count)] = getFileType(stanza['fileName']) - - if 'checksum' in stanza: - sample['!Sample_raw_file_checksum_' + str(count)] = stanza['checksum'] - elif md5sums != None and stanza['fileName'] in md5sums: - sample['!Sample_raw_file_checksum_' + str(count)] = md5sums[stanza['fileName']] - - count = count + 1 - - count = 1 - - for stanza in expId: - - if isSupplimentaryFile(stanza['fileName']): - sample['!Sample_supplementary_file_' + str(count)] = stanza['fileName'] - - if 'checksum' in stanza: - sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum'] - elif md5sums != None and stanza['fileName'] in md5sums: - sample['!Sample_supplementary_file_checksum_' + str(count)] = md5sums[stanza['fileName']] - - sample['!Sample_supplementary_file_build_' + str(count)] = organism - count = count + 1 - - sample['!Sample_source_name'] = firstStanza['cell'] - sample['!Sample_organism'] = organism - - sample['!Sample_characteristics'] = list() - for expVar in expVars: - sample['!Sample_characteristics'].append(expVar + ': ' + firstStanza[expVar]) - for cellChar in cellCharacteristics: - if cellChar in cv[firstStanza['cell']]: - sample['!Sample_characteristics'].append('cell ' + cellChar + ': ' + cv[firstStanza['cell']][cellChar]) - - sample['!Sample_biomaterial_provider'] = cv[firstStanza['cell']]['vendorName'] - - if 'treatment' in firstStanza: - sample['!Sample_treatment_protocol'] = firstStanza['treatment'] - - if 'protocol' in cv[firstStanza['cell']]: - sample['!Sample_growth_protocol'] = cv[firstStanza['cell']]['protocol'] # NEED TO FIX - - sample['!Sample_molecule'] = datatypes[firstStanza['dataType']].molecule - sample['!Sample_extract_protocol'] = compositeUrl - sample['!Sample_library_strategy'] = datatypes[firstStanza['dataType']].strategy - sample['!Sample_library_source'] = datatypes[firstStanza['dataType']].source - sample['!Sample_library_selection'] = datatypes[firstStanza['dataType']].selection - - if 'seqPlatform' in stanza: - sample['!Sample_instrument_model'] = stanza['seqPlatform'] - else: - sample['!Sample_instrument_model'] = '[REPLACE]' - - sample['!Sample_data_processing'] = compositeUrl - - if idNum in geomapping and geomapping[idNum] != 'Inconsistent': - sample['!Sample_geo_accession'] = geomapping[idNum] - - softfile[firstStanza['metaObject']] = sample - - outfile = open(os.path.dirname(sys.argv[0]) + composite + '.soft', 'w') - outfile.write(str(softfile)) - -if __name__ == '__main__': - main() \ No newline at end of file