b59be32f2b6db2037cad497ff2627527199e4b93 mmaddren Tue Jul 12 14:57:09 2011 -0700 added new version of soft file to allow for micro array data diff --git python/programs/mkGeoPkg/mkGeoPkg python/programs/mkGeoPkg/mkGeoPkg index 23a3e84..d1c6c82 100755 --- python/programs/mkGeoPkg/mkGeoPkg +++ python/programs/mkGeoPkg/mkGeoPkg @@ -99,31 +99,35 @@ 'membraneFraction': 'cytoplasmic RNA', 'mitochondria': 'cytoplasmic RNA', 'nucleus': 'nuclear RNA', 'nucleolus': 'nuclear RNA', 'nucleoplasm': 'nuclear RNA', 'nuclearMatrix': 'nuclear RNA', 'chromatin': 'nuclear RNA', 'cell': 'total RNA' } # map our instrument names to GEO's names instrumentModels = { 'Illumina_GA2x': 'Illumina Genome Analyzer II' } -organisms = { 'hg19': 'human', 'hg18': 'human', 'mm9': 'mouse' } +organisms = { + 'hg19': 'human', + 'hg18': 'human', + 'mm9': 'mouse' +} def getFileType(filename): filename.replace('.gz', '') return filename.rsplit('.')[1] def isRawFile(filename): return (getFileType(filename) == 'fastq' or getFileType(filename) == 'fasta') def isSupplimentaryFile(filename): return not isRawFile(filename) def readMd5sums(filename): if os.path.isfile(filename): @@ -168,79 +172,92 @@ geoMapping[stanza['expId']] = 'Inconsistent' print stanza.name + ': inconsistent geo mapping' if datatype == None and 'dataType' in stanza: datatype = stanza['dataType'] elif datatype != None and 'dataType' in stanza and datatype != stanza['dataType']: raise KeyError(stanza.name + ': inconsistent data type') datatype = datatypes[datatype] return expIds, expVars, geoMapping, series, datatype -def createHighThroughputSoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype, copyDirectory): +def createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series): - print 'Writing series ' + series['composite'] + if 'geoSeriesAccession' in series: + print 'Existing series ' + series['composite'] + ' using geoSeriesAccession ' + series['geoSeriesAccession'] + return - fileList = list() + print 'Writing series ' + series['composite'] - softfile = SoftFile() - seriesStanza = HighThroughputSeriesStanza() + seriesStanza = SeriesStanza() seriesStanza['^SERIES'] = series['composite'] seriesStanza['!Series_title'] = track[composite]['longLabel'] #STILL INCORRECT seriesStanza['!Series_summary'] = '[REPLACE]' seriesStanza['!Series_overall_design'] = '[REPLACE]' seriesStanza['!Series_web_link'] = [ compositeUrl, 'http://www.ncbi.nlm.nih.gov/geo/info/ENCODE.html' ] seriesStanza['!Series_contributor'] = '[REPLACE]' seriesStanza['!Series_gp_id'] = '[REPLACE]' #stanza['!Series_variable_1'] = 'var1' #dont use for now, follow up for later #stanza['!Series_variable_description_1'] = 'desc1' # ^ #stanza['!Series_variable_sample_list_1'] = 'list1' # ^ #stanza['!Series_repeats_1'] = 'rep1' #WILL USE BUT DONT KNOW YET #stanza['!Series_repeats_sample_list_1'] = 'replist1' # ^ seriesStanza['!Series_sample_id'] = list() for idNum in expIds.iterkeys(): if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent': seriesStanza['!Series_sample_id'].append(geoMapping[idNum]) else: seriesStanza['!Series_sample_id'].append(expIds[idNum][0]['metaObject']) if 'geoAccession' in series: seriesStanza['!Series_geo_accession'] = series['geoAccession'] softfile[series['composite']] = seriesStanza +def createHighThroughputSoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype, copyDirectory): + + print 'Creating HighThroughput soft file' + + softfile = SoftFile() + fileList = list() + + createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series) + for idNum in expIds.iterkeys(): expId = expIds[idNum] firstStanza = expId[0] print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')' sample = HighThroughputSampleStanza() sample['^SAMPLE'] = firstStanza['metaObject'] sample['!Sample_type'] = 'SRA' concat = expVars[0] for expVar in expVars[1:len(expVars)]: concat += '_' + firstStanza[expVar] sample['!Sample_title'] = concat + if 'geoSeriesAccession' in series: + sample['!Sample_series_id'] = series['geoSeriesAccession'] + count = 1 for stanza in expId: if isRawFile(stanza['fileName']): sample['!Sample_raw_file_' + str(count)] = stanza['fileName'] sample['!Sample_raw_file_type_' + str(count)] = getFileType(stanza['fileName']) if 'checksum' in stanza: sample['!Sample_raw_file_checksum_' + str(count)] = stanza['checksum'] elif md5sums != None and stanza['fileName'] in md5sums: sample['!Sample_raw_file_checksum_' + str(count)] = md5sums[stanza['fileName']] fileList.append(stanza['fileName']) count = count + 1 @@ -316,31 +333,114 @@ for stanza in expId: if 'seqPlatform' in stanza: sample['!Sample_instrument_model'] = instrumentModels[stanza['seqPlatform']] sample['!Sample_data_processing'] = compositeUrl if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent': sample['!Sample_geo_accession'] = geoMapping[idNum] softfile[firstStanza['metaObject']] = sample return softfile, fileList def createMicroArraySoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype, copyDirectory): - pass + + print 'Creating MicroArray soft file' + + softfile = SoftFile() + fileList = list() + + createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series) + + for idNum in expIds.iterkeys(): + + expId = expIds[idNum] + firstStanza = expId[0] + print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')' + sample = MicroArraySampleStanza() + sample['^SAMPLE'] = firstStanza['metaObject'] + + concat = expVars[0] + for expVar in expVars[1:len(expVars)]: + concat += '_' + firstStanza[expVar] + sample['!Sample_title'] = concat + + count = 1 + + for stanza in expId: + + if isSupplimentaryFile(stanza['fileName']): + sample['!Sample_supplementary_file_' + str(count)] = stanza['fileName'] + + # if 'checksum' in stanza: + # sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum'] + # elif md5sums != None and stanza['fileName'] in md5sums: + # sample['!Sample_supplementary_file_checksum_' + str(count)] = md5sums[stanza['fileName']] + + # sample['!Sample_supplementary_file_build_' + str(count)] = database + + fileList.append(stanza['fileName']) + count = count + 1 + + # sample['!Sample_table'] = KeyOptional # CEL file + # sample['!Sample_source_name_ch'] = KeyOnePlusNumbered + # sample['!Sample_organism_ch'] = KeyOnePlusNumbered + # sample['!Sample_characteristics_ch'] = KeyOnePlusNumbered + # sample['!Sample_biomaterial_provider_ch'] = KeyZeroPlusNumbered + # sample['!Sample_treatment_protocol_ch'] = KeyZeroPlusNumbered + # sample['!Sample_growth_protocol_ch'] = KeyZeroPlusNumbered + # sample['!Sample_molecule_ch'] = KeyOnePlusNumbered + # sample['!Sample_extract_protocol_ch'] = KeyOnePlusNumbered + # sample['!Sample_label_ch'] = KeyOnePlusNumbered + # sample['!Sample_label_protocol_ch'] = KeyOnePlusNumbered + # sample['!Sample_hyb_protocol'] = KeyOnePlus + # sample['!Sample_scan_protocol'] = KeyOnePlus + # sample['!Sample_data_processing'] = KeyOnePlus + # sample['!Sample_description'] = KeyZeroPlus + # sample['!Sample_platform_id'] = KeyRequired + # sample['!Sample_geo_accession'] = KeyOptional + # sample['!Sample_anchor'] = KeyRequired + # sample['!Sample_type'] = KeyRequired + # sample['!Sample_tag_count'] = KeyRequired + # sample['!Sample_tag_length'] = KeyRequired + # sample['!Sample_table_begin'] = KeyRequired + # sample['!Sample_table_end'] = KeyRequired + + # for idk: + + # platform = PlatformStanza() + + # platform['^PLATFORM'] = KeyRequired + # platform['!Platform_title'] = KeyRequired + # platform['!Platform_distribution'] = KeyRequired + # platform['!Platform_technology'] = KeyRequired + # platform['!Platform_organism'] = KeyOnePlus + # platform['!Platform_manufacturer'] = KeyRequired + # platform['!Platform_manufacture_protocol'] = KeyOnePlus + # platform['!Platform_catalog_number'] = KeyZeroPlus + # platform['!Platform_web_link'] = KeyZeroPlus + # platform['!Platform_support'] = KeyOptional + # platform['!Platform_coating'] = KeyOptional + # platform['!Platform_description'] = KeyZeroPlus + # platform['!Platform_contributor'] = KeyZeroPlus + # platform['!Platform_pubmed_id'] = KeyZeroPlus + # platform['!Platform_geo_accession'] = KeyOptional + # platform['!Platform_table_begin'] = KeyRequired + # platform['!Platform_table_end'] = KeyRequired + def main(): database = sys.argv[1] composite = sys.argv[2] submitStart = sys.argv[3] submitSize = int(sys.argv[4]) organism = organisms[database] mdbPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/metaDb/alpha/' + composite + '.ra' #CHANGE cvPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/cv/alpha/cv.ra' #CHANGE trackPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/' + composite + '.ra' md5path = '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + composite + '/md5sum.txt' downloadsDirectory = '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + composite + '/' copyDirectory = '/cluster/home/mmaddren/kent/python/ucscgenomics/mkGeoPkg/' + composite