0bb1cef7954937ff2c3d0851ee8db34c07dc5008 mmaddren Tue Jul 19 14:42:36 2011 -0700 small bug in soft file, work on mkGeoPkg for micro array submissions (still not functional) and added a readme diff --git python/programs/mkGeoPkg/mkGeoPkg python/programs/mkGeoPkg/mkGeoPkg index d1c6c82..2930f66 100755 --- python/programs/mkGeoPkg/mkGeoPkg +++ python/programs/mkGeoPkg/mkGeoPkg @@ -17,31 +17,31 @@ 'Cage': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'CAGE', HighThroughputSoftFile), 'ChipSeq': DataType('genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP', HighThroughputSoftFile), 'DnaPet': DataType('genomic DNA', 'OTHER', 'genomic', 'size fractionation', HighThroughputSoftFile), 'DnaseDgf': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', HighThroughputSoftFile), 'DnaseSeq': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', HighThroughputSoftFile), 'FaireSeq': DataType('genomic DNA', 'OTHER', 'genomic', 'other', HighThroughputSoftFile), 'MethylSeq': DataType('genomic DNA', 'MRE-Seq', 'genomic', 'Restriction Digest', HighThroughputSoftFile), 'MethylRrbs': DataType('genomic DNA', 'Bisulfite-Seq', 'genomic', 'Reduced Representation', HighThroughputSoftFile), 'Orchid': DataType('genomic DNA', 'OTHER', 'genomic', 'other', HighThroughputSoftFile), 'Proteogenomics': DataType('protein', 'mass spectrometry-based proteogenomic mapping', 'protein', 'chromatographically fractionated peptides', HighThroughputSoftFile), 'RnaPet': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'other', HighThroughputSoftFile), 'RnaSeq': DataType('OVERRIDE RNA', 'RNA-Seq', 'transcriptomic', 'cDNA', HighThroughputSoftFile), #these need to be curated '5C': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'AffyExonArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'AffyExonArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', MicroArraySoftFile), 'Bip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Cluster': 
DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Cnv': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Combined': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Genotype': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Gencode': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'ChiaPet': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Mapability': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'MethylArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'NRE': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Nucleosome': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RnaChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RipGeneSt': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RipTiling': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RipChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), @@ -124,30 +124,31 @@ def isRawFile(filename): return (getFileType(filename) == 'fastq' or getFileType(filename) == 'fasta') def isSupplimentaryFile(filename): return not isRawFile(filename) def readMd5sums(filename): if os.path.isfile(filename): md5sums = dict() md5file = open(filename, 'r') for line in md5file: val, key = map(str.strip, line.split(' ', 1)) md5sums[key] = val + return md5sums else: return None def createMappings(mdb): expIds = dict() geoMapping = dict() expVars = None series = None datatype = None for stanza in mdb.itervalues(): if 'objType' in stanza and stanza['objType'] == 'composite': series = stanza expVars = stanza['expVars'].split(',') @@ -347,54 +348,61 @@ def createMicroArraySoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype, copyDirectory): print 'Creating MicroArray soft file' softfile = SoftFile() fileList = list() createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, 
series) for idNum in expIds.iterkeys(): expId = expIds[idNum] firstStanza = expId[0] print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')' sample = MicroArraySampleStanza() - sample['^SAMPLE'] = firstStanza['metaObject'] + sample['^SAMPLE'] = firstStanza['accession'] - concat = expVars[0] - for expVar in expVars[1:len(expVars)]: - concat += '_' + firstStanza[expVar] - sample['!Sample_title'] = concat + if 'geoSeriesAccession' in series: + sample['!Sample_series_id'] = series['geoSeriesAccession'] + + #concat = expVars[0] + #for expVar in expVars[1:len(expVars)]: + # concat += '_' + firstStanza[expVar] + #sample['!Sample_title'] = concat + + sample['!Sample_geo_accession'] = firstStanza['accession'] count = 1 for stanza in expId: if isSupplimentaryFile(stanza['fileName']): sample['!Sample_supplementary_file_' + str(count)] = stanza['fileName'] - # if 'checksum' in stanza: - # sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum'] - # elif md5sums != None and stanza['fileName'] in md5sums: - # sample['!Sample_supplementary_file_checksum_' + str(count)] = md5sums[stanza['fileName']] + if 'checksum' in stanza: + sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum'] + elif md5sums != None and stanza['fileName'] in md5sums: + sample['!Sample_supplementary_file_checksum_' + str(count)] = md5sums[stanza['fileName']] # sample['!Sample_supplementary_file_build_' + str(count)] = database fileList.append(stanza['fileName']) count = count + 1 + softfile[firstStanza['accession']] = sample + # sample['!Sample_table'] = KeyOptional # CEL file # sample['!Sample_source_name_ch'] = KeyOnePlusNumbered # sample['!Sample_organism_ch'] = KeyOnePlusNumbered # sample['!Sample_characteristics_ch'] = KeyOnePlusNumbered # sample['!Sample_biomaterial_provider_ch'] = KeyZeroPlusNumbered # sample['!Sample_treatment_protocol_ch'] = KeyZeroPlusNumbered # sample['!Sample_growth_protocol_ch'] = KeyZeroPlusNumbered # 
sample['!Sample_molecule_ch'] = KeyOnePlusNumbered # sample['!Sample_extract_protocol_ch'] = KeyOnePlusNumbered # sample['!Sample_label_ch'] = KeyOnePlusNumbered # sample['!Sample_label_protocol_ch'] = KeyOnePlusNumbered # sample['!Sample_hyb_protocol'] = KeyOnePlus # sample['!Sample_scan_protocol'] = KeyOnePlus # sample['!Sample_data_processing'] = KeyOnePlus # sample['!Sample_description'] = KeyZeroPlus @@ -417,85 +425,97 @@ # platform['!Platform_technology'] = KeyRequired # platform['!Platform_organism'] = KeyOnePlus # platform['!Platform_manufacturer'] = KeyRequired # platform['!Platform_manufacture_protocol'] = KeyOnePlus # platform['!Platform_catalog_number'] = KeyZeroPlus # platform['!Platform_web_link'] = KeyZeroPlus # platform['!Platform_support'] = KeyOptional # platform['!Platform_coating'] = KeyOptional # platform['!Platform_description'] = KeyZeroPlus # platform['!Platform_contributor'] = KeyZeroPlus # platform['!Platform_pubmed_id'] = KeyZeroPlus # platform['!Platform_geo_accession'] = KeyOptional # platform['!Platform_table_begin'] = KeyRequired # platform['!Platform_table_end'] = KeyRequired + return softfile, fileList + def main(): database = sys.argv[1] composite = sys.argv[2] + + wholeComposite = 1 + if len(sys.argv) == 5: submitStart = sys.argv[3] submitSize = int(sys.argv[4]) + wholeComposite = 0 + organism = organisms[database] mdbPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/metaDb/alpha/' + composite + '.ra' #CHANGE cvPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/cv/alpha/cv.ra' #CHANGE trackPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/' + composite + '.ra' md5path = '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + composite + '/md5sum.txt' downloadsDirectory = '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + composite + '/' - copyDirectory = '/cluster/home/mmaddren/kent/python/ucscgenomics/mkGeoPkg/' 
+ composite compositeUrl = 'http://genome.ucsc.edu/cgi-bin/hgTrackUi?db=' + database + '&g=' + composite mdb = RaFile(mdbPath) cv = CvFile(cvPath) track = RaFile(trackPath) md5sums = readMd5sums(md5path) + print md5sums expIds, expVars, geoMapping, series, datatype = createMappings(mdb) submission = dict() sortedIds = expIds.keys() sortedIds.sort() + + if wholeComposite == 0: sortedIds = sortedIds[sortedIds.index(submitStart):sortedIds.index(submitStart) + submitSize] + minId = min(sortedIds) maxId = max(sortedIds) print 'Generating soft using expIds ' + minId + ' to ' + maxId for expId in sortedIds: submission[expId] = expIds[expId] if datatype.soft == HighThroughputSoftFile: softfile, fileList = createHighThroughputSoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, submission, expVars, geoMapping, series, datatype, copyDirectory) elif datatype.soft == MicroArraySoftFile: softfile, fileList = createMicroArraySoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, submission, expVars, geoMapping, series, datatype, copyDirectory) else: - raise Error('unsupported type') + raise KeyError('unsupported type') print 'Writing soft file' outfileName = os.path.dirname(sys.argv[0]) + composite + '.soft' outfile = open(outfileName, 'w') outfile.write(str(softfile)) fileString = outfileName for file in fileList: fileString = fileString + ' ' + downloadsDirectory + file fileString.strip() callString = '/opt/aspera/connect/bin/ascp -i ~/encode_geo_key/encode_geo_key.ppk -QTr -l300m ' + fileString + ' asp-geo@upload.ncbi.nlm.nih.gov:ENCODE/' + composite outscript = open(composite + minId + '-' + maxId + '.sh', 'w') outscript.write('#!/bin/sh\n\n') outscript.write('/opt/aspera/connect/bin/ascp -i ~/encode_geo_key/encode_geo_key.ppk -QTr -l300m \\\n') outscript.write(os.path.dirname(sys.argv[0]) + composite + '.soft' + ' \\\n') for file in fileList: + if not os.path.exists(downloadsDirectory + file): + raise 
FileError(downloadsDirectory + file + ' does not exist') outscript.write(downloadsDirectory + file + ' \\\n') outscript.write('asp-geo@upload.ncbi.nlm.nih.gov:ENCODE\n') outscript.close() os.system('chmod +x ' + composite + minId + '-' + maxId + '.sh') print 'Finished!' if __name__ == '__main__': main() \ No newline at end of file