# Origin: commit aa37b776679006c611ac73f01ab340e29429b71f
# Author: mmaddren, Tue Jun 14 17:53:53 2011 -0700
# Message: "Working soft file. mkGeoPkg is 1 step above pseudocode"
# Path: python/ucscgenomics/softfile/SoftFile.py (new file)
"""
Reads and writes "soft" files: text files made of stanzas, where each stanza
starts with a '^SAMPLE', '^SERIES' or '^PLATFORM' header line followed by
'key = value' lines.
"""

import sys
import re
from OrderedDict import *

# basestring only exists on Python 2; fall back to str elsewhere so that
# SoftStanza.__str__ can tell a single value from a list of values.
try:
    _STRING_TYPES = basestring
except NameError:
    _STRING_TYPES = str


class SoftFile(OrderedDict):

    """
    Stores a soft file as a set of entries, one for each stanza in the file,
    keyed by the stanza name (the value on the stanza's '^...' header line).
    """

    def __init__(self, filePath=''):
        """
        Creates an empty SoftFile, and reads filePath into it when one is
        given.
        """
        OrderedDict.__init__(self)
        if filePath != '':
            self.read(filePath)

    def read(self, filePath):
        """
        Reads a SoftFile stanza by stanza, and internalizes it.

        A new stanza starts at every line beginning with '^'. Blank lines
        are skipped, and an empty file simply yields no entries.

        Raises KeyError if two stanzas share a name, if a stanza header is
        of an unknown type, or if a stanza contains an invalid key.
        """

        stanza = list()

        handle = open(filePath, 'r')
        try:
            for line in handle:

                line = line.strip()

                # blank lines carry no key and would be rejected as invalid
                if not line:
                    continue

                # a header line closes the stanza collected so far
                if line.startswith('^') and stanza:
                    self.__storeStanza(stanza)
                    stanza = list()

                stanza.append(line)
        finally:
            # always release the handle, even when a stanza fails to parse
            handle.close()

        # the last stanza has no following header line to close it
        if stanza:
            self.__storeStanza(stanza)

    def __storeStanza(self, stanza):
        """
        Parses one stanza (a list of lines) and stores it under its name.
        """
        name, entry = self.readStanza(stanza)
        if entry is not None:
            if name in self:
                raise KeyError('Duplicate Key ' + name)
            self[name] = entry

    def readStanza(self, stanza):
        """
        Builds the stanza object matching the header line stanza[0] and
        populates it from the stanza's lines.

        Returns a (name, entry) pair. Raises KeyError when the header is not
        a '^SAMPLE', '^SERIES' or '^PLATFORM' line.
        """

        header = stanza[0]

        if header.startswith('^SAMPLE'):
            entry = SampleStanza()
        elif header.startswith('^SERIES'):
            entry = SeriesStanza()
        elif header.startswith('^PLATFORM'):
            entry = PlatformStanza()
        else:
            raise KeyError('Unknown stanza type: ' + header)

        name = entry.readStanza(stanza)
        return name, entry

    def iter(self):
        """
        Yields the stanza names in the order kept by the OrderedDict base.
        """
        for item in self._OrderedDict__ordering:
            yield item

    def iterkeys(self):
        """
        Yields the stanza names in the order kept by the OrderedDict base.
        """
        for item in self._OrderedDict__ordering:
            yield item

    def itervalues(self):
        """
        Yields the stanza objects in the order kept by the OrderedDict base.
        """
        for item in self._OrderedDict__ordering:
            yield self[item]

    def iteritems(self):
        """
        Yields (name, stanza) pairs in the order kept by the OrderedDict
        base.
        """
        for item in self._OrderedDict__ordering:
            yield item, self[item]

    def __str__(self):
        """
        Returns the text of every stanza, concatenated in order.
        """
        return ''.join([str(self[item]) for item in self.iterkeys()])


# Marker classes used as the values of a stanza's key table. They are only
# ever compared against, never instantiated, in this file.

class KeyRequired(object):
    """Marks a key that is stored as a single value; at most one allowed."""
    pass


class KeyOptional(object):
    """Marks a key that is stored as a single value; at most one allowed."""
    pass


class KeyZeroPlus(object):
    """Marks a key whose values are collected into a list."""
    pass


class KeyOnePlus(object):
    """Marks a key whose values are collected into a list."""
    pass


class KeyZeroPlusNumbered(object):
    """Marks a key that may appear with a '_<suffix>' appended to its name."""
    pass


class KeyOnePlusNumbered(object):
    """Marks a key that may appear with a '_<suffix>' appended to its name."""
    pass


class SoftStanza(OrderedDict):
    """
    Holds an individual stanza of a SoftFile as key -> value, where value is
    a string for single and numbered keys and a list of strings for keys
    that may repeat.
    """

    def __init__(self, keys):
        """
        keys maps every key allowed in this stanza to one of the Key* marker
        classes.
        """
        self._name = ''
        # NOTE: this attribute shadows the mapping's own keys() method on
        # instances; it is kept because it is part of the existing interface.
        self.keys = keys
        OrderedDict.__init__(self)

    @property
    def name(self):
        """The stanza name read from its header line ('' until read)."""
        return self._name

    def readStanza(self, stanza):
        """
        Populates this entry from a single stanza (a list of lines) and
        returns the stanza's name.
        """

        for line in stanza:
            self.__readLine(line)

        return self.__readName(stanza[0])

    def __readName(self, line):
        """
        Extracts the stanza's name from the value of the first line of the
        stanza. Raises ValueError if that line has no '=' in it.
        """

        parts = line.split('=', 1)
        if len(parts) != 2:
            raise ValueError('stanza header has no name: ' + line)

        self._name = parts[1].strip()
        return self._name

    def __readLine(self, line):
        """
        Reads a single line from the stanza, extracting the key-value pair
        and storing it according to the key's type in self.keys.

        Raises KeyError if the key is not allowed in this stanza, or if a
        single-valued (required/optional) key appears more than once.
        """
        parts = line.split('=', 1)
        key = parts[0].strip()
        val = ''
        if len(parts) == 2:
            val = parts[1].strip()

        # split on the last underscore to determine if we're using a
        # numbered key or not
        splitkey = key.rsplit('_', 1)[0]
        numbered = (KeyZeroPlusNumbered, KeyOnePlusNumbered)
        if splitkey in self.keys and self.keys[splitkey] in numbered:
            # numbered keys are stored under their full (suffixed) name
            self[key] = val
            return

        if key not in self.keys:
            raise KeyError('invalid key: ' + key)

        # single value (ie 0 or 1 allowed entries): a repeat is an error
        # rather than a silent overwrite of the earlier value
        if self.keys[key] in (KeyRequired, KeyOptional):
            if key in self:
                raise KeyError('too many of key: ' + key)
            self[key] = val
            return

        # everything else may repeat, so collect the values in a list
        if key not in self:
            self[key] = list()
        self[key].append(val)

    def iter(self):
        """
        Yields the keys in the order kept by the OrderedDict base.
        """
        for item in self.iterkeys():
            yield item

    def iterkeys(self):
        """
        Yields the keys in the order kept by the OrderedDict base.
        """
        for item in self._OrderedDict__ordering:
            yield item

    def itervalues(self):
        """
        Yields the values in the order kept by the OrderedDict base.
        """
        for item in self._OrderedDict__ordering:
            yield self[item]

    def iteritems(self):
        """
        Yields (key, value) pairs in the order kept by the OrderedDict base.
        """
        for item in self._OrderedDict__ordering:
            yield item, self[item]

    def __str__(self):
        """
        Returns the stanza as 'key = value' lines; a key holding a list
        produces one line per value.
        """
        lines = list()
        for key in self.iterkeys():
            value = self[key]
            if isinstance(value, _STRING_TYPES):
                lines.append(key + ' = ' + value + '\n')
            else:
                for item in value:
                    lines.append(key + ' = ' + item + '\n')

        return ''.join(lines)

    def write(self, filename):
        """
        Writes the text form of this stanza to filename, replacing any
        existing content.
        """
        # TODO: check for absence of required vars
        handle = open(filename, 'w')
        try:
            handle.write(self.__str__())
        finally:
            handle.close()


class PlatformStanza(SoftStanza):
    """
    A stanza introduced by a '^PLATFORM' header line.
    """

    def __init__(self):
        keys = {
            '^PLATFORM': KeyRequired,
            '!Platform_title': KeyRequired,
            '!Platform_distribution': KeyRequired,
            '!Platform_technology': KeyRequired,
            '!Platform_organism': KeyOnePlus,
            '!Platform_manufacturer': KeyRequired,
            '!Platform_manufacture_protocol': KeyOnePlus,
            '!Platform_catalog_number': KeyZeroPlus,
            '!Platform_web_link': KeyZeroPlus,
            '!Platform_support': KeyOptional,
            '!Platform_coating': KeyOptional,
            '!Platform_description': KeyZeroPlus,
            '!Platform_contributor': KeyZeroPlus,
            '!Platform_pubmed_id': KeyZeroPlus,
            '!Platform_geo_accession': KeyOptional,
            '!Platform_table_begin': KeyRequired,
            '!Platform_table_end': KeyRequired,
        }

        SoftStanza.__init__(self, keys)


class SampleStanza(SoftStanza):
    """
    A stanza introduced by a '^SAMPLE' header line.
    """

    def __init__(self):
        keys = {
            '^SAMPLE': KeyRequired,
            '!Sample_type': KeyRequired,
            '!Sample_title': KeyRequired,
            '!Sample_supplementary_file': KeyOnePlusNumbered,
            '!Sample_supplementary_file_checksum': KeyZeroPlusNumbered,
            '!Sample_supplementary_file_build': KeyZeroPlusNumbered,
            '!Sample_raw_file': KeyOnePlusNumbered,
            '!Sample_raw_file_type': KeyOnePlusNumbered,
            '!Sample_raw_file_checksum': KeyZeroPlusNumbered,
            '!Sample_source_name': KeyRequired,
            '!Sample_organism': KeyOnePlus,
            '!Sample_characteristics': KeyOnePlus,
            '!Sample_biomaterial_provider': KeyZeroPlus,
            '!Sample_treatment_protocol': KeyZeroPlus,
            '!Sample_growth_protocol': KeyZeroPlus,
            '!Sample_molecule': KeyRequired,
            '!Sample_extract_protocol': KeyOnePlus,
            '!Sample_library_strategy': KeyOnePlus,
            '!Sample_library_source': KeyOnePlus,
            '!Sample_library_selection': KeyOnePlus,
            '!Sample_instrument_model': KeyOnePlus,
            '!Sample_data_processing': KeyRequired,
            '!Sample_barcode': KeyOptional,
            '!Sample_description': KeyZeroPlus,
            '!Sample_geo_accession': KeyOptional,
            '!Sample_table_begin': KeyOptional,
            '!Sample_table': KeyOptional,
            '!Sample_table_end': KeyOptional,
        }

        SoftStanza.__init__(self, keys)


class SeriesStanza(SoftStanza):
    """
    A stanza introduced by a '^SERIES' header line.
    """

    def __init__(self):
        keys = {
            '^SERIES': KeyRequired,
            '!Series_title': KeyRequired,
            '!Series_summary': KeyOnePlus,
            '!Series_overall_design': KeyRequired,
            '!Series_pubmed_id': KeyZeroPlus,
            '!Series_web_link': KeyZeroPlus,
            '!Series_contributor': KeyZeroPlus,
            '!Series_variable': KeyZeroPlusNumbered,
            '!Series_variable_description': KeyZeroPlusNumbered,
            '!Series_variable_sample_list': KeyZeroPlusNumbered,
            '!Series_repeats': KeyZeroPlusNumbered,
            '!Series_repeats_sample_list': KeyZeroPlusNumbered,
            '!Series_sample_id': KeyOnePlus,
            '!Series_geo_accession': KeyOptional,
        }

        SoftStanza.__init__(self, keys)