e14a4cf5576db1bae722948cd5697289ef073cd6 mmaddren Thu Jan 26 12:08:17 2012 -0800 added cvValidate back into the tree diff --git python/lib/ucscgenomics/ra.py python/lib/ucscgenomics/ra.py index 5cb074f..04abb68 100644 --- python/lib/ucscgenomics/ra.py +++ python/lib/ucscgenomics/ra.py @@ -69,31 +69,32 @@ that lambda s: 1 means always return true. Lambda expressions are always preferable to functions unless the expression would need to be reused multiple times. It is also best to reduce the set of stanzas as much as possible before operating over them. Filtering allows you to eliminate a lot of code. ''' def __init__(self, filePath=None, key=None): OrderedDict.__init__(self) if filePath != None: self.read(filePath, key) def read(self, filePath, key=None): ''' - Reads an rafile stanza by stanza, and internalizes it. + Reads an rafile stanza by stanza, and internalizes it. Don't override + this for derived types, instead override readStanza. ''' file = open(filePath, 'r') #entry = None stanza = list() keyValue = '' reading = 1 while reading: line = file.readline() if line == '': reading = 0 @@ -112,34 +113,45 @@ if entry != None and keyValue != testKey: raise KeyError('Inconsistent Key ' + testKey) if entry != None: if name != None or key == None: if name in self: raise KeyError('Duplicate Key ' + name) self[name] = entry stanza = list() file.close() def readStanza(self, stanza, key=None): + ''' + Override this to create custom stanza behavior in derived types. + + IN + stanza: list of strings with keyval data + key: optional key for selective key filtering. Don't worry about it + + OUT + namekey: the key of the stanza's name + nameval: the value of the stanza's name + entry: the stanza itself + ''' entry = RaStanza() if entry.readStanza(stanza, key) == None: return None, None, None - entry = RaStanza() val1, val2 = entry.readStanza(stanza, key) return val1, val2, entry def iter(self): pass def iterkeys(self): for item in self._OrderedDict__ordering: if not(item.startswith('#') or item == ''): yield item def itervalues(self): @@ -217,59 +229,52 @@ one with identical values being preserved, differences are marked with a >>> and <<< ''' mergedKeys = ucscUtils.mergeList(list(self), list(other)) selfKeys = set(self) otherKeys = set(other) newCommon = RaFile() p = re.compile('^\s*#') p2 = re.compile('^\s*$') for i in mergedKeys: if p.match(i) or p2.match(i): newCommon.append(i) continue if i not in selfKeys: - newCommon[i] = other[i] - continue + newCommon.append(other[i]) if i not in otherKeys: - newCommon[i] = self[i] - continue + newCommon.append(self[i]) if i in otherKeys and i in selfKeys: newStanza = RaStanza() selfStanzaKeys = set(self[i].iterkeys()) otherStanzaKeys = set(other[i].iterkeys()) - stanzaKeys = ucscUtils.mergeList(list(self[i]), list(other[i])) + stanzaKeys = ucscUtils.mergeList(list(self[i].iterkeys()), list(other[i].iterkeys())) for j in stanzaKeys: - if p.match(j): - newStanza.append(j) - continue if j not in selfStanzaKeys: newStanza[j] = other[i][j] - continue if j not in otherStanzaKeys: newStanza[j] = self[i][j] - continue if j in selfStanzaKeys and j in otherStanzaKeys: if self[i][j] == other[i][j]: newStanza[j] = self[i][j] else: in_j = '>>>>>%s' % j out_j = '<<<<<%s' % j newStanza[out_j] = self[i][j] newStanza[in_j] = other[i][j] - newCommon[i] = newStanza + newCommon.append(newStanza) return newCommon def summaryDiff(self, other): ''' Input: RaFile object being compared. Output: RaFile with differences. Returns ***partial*** stanzas of ***anything*** different from the self dictionary compared to the other dictionary. For versatility, it only returns stanzas from the self Ra file. In other words, it returns the self dictionary lines that are either not present in or different from the other dictionary. @@ -284,52 +289,52 @@ if stanza.name not in other.keys(): RetThis[stanza.name] = stanza else: if stanza.difference(other[stanza.name]): RetThis[stanza.name] = stanza.difference(other[stanza.name]) return RetThis def changeSummary(self, otherRa): ''' Input: Two RaFile objects Output: Dictionary showing differences between stanzas, list of added and dropeed stanzas ''' retDict = collections.defaultdict(list) - addList = set(self.iterkeys()) - set(otherRa.iterkeys()) - dropList = set(otherRa.iterkeys()) - set(self.iterkeys()) + dropList = set(self.iterkeys()) - set(otherRa.iterkeys()) + addList = set(otherRa.iterkeys()) - set(self.iterkeys()) common = set(self.iterkeys()) & set(otherRa.iterkeys()) p = re.compile('^\s*#') for stanza in common: if p.match(stanza): continue for key in self[stanza]: if p.match(key): continue if key in otherRa[stanza]: if self[stanza][key] != otherRa[stanza][key]: retDict[stanza].append("Changed %s from %s -> %s" %(key, otherRa[stanza][key], self[stanza][key])) else: retDict[stanza].append("Added %s -> %s" %(key, self[stanza][key])) for key in otherRa[stanza]: if p.match(key): continue if key not in self[stanza]: retDict[stanza].append("Dropped %s -> %s" %(key, otherRa[stanza][key])) - return retDict, addList, dropList + return retDict, dropList, addList def diffFilter(self, select, other): ''' Input: Lambda function of desired comparison term RaFile object being compared. Output: RaFile with differences. Filter returns ***full*** stanzas of a ***select function*** from the self dictionary compared to the other dictionary. For versatility, it only returns stanzas from the self Ra file. In other words, it only returns self dictionary stanzas with the function term that are either not found in or different from the other dictionary. @@ -363,140 +368,98 @@ RetThis[stanza.name] = stanza elif thisSelectDict[stanza.name] != thatSelectDict[stanza.name]: RetThis[stanza.name] = stanza return RetThis def updateDiffFilter(self, term, other): ''' Replicates updateMetadata. Input: Term Other raFile Output: Merged RaFile Stanzas found in 'self' and 'other' that have the 'Term' in 'other' - are overwritten (or inserted if not found) into 'self'. - Final merged dictionary is returned. + are overwritten (or inserted if not found) into 'self'. Final merged + dictionary is returned. ''' ret = self common = set(self.iterkeys()) & set(other.iterkeys()) for stanza in common: if term not in self[stanza] and term not in other[stanza]: continue if term in self[stanza] and term not in other[stanza]: del ret[stanza][term] continue + if term in other[stanza]: #Remake stanza to keep order of terms tempStanza = RaStanza() tempStanza._name = stanza - selfKeys = list(self[stanza].iterkeys()) - otherKeys = list(other[stanza].iterkeys()) - newOther = list() - #filter out keys in other that aren't in self, or the term we're interested in - for i in otherKeys: - if not i in selfKeys and i != term: - continue + try: + tempStanza['metaObject'] = self[stanza]['metaObject'] + tempStanza['objType'] = self[stanza]['objType'] + termList = self[stanza].keys() + termList.remove('metaObject') + termList.remove('objType') + except KeyError: + termList = self[stanza].keys() + if term not in termList: + termList.append(term) + for t in sorted(termList, key=str.lower): + if t == term: + if t not in self[stanza]: + tempStanza[t] = other[stanza][t] + elif self[stanza][t] != other[stanza][t]: + tempStanza[t] = other[stanza][t] else: - newOther.append(i) - #merge self keylist and filtered other list - masterList = ucscUtils.mergeList(newOther, selfKeys) - for i in masterList: - if i == term: - tempStanza[i] = other[stanza][i] + tempStanza[t] = self[stanza][t] else: - tempStanza[i] = self[stanza][i] + tempStanza[t] = self[stanza][t] ret[stanza] = tempStanza return ret - def printTrackDbFormat(self): - retstring = "" - space = False - tab = False - commentList = [] - for stanza in self: - if stanza == "": - if commentList: - for line in commentList: - if space == True: - retstring += " " - if tab == True: - retstring += " " - retstring += line + "\n" - commentList = [] - retstring += "\n" - continue - if stanza.startswith("#"): - commentList.append(stanza) - continue - if "visibility" in self[stanza].keys(): - tab = False - space = True - if "subGroups" in self[stanza].keys(): - tab = True - space = True - if commentList: - for line in commentList: - if space == True: - retstring += " " - if tab == True: - retstring += " " - retstring += line + "\n" - commentList = [] - for line in self[stanza]: - if space == True: - retstring += " " - if tab == True: - retstring += " " - if line.startswith("#"): - retstring += line + "\n" - else: - retstring += line + " " + self[stanza][line] + "\n" - retstring += "\n" - return retstring - - def __str__(self): str = '' for item in self.iteritems(): if len(item) == 1: str += item[0].__str__() + '\n' else: str += item[1].__str__() + '\n' return str #.rsplit('\n', 1)[0] class RaStanza(OrderedDict): ''' Holds an individual entry in the RaFile. ''' - def __init__(self): - self._name = '' - self._nametype = '' - OrderedDict.__init__(self) - @property def name(self): return self._name + def __init__(self): + self._name = '' + self._nametype = '' + OrderedDict.__init__(self) def readStanza(self, stanza, key=None): ''' - Populates this entry from a single stanza + Populates this entry from a single stanza. Override this to create + custom behavior in derived classes ''' for line in stanza: self.readLine(line) return self.readName(stanza, key) def readName(self, stanza, key=None): ''' Extracts the Stanza's name from the value of the first line of the stanza. ''' if key == None: