586f861d9abdf012f9b312984280ebaef43ba21d wong Thu Dec 1 16:42:34 2011 -0800 added in willy's changed to the ra.py library, only went over changeSummary and updateDiffFilter with him so far, the rest still need to be looked over diff --git python/lib/ucscgenomics/ra.py python/lib/ucscgenomics/ra.py index 9ed40fd..3d3208a 100644 --- python/lib/ucscgenomics/ra.py +++ python/lib/ucscgenomics/ra.py @@ -1,18 +1,19 @@ import sys import re from ucscgenomics.ordereddict import OrderedDict +import collections class RaFile(OrderedDict): """ Stores a Ra file in a set of entries, one for each stanza in the file. To make a RaFile, it is usually easiest to just pass it's path: rafile = ra.RaFile('kent/src/hg/.../wgEncodeSomeRaFile.ra') The data is read in and organized as a collection of stanzas. Ra files store the stanza by it's name, so to access a specific stanza, say: somestanza = rafile['wgEncodeSomeStanzaName'] Once you have a stanza, you may want specific data about that stanza. Stanzas are, as ra files, organized as a collection of terms. Therefore to get the description of the stanza, we can say: @@ -155,66 +156,226 @@ for item in self._OrderedDict__ordering: if not (item.startswith('#') or item == ''): yield item, self[item] else: yield [item] def append(self, item): OrderedDict.append(self, item) def filter(self, where, select): """ select useful data from matching criteria - where: the conditional function that must be met. Where takes one argument, the stanza and should return true or false - select: the data to return. Takes in stanza, should return whatever to be added to the list for that stanza. + where: the conditional function that must be met. Where takes one + argument, the stanza and should return true or false + select: the data to return. Takes in stanza, should return whatever + to be added to the list for that stanza. - For each stanza, if where(stanza) holds, it will add select(stanza) to the list of returned entities. - Also forces silent failure of key errors, so you don't have to check that a value is or is not in the stanza. + For each stanza, if where(stanza) holds, it will add select(stanza) + to the list of returned entities. Also forces silent failure of key + errors, so you don't have to check that a value is or is not in the stanza. """ ret = list() for stanza in self.itervalues(): try: if where(stanza): ret.append(select(stanza)) except KeyError: continue return ret def filter2(self, where): """ select useful data from matching criteria + Filter2 returns a Ra dictionary. Easier to use but more memory intensive. - where: the conditional function that must be met. Where takes one argument, the stanza and should return true or false - select: the data to return. Takes in stanza, should return whatever to be added to the list for that stanza. + where: the conditional function that must be met. Where takes one + argument, the stanza and should return true or false + select: the data to return. Takes in stanza, should return whatever + to be added to the list for that stanza. - For each stanza, if where(stanza) holds, it will add select(stanza) to the list of returned entities. - Also forces silent failure of key errors, so you don't have to check that a value is or is not in the stanza. + For each stanza, if where(stanza) holds, it will add select(stanza) + to the list of returned entities. Also forces silent failure of key + errors, so you don't have to check that a value is or is not in the stanza. """ - ret = RaFile() for stanza in self.itervalues(): try: if where(stanza): ret[stanza.name] = stanza except KeyError: continue return ret + def summaryDiff(self,other): + """ + Input: + RaFile object being compared. + Output: RaFile with differences. + + Returns ***partial*** stanzas of ***anything*** different + from the self dictionary compared to the other dictionary. + For versatility, it only returns stanzas from the self Ra file. In other + words, it returns the self dictionary lines that are either not present + in or different from the other dictionary. + + To obtain full set of differences, run summaryDiff twice + ra1 = this.summaryDiff(that) + and + ra2 = that.summaryDiff(this) + """ + this = RaFile() + RetThis = RaFile() + for stanza in self.itervalues(): + if stanza.name not in other.keys(): + RetThis[stanza.name] = stanza + else: + if stanza.difference(other[stanza.name]): + RetThis[stanza.name] = stanza.difference(other[stanza.name]) + return RetThis + + def changeSummary(self, otherRa): + """ + Input: + Two RaFile objects + Output: + Dictionary showing differences between stanzas, list of added and dropeed stanzas + """ + retDict = collections.defaultdict(list) + dropList = set(self.iterkeys()) - set(otherRa.iterkeys()) + addList = set(otherRa.iterkeys()) - set(self.iterkeys()) + common = set(self.iterkeys()) & set(otherRa.iterkeys()) + + p = re.compile('^\s*#') + for stanza in common: + if p.match(stanza): + continue + for key in self[stanza]: + if p.match(key): + continue + if key in otherRa[stanza]: + if self[stanza][key] != otherRa[stanza][key]: + retDict[stanza].append("Changed %s from %s -> %s" %(key, self[stanza][key], otherRa[stanza][key])) + else: + retDict[stanza].append("Added %s -> %s" %(key, self[stanza][key])) + for key in otherRa[stanza]: + if p.match(key): + continue + if key not in self[stanza]: + retDict[stanza].append("Dropped %s -> %s" %(key, otherRa[stanza][key])) + return retDict, dropList, addList + + def diffFilter(self, select, other): + """ + Input: + Lambda function of desired comparison term + RaFile object being compared. + Output: RaFile with differences. + + Filter returns ***full*** stanzas of a ***select function*** from + the self dictionary compared to the other dictionary. For + versatility, it only returns stanzas from the self Ra file. In other + words, it only returns self dictionary stanzas with the function term + that are either not found in or different from the other + dictionary. + + To obtain full set of differences, run diffFilter twice + ra1 = this.diffFilter(select function, that) + and + ra2 = that.diffFilter(select function, this) + """ + this = RaFile() + RetThis = RaFile() + thisSelectDict = dict() + thatSelectDict = dict() + #Build 2 dict of stanzas to later compare line-by-line + for stanza in self.itervalues(): + try: + if select(stanza): + this[stanza.name] = stanza #'this' only records stanzas of the self dict + thisSelectDict[stanza.name] = select(stanza) + except KeyError: + continue + for stanza in other.itervalues(): + #Exact code as filter2 but kept for clarity. + try: + if select(stanza): + thatSelectDict[stanza.name] = select(stanza) + except KeyError: + continue + #Compare this and that dict + for stanza in this.itervalues(): + if stanza.name not in thatSelectDict: + RetThis[stanza.name] = stanza + elif thisSelectDict[stanza.name] != thatSelectDict[stanza.name]: + RetThis[stanza.name] = stanza + return RetThis + + def updateDiffFilter(self, term, other): + """ + Replicates updateMetadata. + Input: + Term + Other raFile + + Output: + Merged RaFile + Stanzas found in 'self' and 'other' that have the 'Term' in 'other' + are overwritten (or inserted if not found) into 'self'. Final merged + dictionary is returned. + """ + ret = self + common = set(self.iterkeys()) & set(self.iterkeys()) + for stanza in common: + if term not in self[stanza] and term not in other[stanza]: + continue + if term in self[stanza] and term not in other[stanza]: + del ret[stanza][term] + continue + + if term in other[stanza]: + #Remake stanza to keep order of terms + tempStanza = RaStanza() + tempStanza._name = stanza + try: + tempStanza['metaObject'] = self[stanza]['metaObject'] + tempStanza['objType'] = self[stanza]['objType'] + termList = self[stanza].keys() + termList.remove('metaObject') + termList.remove('objType') + except KeyError: + termList = self[stanza].keys() + if term not in termList: + termList.append(term) + for t in sorted(termList, key=str.lower): + if t == term: + if t not in self[stanza]: + tempStanza[t] = other[stanza][t] + elif self[stanza][t] != other[stanza][t]: + tempStanza[t] = other[stanza][t] + else: + tempStanza[t] = self[stanza][t] + else: + tempStanza[t] = self[stanza][t] + ret[stanza] = tempStanza + + return ret + def __str__(self): str = '' for item in self.iteritems(): if len(item) == 1: str += item[0].__str__() + '\n' else: str += item[1].__str__() + '\n' return str class RaStanza(OrderedDict): """ Holds an individual entry in the RaFile. """ @@ -255,48 +416,65 @@ """ Reads a single line from the stanza, extracting the key-value pair """ if line.startswith('#') or line == '': OrderedDict.append(self, line) else: raKey = line.split(' ', 1)[0] raVal = '' if (len(line.split(' ', 1)) == 2): raVal = line.split(' ', 1)[1] #if raKey in self: #raise KeyError(raKey + ' already exists') self[raKey] = raVal + def difference(self, other): + ''' + Complement function to summaryDiff. + Takes in self and a comparison Stanza. + Returns new stanza with terms from 'self' that are different from 'other' + Like the summaryDiff, to get the other terms, this needs to be run + again with self and other switched. + ''' + retRa = RaStanza() + retRa._name = self.name + for key in other.keys(): + try: + if other[key] != self[key] and not key.startswith('#'): + retRa[key] = self[key] + except KeyError: + continue + #maybe add empty keys + return retRa def iterkeys(self): for item in self._OrderedDict__ordering: if not (item.startswith('#') or item == ''): yield item def itervalues(self): for item in self._OrderedDict__ordering: if not (item.startswith('#') or item == ''): yield self[item] def iteritems(self): for item in self._OrderedDict__ordering: if not (item.startswith('#') or item == ''): yield item, self[item] def iter(self): iterkeys(self) - def __str__(self): str = '' for key in self: if key.startswith('#'): str += key + '\n' else: str += key + ' ' + self[key] + '\n' return str