a0f9d361a3f365777aab3843309aa534e5277e03 wong Thu Jan 5 10:56:29 2012 -0800 first draft of raMerging functions diff --git python/lib/ucscgenomics/ra.py python/lib/ucscgenomics/ra.py index d900f2a..9b675f3 100644 --- python/lib/ucscgenomics/ra.py +++ python/lib/ucscgenomics/ra.py @@ -1,18 +1,19 @@ -import sys +import sys, string import re from ucscgenomics.ordereddict import OrderedDict +from ucscgenomics import ucscUtils import collections class RaFile(OrderedDict): ''' Stores a Ra file in a set of entries, one for each stanza in the file. To make a RaFile, it is usually easiest to just pass it's path: rafile = ra.RaFile('kent/src/hg/.../wgEncodeSomeRaFile.ra') The data is read in and organized as a collection of stanzas. Ra files store the stanza by it's name, so to access a specific stanza, say: somestanza = rafile['wgEncodeSomeStanzaName'] Once you have a stanza, you may want specific data about that stanza. Stanzas are, as ra files, organized as a collection of terms. Therefore @@ -192,30 +193,78 @@ to be added to the list for that stanza. For each stanza, if where(stanza) holds, it will add select(stanza) to the list of returned entities. Also forces silent failure of key errors, so you don't have to check that a value is or is not in the stanza. 
def mergeRa(self, other):
    '''
    Input:
        other - the RaFile object to merge with this one
    Output:
        A new, merged RaFile

    Keys are visited in the order produced by ucscUtils.mergeList() over
    the keys of both files. Comment lines and blank lines are carried
    over by key alone. A stanza found in only one file is placed in the
    result as-is (the same object, not a copy). A stanza found in both
    files is merged term by term:

        * a term found in only one stanza keeps its value
        * a term with the same value in both stanzas is kept once
        * a term whose values differ is emitted twice: '<<<<<term'
          holding this file's value, followed by '>>>>>term' holding
          the other file's value
    '''

    # presumably an order-preserving union of both key lists -- verify
    # against ucscUtils.mergeList
    mergedKeys = ucscUtils.mergeList(list(self), list(other))
    selfKeys = set(self)
    otherKeys = set(other)

    # Raw strings so that \s is always read as a regex escape.
    commentLine = re.compile(r'^\s*#')
    blankLine = re.compile(r'^\s*$')

    merged = RaFile()
    for key in mergedKeys:
        if commentLine.match(key) or blankLine.match(key):
            # NOTE(review): assumes RaFile.append() records a bare
            # comment/blank-line entry that has no stanza -- confirm.
            merged.append(key)
            continue

        if key not in selfKeys:
            # Stanzas are stored under their key, the same way the rest
            # of this class fills a RaFile, rather than appended like a
            # comment line.
            merged[key] = other[key]
        elif key not in otherKeys:
            merged[key] = self[key]
        else:
            mine = self[key]
            theirs = other[key]
            mineTerms = set(mine.iterkeys())
            theirTerms = set(theirs.iterkeys())
            termOrder = ucscUtils.mergeList(list(mine.iterkeys()),
                                            list(theirs.iterkeys()))

            # NOTE(review): the merged stanza starts out as an empty
            # RaStanza; if callers rely on a name attribute being set
            # on it, that still needs to be done -- confirm.
            newStanza = RaStanza()
            for term in termOrder:
                if term not in mineTerms:
                    newStanza[term] = theirs[term]
                elif term not in theirTerms or mine[term] == theirs[term]:
                    newStanza[term] = mine[term]
                else:
                    # Conflict: keep both values, this file's first.
                    newStanza['<<<<<%s' % term] = mine[term]
                    newStanza['>>>>>%s' % term] = theirs[term]
            merged[key] = newStanza

    return merged