586f861d9abdf012f9b312984280ebaef43ba21d
wong
  Thu Dec 1 16:42:34 2011 -0800
added in willy's changes to the ra.py library; only went over changeSummary and updateDiffFilter with him so far, the rest still needs to be looked over
diff --git python/lib/ucscgenomics/ra.py python/lib/ucscgenomics/ra.py
index 9ed40fd..3d3208a 100644
--- python/lib/ucscgenomics/ra.py
+++ python/lib/ucscgenomics/ra.py
@@ -1,18 +1,19 @@
 import sys
 import re
 from ucscgenomics.ordereddict import OrderedDict
+import collections
 
 class RaFile(OrderedDict):
     """
     Stores a Ra file in a set of entries, one for each stanza in the file.
 
     To make a RaFile, it is usually easiest to just pass it's path:
         rafile = ra.RaFile('kent/src/hg/.../wgEncodeSomeRaFile.ra')
 
     The data is read in and organized as a collection of stanzas. Ra files
     store the stanza by it's name, so to access a specific stanza, say:
         somestanza = rafile['wgEncodeSomeStanzaName']
 
     Once you have a stanza, you may want specific data about that stanza.
     Stanzas are, as ra files, organized as a collection of terms. Therefore
     to get the description of the stanza, we can say:
@@ -155,66 +156,226 @@
         for item in self._OrderedDict__ordering:
             if not (item.startswith('#') or item == ''):
                 yield item, self[item]
             else:
                 yield [item]
 
 
     def append(self, item):
         OrderedDict.append(self, item)
 
 
     def filter(self, where, select):
         """
         select useful data from matching criteria
 
-        where: the conditional function that must be met. Where takes one argument, the stanza and should return true or false
-        select: the data to return. Takes in stanza, should return whatever to be added to the list for that stanza.
+        where: the conditional function that must be met. Where takes one
+        argument, the stanza and should return true or false
+        select: the data to return. Takes in stanza, should return whatever
+        to be added to the list for that stanza.
 
-        For each stanza, if where(stanza) holds, it will add select(stanza) to the list of returned entities.
-        Also forces silent failure of key errors, so you don't have to check that a value is or is not in the stanza.
+        For each stanza, if where(stanza) holds, it will add select(stanza)
+        to the list of returned entities. Also forces silent failure of key
+        errors, so you don't have to check that a value is or is not in the stanza.
         """
 
         ret = list()
         for stanza in self.itervalues():
             try:
                 if where(stanza):
                     ret.append(select(stanza))
             except KeyError:
                 continue
         return ret
 
     def filter2(self, where):
         """
         select useful data from matching criteria
+        Filter2 returns a Ra dictionary. Easier to use but more memory intensive.
 
-        where: the conditional function that must be met. Where takes one argument, the stanza and should return true or false
-        select: the data to return. Takes in stanza, should return whatever to be added to the list for that stanza.
+        where: the conditional function that must be met. Where takes one
+        argument, the stanza and should return true or false
+        select: the data to return. Takes in stanza, should return whatever
+        to be added to the list for that stanza.
 
-        For each stanza, if where(stanza) holds, it will add select(stanza) to the list of returned entities.
-        Also forces silent failure of key errors, so you don't have to check that a value is or is not in the stanza.
+        For each stanza, if where(stanza) holds, it will add select(stanza)
+        to the list of returned entities. Also forces silent failure of key
+        errors, so you don't have to check that a value is or is not in the stanza.
         """
-
         ret = RaFile()
         for stanza in self.itervalues():
             try:
                 if where(stanza):
                     ret[stanza.name] = stanza
             except KeyError:
                 continue
         return ret
 
+    def summaryDiff(self,other):
+        """
+        Input:
+            RaFile object being compared.
+        Output: RaFile with differences.
+
+        Returns ***partial*** stanzas of ***anything*** different
+        from the self dictionary compared to the other dictionary.
+        For versatility, it only returns stanzas from the self Ra file. In other
+        words, it returns the self dictionary lines that are either not present
+        in or different from the other dictionary.
+
+        To obtain full set of differences, run summaryDiff twice
+        ra1 = this.summaryDiff(that)
+        and
+        ra2 = that.summaryDiff(this)
+        """
+        this = RaFile()
+        RetThis = RaFile()
+        for stanza in self.itervalues():
+            if stanza.name not in other.keys():
+                RetThis[stanza.name] = stanza
+            else:
+                if stanza.difference(other[stanza.name]):
+                    RetThis[stanza.name] = stanza.difference(other[stanza.name])
+        return RetThis
+
+    def changeSummary(self, otherRa):
+        """
+        Input:
+            Two RaFile objects
+        Output:
+            Dictionary showing differences between stanzas, list of added and dropped stanzas
+        """
+        retDict = collections.defaultdict(list)
+        dropList = set(self.iterkeys()) - set(otherRa.iterkeys())
+        addList = set(otherRa.iterkeys()) - set(self.iterkeys())
+        common = set(self.iterkeys()) & set(otherRa.iterkeys())
+
+        p = re.compile('^\s*#')
+        for stanza in common:
+            if p.match(stanza):
+                continue
+            for key in self[stanza]:
+                if p.match(key):
+                    continue
+                if key in otherRa[stanza]:
+                    if self[stanza][key] != otherRa[stanza][key]:
+                        retDict[stanza].append("Changed %s from  %s -> %s" %(key, self[stanza][key], otherRa[stanza][key]))
+                else:
+                    retDict[stanza].append("Added %s -> %s" %(key, self[stanza][key]))
+            for key in otherRa[stanza]:
+                if p.match(key):
+                    continue
+                if key not in self[stanza]:
+                    retDict[stanza].append("Dropped %s -> %s" %(key, otherRa[stanza][key]))
+        return retDict, dropList, addList
+
+    def diffFilter(self, select, other):
+        """
+        Input:
+            Lambda function of desired comparison term
+            RaFile object being compared.
+        Output: RaFile with differences.
+
+        Filter returns ***full*** stanzas of a ***select function*** from
+        the self dictionary compared to the other dictionary. For
+        versatility, it only returns stanzas from the self Ra file. In other
+        words, it only returns self dictionary stanzas with the function term
+        that are either not found in or different from the other
+        dictionary.
+
+        To obtain full set of differences, run diffFilter twice
+        ra1 = this.diffFilter(select function, that)
+        and
+        ra2 = that.diffFilter(select function, this)
+        """
+        this = RaFile()
+        RetThis = RaFile()
+        thisSelectDict = dict()
+        thatSelectDict = dict()
+        #Build 2 dict of stanzas to later compare line-by-line
+        for stanza in self.itervalues():
+            try:
+                if select(stanza):
+                    this[stanza.name] = stanza #'this' only records stanzas of the self dict
+                    thisSelectDict[stanza.name] = select(stanza)
+            except KeyError:
+                continue
+        for stanza in other.itervalues():
+            #Exact code as filter2 but kept for clarity.
+            try:
+                if select(stanza):
+                    thatSelectDict[stanza.name] = select(stanza)
+            except KeyError:
+                continue
+        #Compare this and that dict
+        for stanza in this.itervalues():
+            if stanza.name not in thatSelectDict:
+                RetThis[stanza.name] = stanza
+            elif thisSelectDict[stanza.name] != thatSelectDict[stanza.name]:
+                RetThis[stanza.name] = stanza
+        return RetThis
+
+    def updateDiffFilter(self, term, other):
+        """
+        Replicates updateMetadata.
+        Input:
+            term: name of the term to synchronize from 'other'
+            other: RaFile supplying the value of 'term'
+
+        Output:
+            Merged RaFile ('self', modified in place and returned).
+                For every stanza found in both 'self' and 'other', 'term' is
+                overwritten (or inserted if not found) with the value from 'other',
+                and deleted from 'self' when 'other' does not have it.
+        """
+        ret = self
+        common = set(self.iterkeys()) & set(other.iterkeys())  #only stanzas present in both files
+        for stanza in common:
+            if term not in self[stanza] and term not in other[stanza]:
+                continue
+            if term in self[stanza] and term not in other[stanza]:
+                del ret[stanza][term]
+                continue
+
+            if term in other[stanza]:
+                #Remake stanza to keep order of terms
+                tempStanza = RaStanza()
+                tempStanza._name = stanza
+                try:
+                    tempStanza['metaObject'] = self[stanza]['metaObject']
+                    tempStanza['objType'] = self[stanza]['objType']
+                    termList = self[stanza].keys()
+                    termList.remove('metaObject')
+                    termList.remove('objType')
+                except KeyError:
+                    termList = self[stanza].keys()
+                if term not in termList:
+                    termList.append(term)
+                for t in sorted(termList, key=str.lower):
+                    if t == term:
+                        if t not in self[stanza]:
+                            tempStanza[t] = other[stanza][t]
+                        elif self[stanza][t] != other[stanza][t]:
+                            tempStanza[t] = other[stanza][t]
+                        else:
+                            tempStanza[t] = self[stanza][t]
+                    else:
+                        tempStanza[t] = self[stanza][t]
+                ret[stanza] = tempStanza
+
+        return ret
+
     def __str__(self):
         str = ''
         for item in self.iteritems():
             if len(item) == 1:
                 str += item[0].__str__() + '\n'
             else:
                 str += item[1].__str__() + '\n'
         return str
 
 
 class RaStanza(OrderedDict):
     """
     Holds an individual entry in the RaFile.
     """
 
@@ -255,48 +416,65 @@
         """
         Reads a single line from the stanza, extracting the key-value pair
         """
 
         if line.startswith('#') or line == '':
             OrderedDict.append(self, line)
         else:
             raKey = line.split(' ', 1)[0]
             raVal = ''
             if (len(line.split(' ', 1)) == 2):
                 raVal = line.split(' ', 1)[1]
             #if raKey in self:
                 #raise KeyError(raKey + ' already exists')
             self[raKey] = raVal
 
+    def difference(self, other):
+        '''
+        Complement function to summaryDiff.
+        Takes in self and a comparison Stanza.
+        Returns new stanza with terms from 'self' that are different from 'other'
+        Like the summaryDiff, to get the other terms, this needs to be run
+        again with self and other switched.
+        '''
+        retRa = RaStanza()
+        retRa._name = self.name
+        for key in other.keys():
+            try:
+                if other[key] != self[key] and not key.startswith('#'):
+                    retRa[key] = self[key]
+            except KeyError:
+                continue
+                #maybe add empty keys
+        return retRa
 
     def iterkeys(self):
         for item in self._OrderedDict__ordering:
             if not (item.startswith('#') or item == ''):
                 yield item
 
 
     def itervalues(self):
         for item in self._OrderedDict__ordering:
             if not (item.startswith('#') or item == ''):
                 yield self[item]
 
 
     def iteritems(self):
         for item in self._OrderedDict__ordering:
             if not (item.startswith('#') or item == ''):
                 yield item, self[item]
 
 
     def iter(self):
         iterkeys(self)
 
-
     def __str__(self):
         str = ''
         for key in self:
             if key.startswith('#'):
                 str += key + '\n'
             else:
                 str += key + ' ' + self[key] + '\n'
 
         return str