python/lib/ucscGb/ra.py 58f787640ea77f16c8f6d4481693a83ec9ef647b

58f787640ea77f16c8f6d4481693a83ec9ef647b
vsmalladi
  Tue May 8 10:29:52 2012 -0700
First step in python lib reorganization. Redmine #7029.
diff --git python/lib/ucscGb/ra.py python/lib/ucscGb/ra.py
deleted file mode 100644
index 79b99e6..0000000
--- python/lib/ucscGb/ra.py
+++ /dev/null
@@ -1,613 +0,0 @@
-import sys, string
-import re
-from ucscgenomics.ordereddict import OrderedDict
-from ucscgenomics import ucscUtils
-import collections
-
-class RaFile(OrderedDict):
-    '''
-    Stores a Ra file in a set of entries, one for each stanza in the file.
-
-    To make a RaFile, it is usually easiest to just pass it's path:
-        rafile = ra.RaFile('kent/src/hg/.../wgEncodeSomeRaFile.ra')
-
-    The data is read in and organized as a collection of stanzas. Ra files
-    store the stanza by it's name, so to access a specific stanza, say:
-        somestanza = rafile['wgEncodeSomeStanzaName']
-
-    Once you have a stanza, you may want specific data about that stanza.
-    Stanzas are, as ra files, organized as a collection of terms. Therefore
-    to get the description of the stanza, we can say:
-        somedescription = somestanza['description']
-
-    You can also access a stanza's name from the stanza itself, since by the
-    nature of ra files, the first key is it's name. Therefore for most ra
-    files the following holds true:
-        somestanza.name = somestanza['metaObject'] = 'wgEncodeSomeStanzaName'
-
-    Although the above is useful if you want one thing, it's usually more
-    helpful to be able to loop and query on the stanza. To add a term named
-    'foobar' to every stanza in a ra file:
-        for stanza in rafile.values():
-            stanza['foobar'] = 'some value'
-
-    Note that I iterated over values. It can also be useful to iterate over
-    a stanza's keys:
-        for key in rafile.keys():
-            print key
-
-    Note that ra files are order preserving. Added entries are appended to the
-    end of the file. This allows you to print out a ra file easily:
-        print rafile
-
-    Most of the time you don't want to do something with all stanzas though,
-    instead you want to filter them. The included filter method allows you to
-    specify two functions (or lambda expressions). The first is the 'where'
-    predicate, which must take in one stanza, and return true/false depending
-    on whether you want to take that stanza. The second is the 'select'
-    predicate, which takes in the stanza, and returns some subset or superset
-    of the stanza as a list. Using filter is preferable to for loops where
-    there are no side effects, or to filter data before iterating over it as
-    opposed to using if statements in the loop. To get all stanzas with one
-    experiment ID for instance, we would do something like this:
-        stanzas = rafile.filter(lambda s: s['expId'] == '123', lambda s: s)
-
-    Note that you don't have to ensure 'expId' is in the stanza, it will
-    silently fail. Let's look at another example, say you want to find all
-    stanza's with an geoSampleAccession that are also fastq's
-        submittedfastqs = rafile.filter(
-            lambda s: 'geoSampleAccession' in s and s['fileName'].endswith('.fastq'),
-            lambda s: s)
-
-    We don't always have to just return the stanza in the second parameter
-    however. If we wanted to, for each stanza, return the file associated
-    with that stanza, we could easily do that as well. This would return a
-    simple list of the string filenames in a ra file:
-        files = rafile.filter(lambda s: 1, lambda s: s['fileName'])
-
-    Note that once again, we don't have to ensure 'fileName' exists. Also note
-    that lambda s: 1 means always return true. Lambda expressions are always
-    preferable to functions unless the expression would need to be reused
-    multiple times. It is also best to reduce the set of stanzas as much as
-    possible before operating over them.
-
-    Filtering allows you to eliminate a lot of code.
-    '''
-
-    @property
-    def filename(self):
-        return self._filename
-    
-    def __init__(self, filePath=None, key=None):
-        OrderedDict.__init__(self)
-        if filePath != None:
-            self.read(filePath, key)
-
-    def read(self, filePath, key=None):
-        '''
-        Reads an rafile stanza by stanza, and internalizes it. Don't override
-        this for derived types, instead override readStanza.
-        '''
-        self._filename = filePath
-        file = open(filePath, 'r')
-
-        #entry = None
-        stanza = list()
-        keyValue = ''
-
-        reading = 1
-        
-        while reading:
-            line = file.readline()
-            if line == '':
-                reading = 0
-        
-            line = line.strip()
-            if len(stanza) == 0 and (line.startswith('#') or (line == '' and reading)):
-                OrderedDict.append(self, line)
-                continue
-
-            if line != '':
-                stanza.append(line)
-            elif len(stanza) > 0:
-                if keyValue == '':
-                    keyValue, name, entry = self.readStanza(stanza, key)
-                else:
-                    testKey, name, entry = self.readStanza(stanza, key)
-                    if entry != None and keyValue != testKey:
-                        raise KeyError('Inconsistent Key ' + testKey)
-
-                if entry != None:
-                    if name != None or key == None:
-                        if name in self:
-                            raise KeyError('Duplicate Key ' + name)
-                        self[name] = entry
-
-                stanza = list()
-
-        file.close()
-
-
-    def readStanza(self, stanza, key=None):
-        '''
-        Override this to create custom stanza behavior in derived types.
-        
-        IN
-        stanza: list of strings with keyval data
-        key: optional key for selective key filtering. Don't worry about it
-
-        OUT
-        namekey: the key of the stanza's name
-        nameval: the value of the stanza's name
-        entry: the stanza itself
-        '''
-        entry = RaStanza()
-        if entry.readStanza(stanza, key) == None:
-            return None, None, None
-        entry = RaStanza()
-        val1, val2 = entry.readStanza(stanza, key)
-        return val1, val2, entry
-
-
-    def write(self, filename):
-        file = open(filename, 'w')
-        file.write(str(self))
-        
-    def iter(self):
-        pass
-
-
-    def iterkeys(self):
-        for item in self._OrderedDict__ordering:
-            if not(item.startswith('#') or item == ''):
-                yield item
-
-
-    def itervalues(self):
-        for item in self._OrderedDict__ordering:
-            if not (item.startswith('#') or item == ''):
-                yield self[item]
-
-
-    def iteritems(self):
-        for item in self._OrderedDict__ordering:
-            if not (item.startswith('#') or item == ''):
-                yield item, self[item]
-            else:
-                yield [item]
-
-
-    def append(self, item):
-        OrderedDict.append(self, item)
-
-
-    def filter(self, where, select):
-        '''
-        select useful data from matching criteria
-
-        where: the conditional function that must be met. Where takes one
-        argument, the stanza and should return true or false
-        select: the data to return. Takes in stanza, should return whatever
-        to be added to the list for that stanza.
-
-        For each stanza, if where(stanza) holds, it will add select(stanza)
-        to the list of returned entities. Also forces silent failure of key
-        errors, so you don't have to check that a value is or is not in the stanza.
-        '''
-
-        ret = list()
-        for stanza in self.itervalues():
-            try:
-                if where(stanza):
-                    ret.append(select(stanza))
-            except KeyError:
-                continue
-        return ret
-
-    def filter2(self, where):
-        '''
-        select useful data from matching criteria
-        Filter2 returns a Ra dictionary. Easier to use but more memory intensive.
-
-        where: the conditional function that must be met. Where takes one
-        argument, the stanza and should return true or false
-        select: the data to return. Takes in stanza, should return whatever
-        to be added to the list for that stanza.
-
-        For each stanza, if where(stanza) holds, it will add select(stanza)
-        to the list of returned entities. Also forces silent failure of key
-        errors, so you don't have to check that a value is or is not in the stanza.
-        '''
-        ret = RaFile()
-        for stanza in self.itervalues():
-            try:
-                if where(stanza):
-                        ret[stanza.name] = stanza
-            except KeyError:
-                continue
-        return ret
-
-    def mergeRa(self, other):
-        '''
-        Input:
-            Two RaFile objects
-        Output:
-            A merged RaFile
-
-        Common stanzas and key-val pairs are collapsed into
-        one with identical values being preserved,
-        differences are marked with a >>> and <<<
-        '''
-
-        mergedKeys = ucscUtils.mergeList(list(self), list(other))
-        selfKeys = set(self)
-        otherKeys = set(other)
-        newCommon = RaFile()
-        p = re.compile('^\s*#')
-        p2 = re.compile('^\s*$')
-        for i in mergedKeys:
-            if p.match(i) or p2.match(i):
-                newCommon.append(i)
-                continue
-            if i not in selfKeys:
-                newCommon[i] = other[i]
-                continue
-            if i not in otherKeys:
-                newCommon[i] = self[i]
-                continue
-            if i in otherKeys and i in selfKeys:
-                newStanza = RaStanza()
-                selfStanzaKeys = set(self[i].iterkeys())
-                otherStanzaKeys = set(other[i].iterkeys())
-                stanzaKeys = ucscUtils.mergeList(list(self[i]), list(other[i]))
-                for j in stanzaKeys:
-                    if p.match(j):
-                        newStanza.append(j)
-                        continue
-                    if j not in selfStanzaKeys:
-                        newStanza[j] = other[i][j]
-                        continue
-                    if j not in otherStanzaKeys:
-                        newStanza[j] = self[i][j]
-                        continue
-                    if j in selfStanzaKeys and j in otherStanzaKeys:
-                        if self[i][j] == other[i][j]:
-                            newStanza[j] = self[i][j]
-                        else:
-                            in_j = '>>>>>%s' % j
-                            out_j = '<<<<<%s' % j
-                            newStanza[out_j] = self[i][j]
-                            newStanza[in_j] = other[i][j]
-                newCommon[i] = newStanza
-        return newCommon
-
-
-    def summaryDiff(self, other):
-        '''
-        Input:
-            RaFile object being compared.
-        Output: RaFile with differences.
-
-        Returns ***partial*** stanzas of ***anything*** different
-        from the self dictionary compared to the other dictionary.
-        For versatility, it only returns stanzas from the self Ra file. In other
-        words, it returns the self dictionary lines that are either not present
-        in or different from the other dictionary.
-
-        To obtain full set of differences, run summaryDiff twice
-        ra1 = this.summaryDiff(that)
-        and
-        ra2 = that.summaryDiff(this)
-        '''
-        this = RaFile()
-        RetThis = RaFile()
-        for stanza in self.itervalues():
-            if stanza.name not in other.keys():
-                RetThis[stanza.name] = stanza
-            else:
-                if stanza.difference(other[stanza.name]):
-                    RetThis[stanza.name] = stanza.difference(other[stanza.name])
-        return RetThis
-
-    def changeSummary(self, otherRa):
-        '''
-        Input:
-            Two RaFile objects
-        Output:
-            Dictionary showing differences between stanzas, list of added and dropeed stanzas
-        '''
-        retDict = collections.defaultdict(list)
-        addList = set(self.iterkeys()) - set(otherRa.iterkeys())
-        dropList = set(otherRa.iterkeys()) - set(self.iterkeys())
-        common = set(self.iterkeys()) & set(otherRa.iterkeys())
-
-        p = re.compile('^\s*#')
-        for stanza in common:
-            if p.match(stanza):
-                continue
-            for key in self[stanza]:
-                if p.match(key):
-                    continue
-                if key in otherRa[stanza]:
-                    if self[stanza][key] != otherRa[stanza][key]:
-                        retDict[stanza].append("Changed %s from %s -> %s" %(key, otherRa[stanza][key], self[stanza][key]))
-                else:
-                    retDict[stanza].append("Added %s -> %s" %(key, self[stanza][key]))
-            for key in otherRa[stanza]:
-                if p.match(key):
-                    continue
-                if key not in self[stanza]:
-                    retDict[stanza].append("Dropped %s -> %s" %(key, otherRa[stanza][key]))
-        return retDict, addList, dropList
-
-    def diffFilter(self, select, other):
-        '''
-        Input:
-            Lambda function of desired comparison term
-            RaFile object being compared.
-        Output: RaFile with differences.
-
-        Filter returns ***full*** stanzas of a ***select function*** from
-        the self dictionary compared to the other dictionary. For
-        versatility, it only returns stanzas from the self Ra file. In other
-        words, it only returns self dictionary stanzas with the function term
-        that are either not found in or different from the other
-        dictionary.
-
-        To obtain full set of differences, run diffFilter twice
-        ra1 = this.diffFilter(select function, that)
-        and
-        ra2 = that.diffFilter(select function, this)
-        '''
-        this = RaFile()
-        RetThis = RaFile()
-        thisSelectDict = dict()
-        thatSelectDict = dict()
-        #Build 2 dict of stanzas to later compare line-by-line
-        for stanza in self.itervalues():
-            try:
-                if select(stanza):
-                    this[stanza.name] = stanza #'this' only records stanzas of the self dict
-                    thisSelectDict[stanza.name] = select(stanza)
-            except KeyError:
-                continue
-        for stanza in other.itervalues():
-            #Exact code as filter2 but kept for clarity.
-            try:
-                if select(stanza):
-                    thatSelectDict[stanza.name] = select(stanza)
-            except KeyError:
-                continue
-        #Compare this and that dict
-        for stanza in this.itervalues():
-            if stanza.name not in thatSelectDict:
-                RetThis[stanza.name] = stanza
-            elif thisSelectDict[stanza.name] != thatSelectDict[stanza.name]:
-                RetThis[stanza.name] = stanza
-        return RetThis
-
-    def updateDiffFilter(self, term, other):
-        '''
-        Replicates updateMetadata.
-        Input:
-            Term
-            Other raFile
-
-        Output:
-            Merged RaFile
-                Stanzas found in 'self' and 'other' that have the 'Term' in 'other'
-                are overwritten (or inserted if not found) into 'self'. 
-                Final merged dictionary is returned.
-        '''
-        ret = self
-        common = set(self.iterkeys()) & set(other.iterkeys())
-        for stanza in common:
-            if term not in self[stanza] and term not in other[stanza]:
-                continue
-            if term in self[stanza] and term not in other[stanza]:
-                    del ret[stanza][term]
-                    continue
-            if term in other[stanza]:
-                #Remake stanza to keep order of terms
-                tempStanza = RaStanza()
-                tempStanza._name = stanza
-                selfKeys = list(self[stanza].iterkeys())
-                otherKeys = list(other[stanza].iterkeys())
-                newOther = list()
-                #filter out keys in other that aren't in self, or the term we're interested in
-                for i in otherKeys:
-                    if not i in selfKeys and i != term:
-                        continue
-                    else:
-                        newOther.append(i)
-                #merge self keylist and filtered other list
-                masterList = ucscUtils.mergeList(newOther, selfKeys)
-                for i in masterList:
-                    if i == term:
-                        tempStanza[i] = other[stanza][i]
-                    else:
-                        tempStanza[i] = self[stanza][i]
-            ret[stanza] = tempStanza
-        return ret
-
-    def printTrackDbFormat(self):
-        '''
-        Converts a .ra file into TrackDb format.
-        Returns a printable string.
-        '''
-        retstring = ""
-        parentTrack = ""
-        tier = 0
-        commentList = []
-        p = re.compile('^.*parent')
-        p2 = re.compile('^.*subTrack')
-        for stanza in self:
-            if stanza == "":
-                if commentList:
-                    for line in commentList:
-                        for i in range(tier):
-                            retstring += "    "
-                        retstring += line + "\n"
-                    commentList = []
-                    retstring += "\n"
-                continue
-            if stanza.startswith("#"):
-                commentList.append(stanza)
-                continue
-            keys = self[stanza].keys()
-            parentKey = "NOKEYFOUND"
-            for key in keys:
-                if p.search(key):
-                    parentKey = key
-                if p2.search(key):
-                    parentKey = key
-            if parentKey in keys:
-                if parentTrack not in self[stanza][parentKey] or parentTrack == "":
-                    parentTrack = self[stanza]['track']
-                    tier = 1
-                else:
-                    tier = 2
-            if commentList:
-                for line in commentList:
-                    for i in range(tier):
-                        retstring += "    "
-                    retstring += line + "\n"
-                commentList = []
-            for line in self[stanza]:
-                for i in range(tier):
-                    retstring += "    "
-                if line.startswith("#"):
-                    retstring += line + "\n"
-                else:
-                    retstring += line + " " + self[stanza][line] + "\n"
-            retstring += "\n"
-        return retstring
-
-    def __str__(self):
-        str = ''
-        for item in self.iteritems():
-            if len(item) == 1:
-                str += item[0].__str__() + '\n'
-            else:
-                str += item[1].__str__() + '\n'
-        return str #.rsplit('\n', 1)[0]
-
-
-class RaStanza(OrderedDict):
-    '''
-    Holds an individual entry in the RaFile.
-    '''
-
-    @property
-    def name(self):
-        return self._name
-
-    def __init__(self):
-        self._name = ''
-        self._nametype = ''
-        OrderedDict.__init__(self)
-
-    def readStanza(self, stanza, key=None):
-        '''
-        Populates this entry from a single stanza. Override this to create
-        custom behavior in derived classes
-        '''
-
-        for line in stanza:
-            self.readLine(line)
-
-        return self.readName(stanza, key)
-
-
-    def readName(self, stanza, key=None):
-        '''
-        Extracts the Stanza's name from the value of the first line of the
-        stanza.
-        '''
-        
-        if key == None:
-            line = stanza[0]
-        else:
-            line = None
-            for s in stanza:
-                if s.split(' ', 1)[0] == key:
-                    line = s
-                    break
-            if line == None:
-                return None
-        
-        if len(line.split(' ', 1)) != 2:
-            raise ValueError()
-
-        names = map(str.strip, line.split(' ', 1))
-        self._nametype = names[0]
-        self._name = names[1]
-        return names
-
-    def readLine(self, line):
-        '''
-        Reads a single line from the stanza, extracting the key-value pair
-        '''
-
-        if line.startswith('#') or line == '':
-            OrderedDict.append(self, line)
-        else:
-            raKey = line.split(' ', 1)[0]
-            raVal = ''
-            if (len(line.split(' ', 1)) == 2):
-                raVal = line.split(' ', 1)[1]
-            #if raKey in self:
-                #raise KeyError(raKey + ' already exists')
-            self[raKey] = raVal
-
-    def difference(self, other):
-        '''
-        Complement function to summaryDiff.
-        Takes in self and a comparison Stanza.
-        Returns new stanza with terms from 'self' that are different from 'other'
-        Like the summaryDiff, to get the other terms, this needs to be run
-        again with self and other switched.
-        '''
-        retRa = RaStanza()
-        retRa._name = self.name
-        for key in other.keys():
-            try:
-                if other[key] != self[key] and not key.startswith('#'):
-                    retRa[key] = self[key]
-            except KeyError:
-                continue
-                #maybe add empty keys
-        return retRa
-
-    def iterkeys(self):
-        for item in self._OrderedDict__ordering:
-            if not (item.startswith('#') or item == ''):
-                yield item
-
-
-    def itervalues(self):
-        for item in self._OrderedDict__ordering:
-            if not (item.startswith('#') or item == ''):
-                yield self[item]
-
-
-    def iteritems(self):
-        for item in self._OrderedDict__ordering:
-            if not (item.startswith('#') or item == ''):
-                yield item, self[item]
-
-
-    def iter(self):
-        iterkeys(self)
-
-        
-    def __str__(self):
-        str = ''
-        for key in self:
-            if key.startswith('#'):
-                str += key + '\n'
-            else:
-                str += key + ' ' + self[key] + '\n'
-
-        return str
-