a44e3254174152b2d4b3f241d1935654cd139181 vsmalladi Tue May 8 10:11:21 2012 -0700 Renamed library from ucscgenomics to ucscGb. Redmine #7029. diff --git python/lib/ucscGb/ra.py python/lib/ucscGb/ra.py new file mode 100644 index 0000000..79b99e6 --- /dev/null +++ python/lib/ucscGb/ra.py @@ -0,0 +1,613 @@ +import sys, string +import re +from ucscgenomics.ordereddict import OrderedDict +from ucscgenomics import ucscUtils +import collections + +class RaFile(OrderedDict): + ''' + Stores a Ra file in a set of entries, one for each stanza in the file. + + To make a RaFile, it is usually easiest to just pass it's path: + rafile = ra.RaFile('kent/src/hg/.../wgEncodeSomeRaFile.ra') + + The data is read in and organized as a collection of stanzas. Ra files + store the stanza by it's name, so to access a specific stanza, say: + somestanza = rafile['wgEncodeSomeStanzaName'] + + Once you have a stanza, you may want specific data about that stanza. + Stanzas are, as ra files, organized as a collection of terms. Therefore + to get the description of the stanza, we can say: + somedescription = somestanza['description'] + + You can also access a stanza's name from the stanza itself, since by the + nature of ra files, the first key is it's name. Therefore for most ra + files the following holds true: + somestanza.name = somestanza['metaObject'] = 'wgEncodeSomeStanzaName' + + Although the above is useful if you want one thing, it's usually more + helpful to be able to loop and query on the stanza. To add a term named + 'foobar' to every stanza in a ra file: + for stanza in rafile.values(): + stanza['foobar'] = 'some value' + + Note that I iterated over values. It can also be useful to iterate over + a stanza's keys: + for key in rafile.keys(): + print key + + Note that ra files are order preserving. Added entries are appended to the + end of the file. This allows you to print out a ra file easily: + print rafile + + Most of the time you don't want to do something with all stanzas though, + instead you want to filter them. The included filter method allows you to + specify two functions (or lambda expressions). The first is the 'where' + predicate, which must take in one stanza, and return true/false depending + on whether you want to take that stanza. The second is the 'select' + predicate, which takes in the stanza, and returns some subset or superset + of the stanza as a list. Using filter is preferable to for loops where + there are no side effects, or to filter data before iterating over it as + opposed to using if statements in the loop. To get all stanzas with one + experiment ID for instance, we would do something like this: + stanzas = rafile.filter(lambda s: s['expId'] == '123', lambda s: s) + + Note that you don't have to ensure 'expId' is in the stanza, it will + silently fail. Let's look at another example, say you want to find all + stanza's with an geoSampleAccession that are also fastq's + submittedfastqs = rafile.filter( + lambda s: 'geoSampleAccession' in s and s['fileName'].endswith('.fastq'), + lambda s: s) + + We don't always have to just return the stanza in the second parameter + however. If we wanted to, for each stanza, return the file associated + with that stanza, we could easily do that as well. This would return a + simple list of the string filenames in a ra file: + files = rafile.filter(lambda s: 1, lambda s: s['fileName']) + + Note that once again, we don't have to ensure 'fileName' exists. Also note + that lambda s: 1 means always return true. Lambda expressions are always + preferable to functions unless the expression would need to be reused + multiple times. It is also best to reduce the set of stanzas as much as + possible before operating over them. + + Filtering allows you to eliminate a lot of code. + ''' + + @property + def filename(self): + return self._filename + + def __init__(self, filePath=None, key=None): + OrderedDict.__init__(self) + if filePath != None: + self.read(filePath, key) + + def read(self, filePath, key=None): + ''' + Reads an rafile stanza by stanza, and internalizes it. Don't override + this for derived types, instead override readStanza. + ''' + self._filename = filePath + file = open(filePath, 'r') + + #entry = None + stanza = list() + keyValue = '' + + reading = 1 + + while reading: + line = file.readline() + if line == '': + reading = 0 + + line = line.strip() + if len(stanza) == 0 and (line.startswith('#') or (line == '' and reading)): + OrderedDict.append(self, line) + continue + + if line != '': + stanza.append(line) + elif len(stanza) > 0: + if keyValue == '': + keyValue, name, entry = self.readStanza(stanza, key) + else: + testKey, name, entry = self.readStanza(stanza, key) + if entry != None and keyValue != testKey: + raise KeyError('Inconsistent Key ' + testKey) + + if entry != None: + if name != None or key == None: + if name in self: + raise KeyError('Duplicate Key ' + name) + self[name] = entry + + stanza = list() + + file.close() + + + def readStanza(self, stanza, key=None): + ''' + Override this to create custom stanza behavior in derived types. + + IN + stanza: list of strings with keyval data + key: optional key for selective key filtering. Don't worry about it + + OUT + namekey: the key of the stanza's name + nameval: the value of the stanza's name + entry: the stanza itself + ''' + entry = RaStanza() + if entry.readStanza(stanza, key) == None: + return None, None, None + entry = RaStanza() + val1, val2 = entry.readStanza(stanza, key) + return val1, val2, entry + + + def write(self, filename): + file = open(filename, 'w') + file.write(str(self)) + + def iter(self): + pass + + + def iterkeys(self): + for item in self._OrderedDict__ordering: + if not(item.startswith('#') or item == ''): + yield item + + + def itervalues(self): + for item in self._OrderedDict__ordering: + if not (item.startswith('#') or item == ''): + yield self[item] + + + def iteritems(self): + for item in self._OrderedDict__ordering: + if not (item.startswith('#') or item == ''): + yield item, self[item] + else: + yield [item] + + + def append(self, item): + OrderedDict.append(self, item) + + + def filter(self, where, select): + ''' + select useful data from matching criteria + + where: the conditional function that must be met. Where takes one + argument, the stanza and should return true or false + select: the data to return. Takes in stanza, should return whatever + to be added to the list for that stanza. + + For each stanza, if where(stanza) holds, it will add select(stanza) + to the list of returned entities. Also forces silent failure of key + errors, so you don't have to check that a value is or is not in the stanza. + ''' + + ret = list() + for stanza in self.itervalues(): + try: + if where(stanza): + ret.append(select(stanza)) + except KeyError: + continue + return ret + + def filter2(self, where): + ''' + select useful data from matching criteria + Filter2 returns a Ra dictionary. Easier to use but more memory intensive. + + where: the conditional function that must be met. Where takes one + argument, the stanza and should return true or false + select: the data to return. Takes in stanza, should return whatever + to be added to the list for that stanza. + + For each stanza, if where(stanza) holds, it will add select(stanza) + to the list of returned entities. Also forces silent failure of key + errors, so you don't have to check that a value is or is not in the stanza. + ''' + ret = RaFile() + for stanza in self.itervalues(): + try: + if where(stanza): + ret[stanza.name] = stanza + except KeyError: + continue + return ret + + def mergeRa(self, other): + ''' + Input: + Two RaFile objects + Output: + A merged RaFile + + Common stanzas and key-val pairs are collapsed into + one with identical values being preserved, + differences are marked with a >>> and <<< + ''' + + mergedKeys = ucscUtils.mergeList(list(self), list(other)) + selfKeys = set(self) + otherKeys = set(other) + newCommon = RaFile() + p = re.compile('^\s*#') + p2 = re.compile('^\s*$') + for i in mergedKeys: + if p.match(i) or p2.match(i): + newCommon.append(i) + continue + if i not in selfKeys: + newCommon[i] = other[i] + continue + if i not in otherKeys: + newCommon[i] = self[i] + continue + if i in otherKeys and i in selfKeys: + newStanza = RaStanza() + selfStanzaKeys = set(self[i].iterkeys()) + otherStanzaKeys = set(other[i].iterkeys()) + stanzaKeys = ucscUtils.mergeList(list(self[i]), list(other[i])) + for j in stanzaKeys: + if p.match(j): + newStanza.append(j) + continue + if j not in selfStanzaKeys: + newStanza[j] = other[i][j] + continue + if j not in otherStanzaKeys: + newStanza[j] = self[i][j] + continue + if j in selfStanzaKeys and j in otherStanzaKeys: + if self[i][j] == other[i][j]: + newStanza[j] = self[i][j] + else: + in_j = '>>>>>%s' % j + out_j = '<<<<<%s' % j + newStanza[out_j] = self[i][j] + newStanza[in_j] = other[i][j] + newCommon[i] = newStanza + return newCommon + + + def summaryDiff(self, other): + ''' + Input: + RaFile object being compared. + Output: RaFile with differences. + + Returns ***partial*** stanzas of ***anything*** different + from the self dictionary compared to the other dictionary. + For versatility, it only returns stanzas from the self Ra file. In other + words, it returns the self dictionary lines that are either not present + in or different from the other dictionary. + + To obtain full set of differences, run summaryDiff twice + ra1 = this.summaryDiff(that) + and + ra2 = that.summaryDiff(this) + ''' + this = RaFile() + RetThis = RaFile() + for stanza in self.itervalues(): + if stanza.name not in other.keys(): + RetThis[stanza.name] = stanza + else: + if stanza.difference(other[stanza.name]): + RetThis[stanza.name] = stanza.difference(other[stanza.name]) + return RetThis + + def changeSummary(self, otherRa): + ''' + Input: + Two RaFile objects + Output: + Dictionary showing differences between stanzas, list of added and dropeed stanzas + ''' + retDict = collections.defaultdict(list) + addList = set(self.iterkeys()) - set(otherRa.iterkeys()) + dropList = set(otherRa.iterkeys()) - set(self.iterkeys()) + common = set(self.iterkeys()) & set(otherRa.iterkeys()) + + p = re.compile('^\s*#') + for stanza in common: + if p.match(stanza): + continue + for key in self[stanza]: + if p.match(key): + continue + if key in otherRa[stanza]: + if self[stanza][key] != otherRa[stanza][key]: + retDict[stanza].append("Changed %s from %s -> %s" %(key, otherRa[stanza][key], self[stanza][key])) + else: + retDict[stanza].append("Added %s -> %s" %(key, self[stanza][key])) + for key in otherRa[stanza]: + if p.match(key): + continue + if key not in self[stanza]: + retDict[stanza].append("Dropped %s -> %s" %(key, otherRa[stanza][key])) + return retDict, addList, dropList + + def diffFilter(self, select, other): + ''' + Input: + Lambda function of desired comparison term + RaFile object being compared. + Output: RaFile with differences. + + Filter returns ***full*** stanzas of a ***select function*** from + the self dictionary compared to the other dictionary. For + versatility, it only returns stanzas from the self Ra file. In other + words, it only returns self dictionary stanzas with the function term + that are either not found in or different from the other + dictionary. + + To obtain full set of differences, run diffFilter twice + ra1 = this.diffFilter(select function, that) + and + ra2 = that.diffFilter(select function, this) + ''' + this = RaFile() + RetThis = RaFile() + thisSelectDict = dict() + thatSelectDict = dict() + #Build 2 dict of stanzas to later compare line-by-line + for stanza in self.itervalues(): + try: + if select(stanza): + this[stanza.name] = stanza #'this' only records stanzas of the self dict + thisSelectDict[stanza.name] = select(stanza) + except KeyError: + continue + for stanza in other.itervalues(): + #Exact code as filter2 but kept for clarity. + try: + if select(stanza): + thatSelectDict[stanza.name] = select(stanza) + except KeyError: + continue + #Compare this and that dict + for stanza in this.itervalues(): + if stanza.name not in thatSelectDict: + RetThis[stanza.name] = stanza + elif thisSelectDict[stanza.name] != thatSelectDict[stanza.name]: + RetThis[stanza.name] = stanza + return RetThis + + def updateDiffFilter(self, term, other): + ''' + Replicates updateMetadata. + Input: + Term + Other raFile + + Output: + Merged RaFile + Stanzas found in 'self' and 'other' that have the 'Term' in 'other' + are overwritten (or inserted if not found) into 'self'. + Final merged dictionary is returned. + ''' + ret = self + common = set(self.iterkeys()) & set(other.iterkeys()) + for stanza in common: + if term not in self[stanza] and term not in other[stanza]: + continue + if term in self[stanza] and term not in other[stanza]: + del ret[stanza][term] + continue + if term in other[stanza]: + #Remake stanza to keep order of terms + tempStanza = RaStanza() + tempStanza._name = stanza + selfKeys = list(self[stanza].iterkeys()) + otherKeys = list(other[stanza].iterkeys()) + newOther = list() + #filter out keys in other that aren't in self, or the term we're interested in + for i in otherKeys: + if not i in selfKeys and i != term: + continue + else: + newOther.append(i) + #merge self keylist and filtered other list + masterList = ucscUtils.mergeList(newOther, selfKeys) + for i in masterList: + if i == term: + tempStanza[i] = other[stanza][i] + else: + tempStanza[i] = self[stanza][i] + ret[stanza] = tempStanza + return ret + + def printTrackDbFormat(self): + ''' + Converts a .ra file into TrackDb format. + Returns a printable string. + ''' + retstring = "" + parentTrack = "" + tier = 0 + commentList = [] + p = re.compile('^.*parent') + p2 = re.compile('^.*subTrack') + for stanza in self: + if stanza == "": + if commentList: + for line in commentList: + for i in range(tier): + retstring += " " + retstring += line + "\n" + commentList = [] + retstring += "\n" + continue + if stanza.startswith("#"): + commentList.append(stanza) + continue + keys = self[stanza].keys() + parentKey = "NOKEYFOUND" + for key in keys: + if p.search(key): + parentKey = key + if p2.search(key): + parentKey = key + if parentKey in keys: + if parentTrack not in self[stanza][parentKey] or parentTrack == "": + parentTrack = self[stanza]['track'] + tier = 1 + else: + tier = 2 + if commentList: + for line in commentList: + for i in range(tier): + retstring += " " + retstring += line + "\n" + commentList = [] + for line in self[stanza]: + for i in range(tier): + retstring += " " + if line.startswith("#"): + retstring += line + "\n" + else: + retstring += line + " " + self[stanza][line] + "\n" + retstring += "\n" + return retstring + + def __str__(self): + str = '' + for item in self.iteritems(): + if len(item) == 1: + str += item[0].__str__() + '\n' + else: + str += item[1].__str__() + '\n' + return str #.rsplit('\n', 1)[0] + + +class RaStanza(OrderedDict): + ''' + Holds an individual entry in the RaFile. + ''' + + @property + def name(self): + return self._name + + def __init__(self): + self._name = '' + self._nametype = '' + OrderedDict.__init__(self) + + def readStanza(self, stanza, key=None): + ''' + Populates this entry from a single stanza. Override this to create + custom behavior in derived classes + ''' + + for line in stanza: + self.readLine(line) + + return self.readName(stanza, key) + + + def readName(self, stanza, key=None): + ''' + Extracts the Stanza's name from the value of the first line of the + stanza. + ''' + + if key == None: + line = stanza[0] + else: + line = None + for s in stanza: + if s.split(' ', 1)[0] == key: + line = s + break + if line == None: + return None + + if len(line.split(' ', 1)) != 2: + raise ValueError() + + names = map(str.strip, line.split(' ', 1)) + self._nametype = names[0] + self._name = names[1] + return names + + def readLine(self, line): + ''' + Reads a single line from the stanza, extracting the key-value pair + ''' + + if line.startswith('#') or line == '': + OrderedDict.append(self, line) + else: + raKey = line.split(' ', 1)[0] + raVal = '' + if (len(line.split(' ', 1)) == 2): + raVal = line.split(' ', 1)[1] + #if raKey in self: + #raise KeyError(raKey + ' already exists') + self[raKey] = raVal + + def difference(self, other): + ''' + Complement function to summaryDiff. + Takes in self and a comparison Stanza. + Returns new stanza with terms from 'self' that are different from 'other' + Like the summaryDiff, to get the other terms, this needs to be run + again with self and other switched. + ''' + retRa = RaStanza() + retRa._name = self.name + for key in other.keys(): + try: + if other[key] != self[key] and not key.startswith('#'): + retRa[key] = self[key] + except KeyError: + continue + #maybe add empty keys + return retRa + + def iterkeys(self): + for item in self._OrderedDict__ordering: + if not (item.startswith('#') or item == ''): + yield item + + + def itervalues(self): + for item in self._OrderedDict__ordering: + if not (item.startswith('#') or item == ''): + yield self[item] + + + def iteritems(self): + for item in self._OrderedDict__ordering: + if not (item.startswith('#') or item == ''): + yield item, self[item] + + + def iter(self): + iterkeys(self) + + + def __str__(self): + str = '' + for key in self: + if key.startswith('#'): + str += key + '\n' + else: + str += key + ' ' + self[key] + '\n' + + return str +