51c6e6f2842abd1940f2871c7d23ffb59fd75445 mmaddren Mon Sep 19 16:02:14 2011 -0700 added first pass of documentation to cv and ra diff --git python/lib/ucscgenomics/ra.py python/lib/ucscgenomics/ra.py index 13b2109..9abc7dc 100644 --- python/lib/ucscgenomics/ra.py +++ python/lib/ucscgenomics/ra.py @@ -1,22 +1,86 @@ import sys import re from ucscgenomics.ordereddict import OrderedDict class RaFile(OrderedDict): """ - Stores an Ra file in a set of entries, one for each stanza in the file. + Stores a Ra file in a set of entries, one for each stanza in the file. + + To make a RaFile, it is usually easiest to just pass its path: + rafile = ra.RaFile('kent/src/hg/.../wgEncodeSomeRaFile.ra') + + The data is read in and organized as a collection of stanzas. Ra files + store the stanza by its name, so to access a specific stanza, say: + somestanza = rafile['wgEncodeSomeStanzaName'] + + Once you have a stanza, you may want specific data about that stanza. + Stanzas are, like ra files, organized as a collection of terms. Therefore + to get the description of the stanza, we can say: + somedescription = somestanza['description'] + + You can also access a stanza's name from the stanza itself, since by the + nature of ra files, the first key is its name. Therefore for most ra + files the following holds true: + somestanza.name == somestanza['metaObject'] == 'wgEncodeSomeStanzaName' + + Although the above is useful if you want one thing, it's usually more + helpful to be able to loop and query on the stanza. To add a term named + 'foobar' to every stanza in a ra file: + for stanza in rafile.values(): + stanza['foobar'] = 'some value' + + Note that I iterated over values. It can also be useful to iterate over + the ra file's keys (the stanza names): + for key in rafile.keys(): + print key + + Note that ra files are order preserving. Added entries are appended to the + end of the file. + + Most of the time you don't want to do something with all stanzas though, + instead you want to filter them. 
The included filter method allows you to + specify two functions (or lambda expressions). The first is the 'where' + predicate, which must take in one stanza, and return true/false depending + on whether you want to take that stanza. The second is the 'select' + predicate, which takes in the stanza, and returns some subset or superset + of the stanza as a list. Using filter is preferable to for loops where + there are no side effects, or to filter data before iterating over it as + opposed to using if statements in the loop. To get all stanzas with one + experiment ID for instance, we would do something like this: + stanzas = rafile.filter(lambda s: s['expId'] == '123', lambda s: s) + + Note that you don't have to ensure 'expId' is in the stanza; it will + silently fail. Let's look at another example, say you want to find all + stanzas with a geoSampleAccession that are also fastqs: + submittedfastqs = rafile.filter( + lambda s: 'geoSampleAccession' in s and s['fileName'].endswith('.fastq'), + lambda s: s) + + We don't always have to just return the stanza in the second parameter + however. If we wanted to, for each stanza, return the file associated + with that stanza, we could easily do that as well. This would return a + simple list of the string filenames in a ra file: + files = rafile.filter(lambda s: 1, lambda s: s['fileName']) + + Note that once again, we don't have to ensure 'fileName' exists. Also note + that lambda s: 1 means always return true. Lambda expressions are always + preferable to functions unless the expression would need to be reused + multiple times. It is also best to reduce the set of stanzas as much as + possible before operating over them. + + Filtering allows you to eliminate a lot of code. """ def __init__(self, filePath=None): OrderedDict.__init__(self) if filePath != None: self.read(filePath) def read(self, filePath): """ Reads an rafile stanza by stanza, and internalizes it. """ file = open(filePath, 'r') #entry = None