python/lib/ucscgenomics/ra.py 51c6e6f2842abd1940f2871c7d23ffb59fd75445

51c6e6f2842abd1940f2871c7d23ffb59fd75445
mmaddren
  Mon Sep 19 16:02:14 2011 -0700
added first pass of documentation to cv and ra
diff --git python/lib/ucscgenomics/ra.py python/lib/ucscgenomics/ra.py
index 13b2109..9abc7dc 100644
--- python/lib/ucscgenomics/ra.py
+++ python/lib/ucscgenomics/ra.py
@@ -1,22 +1,86 @@
 import sys
 import re
 from ucscgenomics.ordereddict import OrderedDict
 
 class RaFile(OrderedDict):
 	"""
-	Stores an Ra file in a set of entries, one for each stanza in the file.
+	Stores a Ra file in a set of entries, one for each stanza in the file.
+	
+	To make a RaFile, it is usually easiest to just pass its path:
+		rafile = ra.RaFile('kent/src/hg/.../wgEncodeSomeRaFile.ra')
+		
+	The data is read in and organized as a collection of stanzas. Ra files
+	store the stanza by its name, so to access a specific stanza, say:
+		somestanza = rafile['wgEncodeSomeStanzaName']
+		
+	Once you have a stanza, you may want specific data about that stanza. 
+	Stanzas are, as ra files, organized as a collection of terms. Therefore
+	to get the description of the stanza, we can say:
+		somedescription = somestanza['description']
+		
+	You can also access a stanza's name from the stanza itself, since by the
+	nature of ra files, the first key is its name. Therefore for most ra
+	files the following holds true:
+		somestanza.name == somestanza['metaObject'] == 'wgEncodeSomeStanzaName'
+		
+	Although the above is useful if you want one thing, it's usually more
+	helpful to be able to loop and query on the stanza. To add a term named
+	'foobar' to every stanza in a ra file:
+		for stanza in rafile.values():
+			stanza['foobar'] = 'some value'
+	
+	Note that the loop above iterates over values. It can also be useful to
+	iterate over the ra file's keys:
+		for key in rafile.keys():
+			print key
+	
+	Note that ra files are order preserving. Added entries are appended to the
+	end of the file.
+	
+	Most of the time you don't want to do something with all stanzas though,
+	instead you want to filter them. The included filter method allows you to
+	specify two functions (or lambda expressions). The first is the 'where'
+	predicate, which must take in one stanza, and return true/false depending
+	on whether you want to take that stanza. The second is the 'select'
+	predicate, which takes in the stanza, and returns some subset or superset
+	of the stanza as a list. Using filter is preferable to for loops where
+	there are no side effects, or to filter data before iterating over it as
+	opposed to using if statements in the loop. To get all stanzas with one 
+	experiment ID for instance, we would do something like this:
+		stanzas = rafile.filter(lambda s: s['expId'] == '123', lambda s: s)
+		
+	Note that you don't have to ensure 'expId' is in the stanza, it will
+	silently fail. Let's look at another example: say you want to find all
+	stanzas with a geoSampleAccession that are also fastqs:
+		submittedfastqs = rafile.filter(
+			lambda s: 'geoSampleAccession' in s and s['fileName'].endswith('.fastq'),
+			lambda s: s)
+			
+	We don't always have to just return the stanza in the second parameter
+	however. If we wanted to, for each stanza, return the file associated
+	with that stanza, we could easily do that as well. This would return a
+	simple list of the string filenames in a ra file:
+		files = rafile.filter(lambda s: 1, lambda s: s['fileName'])
+		
+	Note that once again, we don't have to ensure 'fileName' exists. Also note
+	that lambda s: 1 means always return true. Lambda expressions are always
+	preferable to functions unless the expression would need to be reused
+	multiple times. It is also best to reduce the set of stanzas as much as
+	possible before operating over them.
+	
+	Filtering allows you to eliminate a lot of code. 
 	"""
 
 	def __init__(self, filePath=None):
 		"""
 		Creates an empty RaFile. If filePath is given, the file at that path
 		is read in immediately via read().
 		"""
 		OrderedDict.__init__(self)
 		# None is a singleton, so test identity rather than equality.
 		if filePath is not None:
 			self.read(filePath)
 
 	def read(self, filePath):
 		"""
 		Reads an rafile stanza by stanza, and internalizes it.
 		"""
 
 		file = open(filePath, 'r')
 
 		#entry = None