python/lib/ucscgenomics/cv.py 64f127cb71252486ce096a832ac7fad3deb324a5

64f127cb71252486ce096a832ac7fad3deb324a5
mmaddren
  Wed Sep 14 17:04:53 2011 -0700
large-scale renaming change to allow python to be built into cluster/bin, also mkGeoPkg now renames files
diff --git python/lib/ucscgenomics/cv.py python/lib/ucscgenomics/cv.py
new file mode 100644
index 0000000..307d51c
--- /dev/null
+++ python/lib/ucscgenomics/cv.py
@@ -0,0 +1,507 @@
+import re
+import os
+from ucscgenomics import ra
+
+class CvFile(ra.RaFile):
+	"""cv.ra representation. Mainly adds CV-specific validation to the RaFile"""
+
+	def __init__(self, filePath=None, handler=None, protocolPath=None):
+		"""sets up exception handling method, and optionally reads from a file"""
+		ra.RaFile.__init__(self)
+		
+		self.handler = handler
+		if handler == None:
+			self.handler = self.raiseException
+			
+		if filePath == None:
+			filePath = os.path.expanduser('~/kent/src/hg/makeDb/trackDb/') + 'cv/alpha/cv.ra'
+		
+		self.protocolPath = protocolPath
+		if protocolPath == None:
+			self.protocolPath == os.path.expanduser('~/htdocsExtras/ENCODE/')
+		
+		self.read(filePath)
+
+	def raiseException(self, exception):
+		"""wrapper function for raising exception"""
+		raise exception
+
+	def readStanza(self, stanza):
+		"""overriden method from RaFile which makes specialized stanzas based on type"""
+		e = ra.RaStanza()
+		ek, ev = e.readStanza(stanza)
+		type = e['type']
+
+		if type == 'Antibody':
+			entry = AntibodyStanza()
+		elif type == 'Cell Line':
+			if e['organism'] == 'human':
+				entry = CellLineStanza()
+			elif e['organism'] == 'mouse':
+				entry = MouseStanza()
+			else:
+				self.handler(NonmatchKeyError(e.name, e['organism'], 'organism'))
+				return ek, ev, None
+		elif type == 'age':
+			entry = AgeStanza()
+		elif type == 'dataType':
+			entry = DataTypeStanza()
+		elif type == 'lab':
+			entry = LabStanza()
+		elif type == 'seqPlatform':
+			entry = SeqPlatformStanza()
+		elif type == 'typeOfTerm':
+			entry = TypeOfTermStanza()
+		elif type == 'view':
+			entry = ViewStanza()
+		elif type == 'localization':
+			entry = LocalizationStanza()
+		elif type == 'rnaExtract':
+			entry = RnaExtractStanza()
+		elif type == 'treatment':
+			entry = TreatmentStanza()
+		elif type == 'grant':
+			entry = GrantStanza()
+		else:
+			entry = CvStanza()
+
+		key, val = entry.readStanza(stanza)
+		return key, val, entry
+
+
+	def validate(self):
+		"""base validation method which calls all stanzas' validate"""
+		for stanza in self.itervalues():
+				stanza.validate(self)
+
+				
+class CvStanza(ra.RaStanza):
+	"""base class for a single stanza in the cv, which adds validation"""
+	
+	def __init__(self):
+		ra.RaStanza.__init__(self)
+
+	def readStanza(self, stanza):
+		"""
+		Populates this entry from a single stanza
+		"""
+		
+		for line in stanza:
+			self.readLine(line)
+
+		return self.readName(stanza[0])
+		
+	def readName(self, line):
+		"""
+		Extracts the Stanza's name from the value of the first line of the
+		stanza.
+		"""
+
+		if len(line.split(' ', 1)) != 2:
+			raise ValueError()
+
+		names = map(str.strip, line.split(' ', 1))
+		self._name = names[1]
+		return names
+		
+	def readLine(self, line):
+		"""
+		Reads a single line from the stanza, extracting the key-value pair
+		""" 
+
+		if line.startswith('#') or line == '':
+			self.append(line)
+		else:
+			raKey = line.split(' ', 1)[0]
+			raVal = ''
+			if (len(line.split(' ', 1)) == 2):
+				raVal = line.split(' ', 1)[1]
+				
+			if raKey in self:
+				count = 0
+				while raKey + '__$$' + str(count) in self:
+					count = count + 1
+					
+				self[raKey + '__$$' + str(count)] = raVal
+				
+			else:
+				self[raKey] = raVal
+		
+	def validate(self, ra, necessary=None, optional=None):
+		"""default validation for a generic cv stanza. Should be called with all arguments if overidden"""
+		
+		if necessary == None:
+			necessary = set()
+			
+		if optional == None:
+			optional = set()
+		
+		baseNecessary = {'term', 'tag', 'type'}
+		
+		if self['type'] != 'Antibody':
+			baseNecessary.add('description')
+		
+		baseOptional = {'deprecated'}
+		self.checkMandatory(ra, necessary | baseNecessary)
+		self.checkExtraneous(ra, necessary | baseNecessary | optional | baseOptional)
+		
+		if self['type'] != 'Cell Line': # cv, you disgust me with your inconsistencies
+			if len(ra.filter(lambda s: s['term'] == self['type'] and s['type'] == 'typeOfTerm', lambda s: s)) == 0:
+				ra.handler(InvalidTypeError(self, self['type']))
+
+		self.checkDuplicates(ra)
+		
+		
+	def checkDuplicates(self, ra):
+		"""ensure that all keys are present and not blank in the stanza"""
+		for key in self.iterkeys():
+			if '__$$' in key:
+				newkey = key.split('__$$', 1)[0]
+				ra.handler(DuplicateKeyError(self, newkey))
+		
+	def checkMandatory(self, ra, keys):
+		"""ensure that all keys are present and not blank in the stanza"""
+		for key in keys:
+			if not key in self.keys():
+				ra.handler(MissingKeyError(self, key))
+			elif self[key] == '':
+				ra.handler(BlankKeyError(self, key))
+				
+	# def checkOptional(self, ra, keys):
+		# """ensure that all keys are present and not blank in the stanza"""
+		# for key in keys:
+			# if key in self and self[key] == '':
+				# ra.handler(BlankKeyError(self, key))
+		
+	def checkExtraneous(self, ra, keys):
+		"""check for keys that are not in the list of keys"""
+		for key in self.iterkeys():
+			if key not in keys and '__$$' not in key:
+				ra.handler(ExtraKeyError(self, key))
+	
+	def checkFullRelational(self, ra, key, other, type):
+		"""check that the value at key matches the value of another
+		stanza's value at other, where the stanza type is specified by type"""
+		
+		p = 0
+		if key not in self:
+			return
+		
+		for entry in ra.itervalues():
+			if 'type' in entry and other in entry:
+				if entry['type'] == type and self[key] == entry[other]:
+					p = 1
+					break
+		if p == 0:
+			ra.handler(NonmatchKeyError(self, key, other))
+	
+	def checkRelational(self, ra, key, other):
+		"""check that the value at key matches the value at other"""
+		p = 0
+		
+		if key not in self:
+			return
+		
+		for entry in ra.itervalues():
+			if 'type' in entry and other in entry:
+				if entry['type'] == key and self[key] == entry[other]:
+					p = 1
+					break
+		if p == 0:
+			ra.handler(NonmatchKeyError(self, key, other))
+			
+	def checkListRelational(self, ra, key, other):
+		"""check that the value at key matches the value at other"""
+		
+		if key not in self:
+			return
+		
+		for val in self[key].split(','):
+			val = val.strip()
+			p = 0
+		
+			for entry in ra.itervalues():
+				if 'type' in entry and other in entry:
+
+					if entry['type'] == key and val == entry[other]:
+						p = 1
+						break
+			if p == 0:
+				ra.handler(NonmatchKeyError(self, key, other))
+
+	def checkProtocols(self, ra, path):
+		if 'protocol' in self:
+			protocols = self['protocol'].split()
+			for protocol in protocols:
+				if ':' not in protocol:
+					ra.handler(InvalidProtocolError(self, protocol))
+				else:
+					p = protocol.split(':', 1)[1]
+					if not os.path.isfile(ra.protocolPath + path + p):
+						ra.handler(InvalidProtocolError(self, protocol))
+				
+class CvError(Exception):
+	"""base error class for the cv."""
+	def __init__(self, stanza):
+		self.stanza = stanza
+		self.msg = ''
+		
+	def __str__(self):
+		return str('%s[%s] %s: %s' % (self.stanza.name, self.stanza['type'], self.__class__.__name__, self.msg))
+		
+class MissingKeyError(CvError):
+	"""raised if a mandatory key is missing"""
+	
+	def __init__(self, stanza, key):
+		CvError.__init__(self, stanza)
+		self.msg =  key
+	
+	# def __str__(self):
+		# return str('%s(%s[%s])' % self.__class__.__name__ self.stanza + ': missing key (' + self.key + ')')
+	
+	
+class DuplicateKeyError(CvError):
+	"""raised if a key is duplicated"""
+	
+	def __init__(self, stanza, key):
+		CvError.__init__(self, stanza)
+		self.msg = key
+	
+	# def __str__(self):
+		# return str(self.stanza + ': duplicate key (' + self.key + ')')
+	
+	
+class BlankKeyError(CvError):
+	"""raised if a mandatory key is blank"""
+	
+	def __init__(self, stanza, key):
+		CvError.__init__(self, stanza)
+		self.msg = key
+	
+	# def __str__(self):
+		# return str(self.stanza + ': key (' + self.key + ') is blank')
+	
+	
+class ExtraKeyError(CvError):
+	"""raised if an extra key not in the list of keys is found"""
+
+	def __init__(self, stanza, key):
+		CvError.__init__(self, stanza)
+		self.msg = key
+	
+	# def __str__(self):
+		# return str(self.stanza + ': extra key (' + self.key + ')')	
+
+		
+class NonmatchKeyError(CvError):
+	"""raised if a relational key does not match any other value"""
+	
+	def __init__(self, stanza, key, val):
+		CvError.__init__(self, stanza)
+		self.msg = '%s does not match %s' % (key, val)
+	
+	# def __str__(self):
+		# return str(self.stanza + ': key (' + self.key + ') does not match any (' + self.val + ')')
+		
+		
+class DuplicateVendorIdError(CvError):
+	"""When there exists more than one connected component of stanzas (through derivedFrom) with the same vendorId"""
+	
+	def __init__(self, stanza):
+		CvError.__init__(self, stanza)
+		self.msg = '%s' % self.stanza['vendorId']
+		
+	# def __str__(self):
+		# return str('warning: ' + self.stanza.name + ': vendorId (' + self.stanza['vendorId'] + ') has multiple parent cell lines')
+		
+		
+class InvalidProtocolError(CvError):
+	"""raised if a protocol doesnt match anything in the directory"""
+	
+	def __init__(self, stanza, key):
+		CvError.__init__(self, stanza)
+		self.msg = key
+	
+	# def __str__(self):
+		# return str(self.stanza.name + ': missing protocol document (' + self.key + ')')	
+	
+		
+class InvalidTypeError(CvError):
+	"""raised if a relational key does not match any other value"""
+	
+	def __init__(self, stanza, key):
+		CvError.__init__(self, stanza)
+		self.msg = key
+	
+	# def __str__(self):
+		# return str(self.stanza + ': ' + self.key + ' does not match any types')
+		
+
+class LabStanza(CvStanza):
+	
+	def __init__(self):
+		CvStanza.__init__(self)
+
+	def validate(self, ra):
+		necessary = {'organism', 'labPi'}
+		optional = {'label', 'labInst', 'labPiFull', 'grantPi'}
+		CvStanza.validate(self, ra, necessary, optional)
+
+		self.checkRelational(ra, 'organism', 'term')
+		
+
+class AgeStanza(CvStanza):
+	
+	def __init__(self):
+		CvStanza.__init__(self)
+
+	def validate(self, ra):
+		necessary = {'stage'}
+		CvStanza.validate(self, ra, necessary)
+
+
+class DataTypeStanza(CvStanza):
+	
+	def __init__(self):
+		CvStanza.__init__(self)
+
+	def validate(self, ra):
+		necessary = {'label'}
+		CvStanza.validate(self, ra, necessary)
+
+
+class CellLineStanza(CvStanza):
+
+	def __init__(self):
+		CvStanza.__init__(self)
+
+	def validate(self, ra):
+		necessary = {'organism', 'vendorName', 'orderUrl', 'sex', 'tier'}
+		optional = {'tissue', 'vendorId', 'karyotype', 'lineage', 'termId', 'termUrl', 'color', 'protocol', 'category', 'lots', 'derivedFrom', 'lab'}
+		CvStanza.validate(self, ra, necessary, optional)
+
+		self.checkRelational(ra, 'organism', 'term')
+		self.checkRelational(ra, 'sex', 'term')
+		self.checkRelational(ra, 'category', 'term')
+		self.checkRelational(ra, 'tier', 'term')
+		self.checkListRelational(ra, 'lab', 'labPi')
+		
+		# ensure the derivedFrom matches a valid cell line
+		if 'derivedFrom' in self and len(ra.filter(lambda s: s['term'] == self['derivedFrom'] and s['type'] == 'Cell Line', lambda s: s)) == 0:
+			ra.handler(NonmatchKeyError(self, self['derivedFrom'], 'Cell Line'))
+			
+		# ensure that there are no other non-related stanzas that have the same vendorId
+		if 'derivedFrom' not in self or ra[self['derivedFrom']]['vendorId'] != self['vendorId']:
+			otherstanzas = ra.filter(lambda s: s['type'] == 'Cell Line' and s != self and s['vendorId'] == self['vendorId'] and ('derivedFrom' not in s or ra[s['derivedFrom']]['vendorId'] != s['vendorId']), lambda s: s)
+			if len(otherstanzas) > 0:
+				ra.handler(DuplicateVendorIdError(self))
+			
+		self.checkProtocols(ra, 'protocols/cell/human/')
+		
+
+class SeqPlatformStanza(CvStanza):
+	
+	def __init__(self):
+		CvStanza.__init__(self)
+
+	def validate(self, ra):
+		optional = {'geo'}
+		CvStanza.validate(self, ra, None, optional)
+
+
+class AntibodyStanza(CvStanza):
+
+	def __init__(self):
+		CvStanza.__init__(self)
+
+	def validate(self, ra):
+		necessary = {'target', 'antibodyDescription', 'targetDescription', 'vendorName', 'vendorId', 'orderUrl', 'targetId', 'lab'}
+		optional = {'validation', 'targetUrl', 'lots', 'displayName'}
+		CvStanza.validate(self, ra, necessary, optional)
+		self.checkListRelational(ra, 'lab', 'labPi')
+		self.checkProtocols(ra, 'validation/antibodies/')
+
+
+class ViewStanza(CvStanza):
+	
+	def __init__(self):
+		CvStanza.__init__(self)
+
+	def validate(self, ra):
+		necessary = {'label'}
+		CvStanza.validate(self, ra, necessary)
+		
+
+class TypeOfTermStanza(CvStanza):
+	
+	def __init__(self):
+		CvStanza.__init__(self)
+
+	def validate(self, ra):
+		necessary = {'searchable', 'cvDefined', 'validate', 'priority'}
+		optional = {'label', 'hidden'}
+		CvStanza.validate(self, ra, necessary, optional)
+		
+		if len(ra.filter(lambda s: s['term'] == self['type'] and s['type'] == 'typeOfTerm', lambda s: s)) == 0:
+			ra.handler(InvalidTypeError(self, self['type']))
+				
+
+class MouseStanza(CvStanza):
+	
+	def __init__(self):
+		CvStanza.__init__(self)
+
+	def validate(self, ra):
+		necessary = {'organism', 'vendorName', 'orderUrl', 'age', 'strain', 'sex'}
+		optional = {'tissue', 'termId', 'termUrl', 'color', 'protocol', 'category', 'vendorId', 'lots'}
+		CvStanza.validate(self, ra, necessary, optional)
+		
+		self.checkRelational(ra, 'organism', 'term')
+		self.checkRelational(ra, 'sex', 'term')
+		self.checkRelational(ra, 'category', 'term')
+		self.checkRelational(ra, 'age', 'term')
+		self.checkRelational(ra, 'strain', 'term')
+		self.checkProtocols(ra, 'protocols/cell/mouse/')
+
+		
+class LocalizationStanza(CvStanza):
+	
+	def __init__(self):
+		CvStanza.__init__(self)
+
+	def validate(self, ra):
+		necessary = {'termId', 'termUrl'}
+		optional = {'label'}
+		CvStanza.validate(self, ra, necessary, optional)
+		
+		
+class RnaExtractStanza(CvStanza):
+	
+	def __init__(self):
+		CvStanza.__init__(self)
+
+	def validate(self, ra):
+		optional = {'label'}
+		CvStanza.validate(self, ra, None, optional)
+		
+		
+class TreatmentStanza(CvStanza):
+	
+	def __init__(self):
+		CvStanza.__init__(self)
+
+	def validate(self, ra):
+		optional = {'label'}
+		CvStanza.validate(self, ra, None, optional)
+		
+		
+class GrantStanza(CvStanza):
+	
+	def __init__(self):
+		CvStanza.__init__(self)
+
+	def validate(self, ra):
+		necessary = {'grantInst', 'projectName'}
+		optional = {'label'}
+		CvStanza.validate(self, ra, necessary, optional)
+