# Source commit: a44e3254174152b2d4b3f241d1935654cd139181
# Author/date:   vsmalladi, Tue May 8 10:11:21 2012 -0700
# Message:       Renamed library from ucscgenomics to ucscGb. Redmine #7029.
# File:          python/lib/ucscGb/cv.py (new file, mode 100644)
'''
Validation support for the controlled vocabulary (cv.ra) file.

CvFile reads the cv into CvStanza objects; CvStanza.validate() checks each
stanza against the 'typeOfTerm' stanzas found in the same file, reporting
problems through the CvError hierarchy defined at the bottom of this module.
'''
import datetime
import os
import re

# NOTE(review): the commit message says the library was renamed to ucscGb,
# but this import still references ucscgenomics -- confirm which is intended.
from ucscgenomics import ra, encode


def _stripPrefixAndComment(val, prefix, removeComments):
    '''shared helper: drop the first occurrence of prefix and any '#' comment'''
    # Only the first occurrence is removed so that a payload which happens to
    # contain the prefix text again (e.g. inside a regex) is left intact.
    stripped = val.replace(prefix, '', 1)
    if removeComments:
        stripped = stripped.split('#', 1)[0]
    return stripped


def _parses(converter, val):
    '''return True if converter(val) succeeds, False if it rejects the value'''
    try:
        converter(val)
    except (TypeError, ValueError):
        return False
    return True


def _parseDate(val):
    '''parse a YYYY-MM-DD date string, raising ValueError if it is malformed'''
    return datetime.datetime.strptime(val, '%Y-%m-%d')


def extractValue(val, prefix='', removeComments=1):
    '''
    Returns val with prefix removed, any trailing '#' comment dropped (when
    removeComments is true) and surrounding whitespace stripped.
    '''
    return _stripPrefixAndComment(val, prefix, removeComments).strip()


def extractList(val, prefix='', removeComments=1):
    '''
    Like extractValue, but splits the remaining text on commas and returns
    a list of the whitespace-stripped items.
    '''
    remainder = _stripPrefixAndComment(val, prefix, removeComments)
    # A real list (not a lazy map object) is returned because callers index
    # into it and call extend() on it.
    return [item.strip() for item in remainder.split(',')]


class CvFile(ra.RaFile):
    '''
    cv.ra representation. Mainly adds CV-specific validation to the RaFile

    To create a CvFile, the simplest way is just to call it with no params.
    If you want to open something other than the alpha cv in your tree,
    specify a file path. The handler can almost always be left blank, since
    it simply provides a function to handle validation errors that would
    otherwise be raised as exceptions. You should also specify a protocolPath
    if you want to validate protocols, since checkProtocols() looks for the
    protocol documents under that path.

    Validation recurses over all stanzas, calling the overridden validation
    function for the more developed stanzas. To start validation, you can
    simply call validate() on the cv object.

    For more information about other things not specific to the cv, but for
    all ra files, look at the RaFile documentation.
    '''

    def __init__(self, filePath=None, handler=None, protocolPath=None):
        '''
        Sets up the exception handling method and reads the cv from a file.

        filePath     -- cv file to read; defaults to encode.defaultCvPath()
        handler      -- callable taking one exception; defaults to
                        raiseException, which simply raises it
        protocolPath -- directory holding the protocol documents; defaults
                        to ~/htdocsExtras/ENCODE/
        '''
        ra.RaFile.__init__(self)

        self.handler = self.raiseException if handler is None else handler

        if filePath is None:
            filePath = encode.defaultCvPath()

        # The original code used '==' here, so the default was never stored.
        if protocolPath is None:
            protocolPath = os.path.expanduser('~/htdocsExtras/ENCODE/')
        self.protocolPath = protocolPath

        # keys seen during validation that have no typeOfTerm stanza
        self.missingTypes = set()

        self.read(filePath)

    def raiseException(self, exception):
        '''default handler: raises the exception it is given'''
        raise exception

    def readStanza(self, stanza, key=None):
        '''
        Overridden method from RaFile which builds a CvStanza from the given
        lines and returns (key, value, stanza object).
        '''
        entry = CvStanza()

        key, val = entry.readStanza(stanza)
        return key, val, entry

    def validate(self):
        '''
        Base validation method which calls validate() on every stanza.
        Keys without a typeOfTerm stanza are collected in self.missingTypes.
        '''
        for stanza in self.itervalues():
            stanza.validate(self)

    def getTypeOfTermStanza(self, type):
        '''
        Returns the 'typeOfTerm' stanza whose term equals type, or None if
        there is not exactly one such stanza.

        'mouseCellType' is special-cased: a stanza is synthesized here rather
        than looked up in the file, and is meant for validation only.
        '''
        if type == 'mouseCellType':
            mousestanza = CvStanza()
            mousestanza['term'] = 'mouseCellType'
            mousestanza['tag'] = 'MOUSECELLTYPE'
            mousestanza['type'] = 'typeOfTerm'
            mousestanza['label'] = 'Cell, tissue or DNA sample specific to mouse'
            mousestanza['description'] = 'NOT FOR USE! ONLY FOR VALIDATION. Cell line or tissue used as the source of experimental material specific to mouse.'
            mousestanza['searchable'] = 'multiSelect'
            mousestanza['cvDefined'] = 'yes'
            mousestanza['validate'] = 'cv or None'
            mousestanza['requiredVars'] = 'term,tag,type,description,organism,vendorName,orderUrl,age,strain,sex #Provisional'
            mousestanza['optionalVars'] = 'label,tissue,termId,termUrl,color,protocol,category,vendorId,lots,deprecated #Provisional'
            return mousestanza

        types = self.filter(lambda s: s['term'] == type and s['type'] == 'typeOfTerm', lambda s: s)
        if len(types) != 1:
            return None
        return types[0]


class CvStanza(ra.RaStanza):
    '''base class for a single stanza in the cv, which adds validation'''

    def __init__(self):
        ra.RaStanza.__init__(self)

    def readStanza(self, stanza):
        '''
        Populates this entry from a single stanza (a sequence of lines) and
        returns the [key, value] pair parsed from its first line.
        '''
        for line in stanza:
            self.readLine(line)

        return self.readName(stanza[0])

    def readName(self, line):
        '''
        Extracts the Stanza's name from the value of the first line of the
        stanza and returns the stripped [key, value] pair.

        Raises ValueError if the line has no space-separated value.
        '''
        parts = line.split(' ', 1)
        if len(parts) != 2:
            raise ValueError('stanza name line has no value: %r' % line)

        names = [part.strip() for part in parts]
        self._name = names[1]
        return names

    def readLine(self, line):
        '''
        Reads a single line from the stanza, extracting the key-value pair.

        Comment and blank lines are appended as-is. A key that is already
        present is stored under '<key>__$$<n>' so that checkDuplicates() can
        report it later instead of silently overwriting the first value.
        '''
        if line.startswith('#') or line == '':
            self.append(line)
            return

        parts = line.split(' ', 1)
        raKey = parts[0]
        raVal = parts[1] if len(parts) == 2 else ''

        if raKey in self:
            count = 0
            while raKey + '__$$' + str(count) in self:
                count += 1
            raKey = raKey + '__$$' + str(count)

        self[raKey] = raVal

# validate [cv/date/exists/float/integer/list:/none/regex:] outlines the expected values. ENFORCED by mdbPrint -validate
#   cv: must be a defined term in the cv (e.g. cell=GM12878). "cv or None" indicates that "None" is also acceptable.
#       "cv or control" indicates that cv-defined terms of type "control" are also acceptable.
#   date: must be a date in YYYY-MM-DD format
#   exists: not enforced. (e.g. fileName could be validated to exist in download directory)
#   float: must be a floating point number
#   integer: must be an integer
#   "list:": must be one of several terms in a comma delimited list (e.g. "list: yes,no,maybe" )  # ("list:" includes colon)
#   none: not validated in any way
#   "regex:": must match a regular expression (e.g. "regex: ^GS[M,E][0-9]$" )  # ("regex:" includes colon)
# # NOTE: validate rules may end with a comment delimited by a '#'

    def validate(self, cvfile):
        '''
        Validates this stanza against the typeOfTerm stanzas in cvfile.

        Checks mandatory, extraneous and duplicate keys using the type's
        requiredVars/optionalVars, then checks each value against the
        'validate' rule of the typeOfTerm stanza named after its key.
        All problems are reported through cvfile.handler.
        '''
        termType = self['type']
        if termType == 'Cell Line': # :(
            organism = self['organism'] if 'organism' in self else None
            if organism == 'human':
                termType = 'cellType'
            elif organism == 'mouse':
                termType = 'mouseCellType'
            else:
                cvfile.handler(OrganismError(self))

        typeStanza = cvfile.getTypeOfTermStanza(termType)
        if typeStanza is None:
            cvfile.handler(InvalidTypeError(self, self['type'] + '(%s)' % termType))
            return

        required = []
        if 'requiredVars' in typeStanza:
            required = extractList(typeStanza['requiredVars'])
        optional = []
        if 'optionalVars' in typeStanza:
            optional = extractList(typeStanza['optionalVars'])

        self.checkMandatory(cvfile, required)
        self.checkExtraneous(cvfile, required + optional)
        self.checkDuplicates(cvfile)

        for key in self.iterkeys():
            itemType = cvfile.getTypeOfTermStanza(key)
            if itemType is None:
                # Recorded rather than reported: reporting every untyped key
                # through the handler would flood the output.
                cvfile.missingTypes.add(key)
                continue
            self._checkValue(cvfile, key, self[key], itemType['validate'])

    def _checkValue(self, cvfile, key, val, validation):
        '''checks a single value against one validate rule (see note above)'''
        # Rules may carry a trailing '#' comment; compare keywords without it.
        rule = extractValue(validation)

        if rule.startswith('cv'):
            if not (rule == 'cv or None' and val == 'None'):
                # NOTE(review): checkRelational expects (key, other) but is
                # given (val, key); since val is rarely a key of this stanza
                # the check usually returns immediately. Left unchanged
                # because the intended arguments are not clear from here.
                self.checkRelational(cvfile, val, key)
        elif rule == 'date':
            if not _parses(_parseDate, val):
                cvfile.handler(InvalidDateError(self, val))
        elif rule == 'exists':
            if not os.path.exists(val):
                cvfile.handler(MissingFileError(self, val))
        elif rule == 'float':
            if not _parses(float, val):
                cvfile.handler(InvalidFloatError(self, val))
        elif rule == 'integer':
            if not _parses(int, val):
                cvfile.handler(InvalidIntError(self, val))
        elif rule.startswith('list:'):
            validVals = extractList(validation, 'list:')
            if val not in validVals:
                cvfile.handler(InvalidListError(self, val, validVals))
        elif rule == 'none':
            pass
        elif rule.startswith('regex:'):
            regex = extractValue(validation, 'regex:')
            # re.match takes the pattern first, then the string to test.
            if not re.match(regex, val):
                cvfile.handler(UnmatchedRegexError(self, val, regex))
        # Unrecognized rules are deliberately ignored for now; reporting
        # them would use TypeValidationError(itemType).

    def checkDuplicates(self, cvfile):
        '''report every key that appeared more than once in the stanza'''
        for key in self.iterkeys():
            if '__$$' in key:
                newkey = key.split('__$$', 1)[0]
                cvfile.handler(DuplicateKeyError(self, newkey))

    def checkMandatory(self, cvfile, keys):
        '''ensure that all keys are present and not blank in the stanza'''
        for key in keys:
            if key not in self:
                cvfile.handler(MissingKeyError(self, key))
            elif self[key] == '':
                cvfile.handler(BlankKeyError(self, key))

    def checkExtraneous(self, cvfile, keys):
        '''check for keys that are not in the list of keys'''
        allowed = set(keys)
        for key in self.iterkeys():
            # '__$$' keys are duplicates, reported by checkDuplicates instead
            if key not in allowed and '__$$' not in key:
                cvfile.handler(ExtraKeyError(self, key))

    def checkFullRelational(self, cvfile, key, other, type):
        '''check that the value at key matches the value of another
        stanza's value at other, where the stanza type is specified by type'''
        if key not in self:
            return

        matched = any(
            'type' in entry and other in entry
            and entry['type'] == type and self[key] == entry[other]
            for entry in cvfile.itervalues())
        if not matched:
            cvfile.handler(NonmatchKeyError(self, key, other))

    def checkRelational(self, cvfile, key, other):
        '''check that the value at key matches the value at other in some
        stanza whose type equals key'''
        if key not in self:
            return

        matched = any(
            'type' in entry and other in entry
            and entry['type'] == key and self[key] == entry[other]
            for entry in cvfile.itervalues())
        if not matched:
            cvfile.handler(NonmatchKeyError(self, key, other))

    def checkListRelational(self, cvfile, key, other):
        '''check that each comma-separated value at key matches the value at
        other in some stanza whose type equals key'''
        if key not in self:
            return

        for val in self[key].split(','):
            val = val.strip()
            matched = any(
                'type' in entry and other in entry
                and entry['type'] == key and val == entry[other]
                for entry in cvfile.itervalues())
            if not matched:
                cvfile.handler(NonmatchKeyError(self, key, other))

    def checkProtocols(self, cvfile, path):
        '''
        Check each whitespace-separated entry of the 'protocol' value: it
        must contain a ':' and, when cvfile.protocolPath is set, the part
        after the ':' must name a file under protocolPath + path.
        '''
        if 'protocol' not in self:
            return

        for protocol in self['protocol'].split():
            if ':' not in protocol:
                cvfile.handler(InvalidProtocolError(self, protocol))
                continue

            p = protocol.split(':', 1)[1]
            if cvfile.protocolPath is not None and not os.path.isfile(cvfile.protocolPath + path + p):
                cvfile.handler(InvalidProtocolError(self, protocol))


class CvError(Exception):
    '''base error class for the cv.'''

    def __init__(self, stanza):
        Exception.__init__(self)
        self.stanza = stanza
        self.msg = ''
        self.strict = 0

    def __str__(self):
        return '%s[%s] %s: %s' % (self.stanza.name, self.stanza['type'], self.__class__.__name__, self.msg)


class MissingKeyError(CvError):
    '''raised if a mandatory key is missing'''

    def __init__(self, stanza, key):
        CvError.__init__(self, stanza)
        self.msg = key
        self.strict = 1


class DuplicateKeyError(CvError):
    '''raised if a key is duplicated'''

    def __init__(self, stanza, key):
        CvError.__init__(self, stanza)
        self.msg = key
        self.strict = 1


class BlankKeyError(CvError):
    '''raised if a mandatory key is blank'''

    def __init__(self, stanza, key):
        CvError.__init__(self, stanza)
        self.msg = key
        self.strict = 0


class ExtraKeyError(CvError):
    '''raised if an extra key not in the list of keys is found'''

    def __init__(self, stanza, key):
        CvError.__init__(self, stanza)
        self.msg = key
        self.strict = 0


class NonmatchKeyError(CvError):
    '''raised if a relational key does not match any other value'''

    def __init__(self, stanza, key, val):
        CvError.__init__(self, stanza)
        self.msg = '%s does not match %s' % (key, val)
        self.strict = 1


class DuplicateVendorIdError(CvError):
    '''When there exists more than one connected component of stanzas (through derivedFrom) with the same vendorId'''

    def __init__(self, stanza):
        CvError.__init__(self, stanza)
        self.msg = '%s' % self.stanza['vendorId']
        self.strict = 0


class InvalidProtocolError(CvError):
    '''raised if a protocol doesnt match anything in the directory'''

    def __init__(self, stanza, key):
        CvError.__init__(self, stanza)
        self.msg = key
        self.strict = 0


class InvalidTypeError(CvError):
    '''raised if the stanza's type has no matching typeOfTerm stanza'''

    def __init__(self, stanza, key):
        CvError.__init__(self, stanza)
        self.msg = key
        self.strict = 1


class TypeValidationError(CvError):
    '''raised if the terms type of term has an invalid validation value'''

    def __init__(self, stanza):
        CvError.__init__(self, stanza)
        # typeOfTerm stanzas store their rule under 'validate'
        self.msg = 'validation ' + stanza['validate']
        self.strict = 1


class InvalidDateError(CvError):
    '''raised if the value is an invalid date'''

    def __init__(self, stanza, val):
        CvError.__init__(self, stanza)
        self.msg = val + ' does not match a YYYY-MM-DD date'
        self.strict = 1


class MissingFileError(CvError):
    '''raised if the value is a filename that does not exist'''

    def __init__(self, stanza, val):
        CvError.__init__(self, stanza)
        self.msg = val + ' does not exist'
        self.strict = 1


class InvalidFloatError(CvError):
    '''raised if the value is not a float'''

    def __init__(self, stanza, val):
        CvError.__init__(self, stanza)
        self.msg = val + ' is not a float'
        self.strict = 1


class InvalidIntError(CvError):
    '''raised if the value is not an int'''

    def __init__(self, stanza, val):
        CvError.__init__(self, stanza)
        self.msg = val + ' is not an int'
        self.strict = 1


class InvalidListError(CvError):
    '''raised if the value is not among the given list of values'''

    def __init__(self, stanza, val, list):
        CvError.__init__(self, stanza)
        # join is a str method; calling it on the list raised AttributeError
        self.msg = val + ' is not in ' + ','.join(list)
        self.strict = 1


class UnmatchedRegexError(CvError):
    '''raised if the value does not match the required regular expression'''

    def __init__(self, stanza, val, regex):
        CvError.__init__(self, stanza)
        self.msg = val + ' does not match the regex ' + regex
        self.strict = 1


class OrganismError(CvError):
    '''raised if a Cell Line stanza's organism is missing or is neither human nor mouse'''

    def __init__(self, stanza):
        CvError.__init__(self, stanza)
        if 'organism' in stanza:
            self.msg = 'organism ' + stanza['organism'] + ' does not match human or mouse'
        else:
            self.msg = 'organism does not exist in stanza'
        self.strict = 1