3aee104cf4c1e245dd020f743fbc58c17fd75976 mmaddren Mon Apr 9 12:12:44 2012 -0700 added encode.py to store global constants and other encode stuff, and made all other libraries interface correctly with it diff --git python/lib/ucscgenomics/cv.py python/lib/ucscgenomics/cv.py index dafe1c4..7709a32 100644 --- python/lib/ucscgenomics/cv.py +++ python/lib/ucscgenomics/cv.py @@ -1,18 +1,18 @@ import re import os -from ucscgenomics import ra +from ucscgenomics import ra, encode def extractValue(val, prefix='', removeComments=1): val2 = val.replace(prefix, '') if removeComments and '#' in val2: val2 = val2.split('#', 1)[0] return val2.strip() def extractList(val, prefix='', removeComments=1): val2 = val.replace(prefix, '') if removeComments and '#' in val2: val2 = val2.split('#', 1)[0] return map(str.strip, val2.split(',')) class CvFile(ra.RaFile): ''' @@ -31,77 +31,46 @@ simply call validate() on the cv object. For more information about other things not specific to the cv, but for all ra files, look at the RaFile documentation. ''' def __init__(self, filePath=None, handler=None, protocolPath=None): '''sets up exception handling method, and optionally reads from a file''' ra.RaFile.__init__(self) self.handler = handler if handler == None: self.handler = self.raiseException if filePath == None: - filePath = os.path.expanduser('~/kent/src/hg/makeDb/trackDb/') + 'cv/alpha/cv.ra' + filePath = encode.defaultCvPath() self.protocolPath = protocolPath if protocolPath == None: self.protocolPath == os.path.expanduser('~/htdocsExtras/ENCODE/') self.missingTypes = set() self.read(filePath) def raiseException(self, exception): '''wrapper function for raising exception''' raise exception def readStanza(self, stanza, key=None): '''overriden method from RaFile which makes specialized stanzas based on type''' - # e = ra.RaStanza() - # ek, ev = e.readStanza(stanza) - # type = e['type'] - - # if type == 'Antibody': - # entry = AntibodyStanza() - # elif type == 'Cell Line': - # if e['organism'] == 'human': - # entry = CellLineStanza() - # elif e['organism'] == 'mouse': - # entry = MouseStanza() - # else: - # self.handler(NonmatchKeyError(e.name, e['organism'], 'organism')) - # return ek, ev, None - # elif type == 'age': - # entry = AgeStanza() - # elif type == 'dataType': - # entry = DataTypeStanza() - # elif type == 'lab': - # entry = LabStanza() - # elif type == 'seqPlatform': - # entry = SeqPlatformStanza() - # elif type == 'typeOfTerm': - # entry = TypeOfTermStanza() - # elif type == 'view': - # entry = ViewStanza() - # elif type == 'localization': - # entry = LocalizationStanza() - # elif type == 'grant': - # entry = GrantStanza() - # else: entry = CvStanza() key, val = entry.readStanza(stanza) return key, val, entry def validate(self): '''base validation method which calls all stanzas' validate''' for stanza in self.itervalues(): stanza.validate(self) print self.missingTypes def getTypeOfTermStanza(self, type): types = self.filter(lambda s: s['term'] == type and s['type'] == 'typeOfTerm', lambda s: s) if len(types) != 1: @@ -160,31 +129,34 @@ else: self[raKey] = raVal def validate(self, cvfile): type = self['type'] if self['type'] == 'Cell Line': # :( if 'organism' in self and self['organism'] == 'human': type = 'cellType' elif 'organism' in self and self['organism'] == 'mouse': type = 'mouseCellType' else: cvfile.handler(OrganismError(self)) typeStanza = cvfile.getTypeOfTermStanza(type) if typeStanza == None: - cvfile.handler(InvalidTypeError(self, self['type'])) + #print cvfile.filter2(lambda s: s['type'] == 'typeOfTerm').keys() + #print '>%s<' % cvfile['mouseCellType ']['term'] + #print '>%s<' % cvfile['mouseCellType ']['type'] + cvfile.handler(InvalidTypeError(self, self['type'] + '(%s)' % type)) return required = list() if 'requiredVars' in typeStanza: required = extractList(typeStanza['requiredVars']) optional = list() if 'optionalVars' in typeStanza: optional = extractList(typeStanza['optionalVars']) self.checkMandatory(cvfile, required) required.extend(optional) self.checkExtraneous(cvfile, required) self.checkDuplicates(cvfile) for key in self.iterkeys(): @@ -234,105 +206,96 @@ # validate [cv/date/exists/float/integer/list:/none/regex:] outlines the expected values. ENFORCED by mdbPrint -validate # cv: must be defined term in cv (e.g. cell=GM12878). "cv or None" indicates that "None is also acceptable. # "cv or control" indicates that cv-defined terms of type "control" are also acceptable. # date: must be date in YYYY-MM-DD format # exists: not enforced. (e.g. fileName could be validated to exist in download directory) # float: must be floating point number # integer: must be integer # "list:": must be one of several terms in comma delimeited list (e.g. "list: yes,no,maybe" ) # ("list:" includes colon) # none: not validated in any way # "regex:": must match regular expression (e.g. "regex: ^GS[M,E][0-9]$" ) # ("regex:" includes colon) # # NOTE: that validate rules may end comment delimited by a '#' - def validate2(self, cvfile, necessary=None, optional=None): - '''default validation for a generic cv stanza. Should be called with all arguments if overidden''' + # def validate2(self, cvfile, necessary=None, optional=None): + # '''default validation for a generic cv stanza. Should be called with all arguments if overidden''' - if necessary == None: - necessary = set() + # if necessary == None: + # necessary = set() - if optional == None: - optional = set() + # if optional == None: + # optional = set() - baseNecessary = {'term', 'tag', 'type'} + # baseNecessary = {'term', 'tag', 'type'} - if self['type'] != 'Antibody': - baseNecessary.add('description') + # if self['type'] != 'Antibody': + # baseNecessary.add('description') - baseOptional = {'deprecated', 'label'} - self.checkMandatory(cvfile, necessary | baseNecessary) - self.checkExtraneous(cvfile, necessary | baseNecessary | optional | baseOptional) + # baseOptional = {'deprecated', 'label'} + # self.checkMandatory(cvfile, necessary | baseNecessary) + # self.checkExtraneous(cvfile, necessary | baseNecessary | optional | baseOptional) - temptype = self['type'] - if self['type'] == 'Cell Line': # :( - temptype = 'cellType' - if len(cvfile.filter(lambda s: s['term'] == temptype and s['type'] == 'typeOfTerm', lambda s: s)) == 0: - cvfile.handler(InvalidTypeError(self, self['type'])) + # temptype = self['type'] + # if self['type'] == 'Cell Line': # :( + # temptype = 'cellType' + # if len(cvfile.filter(lambda s: s['term'] == temptype and s['type'] == 'typeOfTerm', lambda s: s)) == 0: + # cvfile.handler(InvalidTypeError(self, self['type'])) - self.checkDuplicates(cvfile) + # self.checkDuplicates(cvfile) def checkDuplicates(self, cvfile): '''ensure that all keys are present and not blank in the stanza''' for key in self.iterkeys(): if '__$$' in key: newkey = key.split('__$$', 1)[0] cvfile.handler(DuplicateKeyError(self, newkey)) def checkMandatory(self, cvfile, keys): '''ensure that all keys are present and not blank in the stanza''' for key in keys: if not key in self.keys(): cvfile.handler(MissingKeyError(self, key)) elif self[key] == '': cvfile.handler(BlankKeyError(self, key)) - # def checkOptional(self, cvfile, keys): - # '''ensure that all keys are present and not blank in the stanza''' - # for key in keys: - # if key in self and self[key] == '': - # cvfile.handler(BlankKeyError(self, key)) - def checkExtraneous(self, cvfile, keys): '''check for keys that are not in the list of keys''' for key in self.iterkeys(): if key not in keys and '__$$' not in key: cvfile.handler(ExtraKeyError(self, key)) def checkFullRelational(self, cvfile, key, other, type): '''check that the value at key matches the value of another stanza's value at other, where the stanza type is specified by type''' p = 0 if key not in self: return for entry in cvfile.itervalues(): if 'type' in entry and other in entry: if entry['type'] == type and self[key] == entry[other]: p = 1 break if p == 0: cvfile.handler(NonmatchKeyError(self, key, other)) def checkRelational(self, cvfile, key, other): '''check that the value at key matches the value at other''' - - - p = 0 if key not in self: return for entry in cvfile.itervalues(): if 'type' in entry and other in entry: if entry['type'] == key and self[key] == entry[other]: p = 1 break if p == 0: cvfile.handler(NonmatchKeyError(self, key, other)) def checkListRelational(self, cvfile, key, other): '''check that the value at key matches the value at other''' @@ -371,117 +334,86 @@ self.stanza = stanza self.msg = '' self.strict = 0 def __str__(self): return str('%s[%s] %s: %s' % (self.stanza.name, self.stanza['type'], self.__class__.__name__, self.msg)) class MissingKeyError(CvError): '''raised if a mandatory key is missing''' def __init__(self, stanza, key): CvError.__init__(self, stanza) self.msg = key self.strict = 1 - # def __str__(self): - # return str('%s(%s[%s])' % self.__class__.__name__ self.stanza + ': missing key (' + self.key + ')') - - class DuplicateKeyError(CvError): '''raised if a key is duplicated''' def __init__(self, stanza, key): CvError.__init__(self, stanza) self.msg = key self.strict = 1 - # def __str__(self): - # return str(self.stanza + ': duplicate key (' + self.key + ')') - - class BlankKeyError(CvError): '''raised if a mandatory key is blank''' def __init__(self, stanza, key): CvError.__init__(self, stanza) self.msg = key self.strict = 0 - # def __str__(self): - # return str(self.stanza + ': key (' + self.key + ') is blank') - - class ExtraKeyError(CvError): '''raised if an extra key not in the list of keys is found''' def __init__(self, stanza, key): CvError.__init__(self, stanza) self.msg = key self.strict = 0 - # def __str__(self): - # return str(self.stanza + ': extra key (' + self.key + ')') - - class NonmatchKeyError(CvError): '''raised if a relational key does not match any other value''' def __init__(self, stanza, key, val): CvError.__init__(self, stanza) self.msg = '%s does not match %s' % (key, val) self.strict = 1 - # def __str__(self): - # return str(self.stanza + ': key (' + self.key + ') does not match any (' + self.val + ')') - - class DuplicateVendorIdError(CvError): '''When there exists more than one connected component of stanzas (through derivedFrom) with the same vendorId''' def __init__(self, stanza): CvError.__init__(self, stanza) self.msg = '%s' % self.stanza['vendorId'] self.strict = 0 - # def __str__(self): - # return str('warning: ' + self.stanza.name + ': vendorId (' + self.stanza['vendorId'] + ') has multiple parent cell lines') - - class InvalidProtocolError(CvError): '''raised if a protocol doesnt match anything in the directory''' def __init__(self, stanza, key): CvError.__init__(self, stanza) self.msg = key self.strict = 0 - # def __str__(self): - # return str(self.stanza.name + ': missing protocol document (' + self.key + ')') - - class InvalidTypeError(CvError): '''raised if a relational key does not match any other value''' def __init__(self, stanza, key): CvError.__init__(self, stanza) self.msg = key self.strict = 1 - # def __str__(self): - # return str(self.stanza + ': ' + self.key + ' does not match any types') - class TypeValidationError(CvError): '''raised if the terms type of term has an invalid validation value''' def __init__(self, stanza): CvError.__init__(self, stanza) self.msg = 'validation ' + stanza['validation'] self.strict = 1 class InvalidDateError(CvError): '''raised if the value is an invalid date''' def __init__(self, stanza, val): CvError.__init__(self, stanza) self.msg = val + ' does not match a YYYY-MM-DD date' self.strict = 1