97ab92339890a2c48de767060aba8e714663c5cc tdreszer Mon Sep 12 16:53:24 2011 -0700 Validation of cv terms should treat 'deprecated' terms similar to terms not found. diff --git src/hg/lib/cv.c src/hg/lib/cv.c index c1788b7..f696201 100644 --- src/hg/lib/cv.c +++ src/hg/lib/cv.c @@ -1,594 +1,595 @@ // cv.c stands for Controlled Vocabullary and this file contains the // library APIs for reading and making sense of the contents of cv.ra. #include <regex.h> #include "common.h" #include "linefile.h" #include "dystring.h" #include "hdb.h" #include "ra.h" #include "hui.h" #include "mdb.h" // CV Defines that should not necessarily be public // CV UGLY TERMS (NOTE: These should be hiddne inside cv.c APIS and callers should use non-UGLY terms) #define CV_UGLY_TOT_CELLTYPE "cellType" #define CV_UGLY_TERM_CELL_LINE "Cell Line" #define CV_UGLY_TERM_ANTIBODY "Antibody" // Type of Terms searchable defines #define CV_SEARCHABLE_SINGLE_SELECT "select" #define CV_SEARCHABLE_MULTI_SELECT "multiSelect" #define CV_SEARCHABLE_FREE_TEXT "freeText" #define CV_SEARCHABLE_WILD_LIST "wildList" const char *cvTypeNormalized(const char *sloppyTerm) // returns the proper term to use when requesting a typeOfTerm { static const char *badCvSpellingOfCell = CV_UGLY_TOT_CELLTYPE; static const char *badCvSpellingOfAntibody = CV_UGLY_TERM_ANTIBODY; if (sameWord((char *)sloppyTerm,CV_TERM_CELL) || sameWord((char *)sloppyTerm,CV_UGLY_TERM_CELL_LINE)) return badCvSpellingOfCell; if (sameWord((char *)sloppyTerm,CV_TERM_ANTIBODY)) return badCvSpellingOfAntibody; return sloppyTerm; } const char *cvTermNormalized(const char *sloppyTerm) // returns the proper term to use when requesting a cvTerm hash { static const char *approvedSpellingOfCell = CV_TERM_CELL; static const char *approvedSpellingOfAntibody = CV_TERM_ANTIBODY; if (sameWord((char *)sloppyTerm,CV_UGLY_TOT_CELLTYPE) || sameWord((char *)sloppyTerm,CV_UGLY_TERM_CELL_LINE)) return approvedSpellingOfCell; if (sameWord((char *)sloppyTerm,CV_UGLY_TERM_ANTIBODY)) return approvedSpellingOfAntibody; return sloppyTerm; } char *cvLabNormalize(const char *sloppyTerm) /* CV inconsistency work-arounds. Return lab name trimmed of parenthesized trailing * info (a few ENCODE labs have this in metaDb and/or in CV term -- * PI name embedded in parens in the CV term). Also fixes other problems until * cleaned up in CV, metaDb and user processes. Caller must free mem. */ { char *lab = (char *)sloppyTerm; if (containsStringNoCase((char *)sloppyTerm, "Weissman")) lab = "Yale-Weissman"; char *ret = cloneString(lab); chopSuffixAt(ret, '('); return ret; } /* TBD char *cvLabDeNormalize(char *minimalTerm) // returns lab name with parenthesized trailing info, by lookup in cv.ra, and restores // other oddities caught by Normalize } */ static char *cvFileRequested = NULL; void cvFileDeclare(const char *filePath) // Declare an altername cv.ra file to use // (The cv.ra file is normally discovered based upon CGI/Tool and envirnment) { cvFileRequested = cloneString(filePath); } const char *cvFile() // return default location of cv.ra { static char filePath[PATH_LEN]; if (cvFileRequested != NULL) { safecpy(filePath, sizeof(filePath), cvFileRequested); } else { char *root = hCgiRoot(); if (root == NULL || *root == 0) root = "/usr/local/apache/cgi-bin/"; // Make this check out sandboxes? safef(filePath, sizeof(filePath), "%s/encode/%s", root,CV_FILE_NAME); } if(!fileExists(filePath)) errAbort("Error: can't locate %s; %s doesn't exist\n", CV_FILE_NAME, filePath); return filePath; } const struct hash *cvTermHash(const char *term) // returns a hash of hashes of a term which should be defined in cv.ra // NOTE: in static memory: DO NOT FREE { static struct hash *cvHashOfHashOfHashes = NULL; if (sameString(term,CV_TERM_CELL)) term = CV_UGLY_TERM_CELL_LINE; else if (sameString(term,CV_TERM_ANTIBODY)) term = CV_UGLY_TERM_ANTIBODY; if (cvHashOfHashOfHashes == NULL) cvHashOfHashOfHashes = hashNew(9); struct hash *cvHashForTerm = hashFindVal(cvHashOfHashOfHashes,(char *)term); // Establish cv hash of Term Types if it doesn't already exist if (cvHashForTerm == NULL) { cvHashForTerm = raReadWithFilter((char *)cvFile(), CV_TERM,CV_TYPE,(char *)term); if (cvHashForTerm != NULL) hashAdd(cvHashOfHashOfHashes,(char *)term,cvHashForTerm); } return cvHashForTerm; } const struct hash *cvOneTermHash(const char *type,const char *term) // returns a hash for a single term of a given type // NOTE: in static memory: DO NOT FREE { const struct hash *typeHash = cvTermHash(type); if (typeHash != NULL) return hashFindVal((struct hash *)typeHash,(char *)term); return NULL; } const struct hash *cvTermTypeHash() // returns a hash of hashes of mdb and controlled vocabulary (cv) term types // Those terms should contain label,description,searchable,cvDefined,hidden // NOTE: in static memory: DO NOT FREE { // NOTE: "typeOfTerm" is specialized, so don't use cvTermHash static struct hash *cvHashOfTermTypes = NULL; // Establish cv hash of Term Types if it doesn't already exist if (cvHashOfTermTypes == NULL) { cvHashOfTermTypes = raReadWithFilter((char *)cvFile(), CV_TERM,CV_TYPE,CV_TOT); // Patch up an ugly inconsistency with 'cell' struct hash *cellHash = hashRemove(cvHashOfTermTypes,CV_UGLY_TOT_CELLTYPE); if (cellHash) { hashAdd(cvHashOfTermTypes,CV_TERM_CELL,cellHash); hashReplace(cellHash, CV_TERM, cloneString(CV_TERM_CELL)); // spilling memory of 'cellType' val } struct hash *abHash = hashRemove(cvHashOfTermTypes,CV_UGLY_TERM_ANTIBODY); if (abHash) { hashAdd(cvHashOfTermTypes,CV_TERM_ANTIBODY,abHash); hashReplace(abHash, CV_TERM, cloneString(CV_TERM_ANTIBODY)); // spilling memory of 'Antibody' val } } return cvHashOfTermTypes; } static boolean cvHiddenIsTrue(const char *setting) // returns TRUE if hidden setting passed in is true for this browser { if (setting != NULL) { if (sameWord((char *)setting,"yes" ) || sameWord((char *)setting,"on" ) || sameWord((char *)setting,"true")) return TRUE; if (hIsPrivateHost() || hIsPreviewHost()) { if (strstrNoCase((char *)setting,"alpha")) return TRUE; } else if (hIsBetaHost()) { if (strstrNoCase((char *)setting,"beta")) return TRUE; } else // RR and everyone else { if (strstrNoCase((char *)setting,"public")) return TRUE; } } return FALSE; } struct slPair *cvWhiteList(boolean searchTracks, boolean cvDefined) // returns the official mdb/controlled vocabulary terms that have been whitelisted for certain uses. // TODO: change to return struct that includes searchable! { struct slPair *whitePairs = NULL; // Get the list of term types from thew cv struct hash *termTypeHash = (struct hash *)cvTermTypeHash(); struct hashCookie hc = hashFirst(termTypeHash); struct hashEl *hEl; while ((hEl = hashNext(&hc)) != NULL) { char *setting = NULL; struct hash *typeHash = (struct hash *)hEl->val; //if (!includeHidden) { setting = hashFindVal(typeHash,CV_TOT_HIDDEN); if (cvHiddenIsTrue(setting)) continue; } if (searchTracks) { setting = hashFindVal(typeHash,CV_TOT_SEARCHABLE); if (setting == NULL || ( differentWord(setting,CV_SEARCHABLE_SINGLE_SELECT) && differentWord(setting,CV_SEARCHABLE_MULTI_SELECT) && differentWord(setting,CV_SEARCHABLE_FREE_TEXT) && differentWord(setting,CV_SEARCHABLE_WILD_LIST))) continue; } if (cvDefined) { setting = hashFindVal(typeHash,CV_TOT_CV_DEFINED); if(SETTING_NOT_ON(setting)) continue; } char *term = hEl->name; char *label = hashFindVal(typeHash,CV_LABEL); if (label == NULL) label = term; slPairAdd(&whitePairs, term, cloneString(label)); // Term gets cloned in slPairAdd } if (whitePairs != NULL) slPairValSortCase(&whitePairs); return whitePairs; } enum cvSearchable cvSearchMethod(const char *term) // returns whether the term is searchable { // Get the list of term types from thew cv struct hash *termTypeHash = (struct hash *)cvTermTypeHash(); struct hash *termHash = hashFindVal(termTypeHash,(char *)term); if (termHash != NULL) { char *searchable = hashFindVal(termHash,CV_TOT_SEARCHABLE); if (searchable != NULL) { if (sameWord(searchable,CV_SEARCHABLE_SINGLE_SELECT)) return cvSearchBySingleSelect; if (sameWord(searchable,CV_SEARCHABLE_MULTI_SELECT)) return cvSearchByMultiSelect; if (sameWord(searchable,CV_SEARCHABLE_FREE_TEXT)) return cvSearchByFreeText; if (sameWord(searchable,CV_SEARCHABLE_WILD_LIST)) return cvSearchByWildList; if (sameWord(searchable,"date")) return cvSearchByDateRange; if (sameWord(searchable,"numeric")) return cvSearchByIntegerRange; } } return cvNotSearchable; } const char *cvValidationRule(const char *term) // returns validation rule, trimmed of comment { // Get the list of term types from thew cv struct hash *termTypeHash = (struct hash *)cvTermTypeHash(); struct hash *termHash = hashFindVal(termTypeHash,(char *)term); if (termHash != NULL) { char *validationRule = hashFindVal(termHash,CV_VALIDATE); // NOTE: Working on memory in hash but we are throwing away a comment and removing trailing spaces so that is okay strSwapChar(validationRule,'#','\0'); // Chop off any comment in the setting validationRule = trimSpaces(validationRule); return validationRule; // Clone? } return NULL; } enum cvDataType cvDataType(const char *term) // returns the dataType if it can be determined { const char *validationRule = cvValidationRule(term); if (validationRule != NULL) { if (startsWithWord(CV_VALIDATE_INT,(char *)validationRule)) return cvInteger; else if (startsWithWord(CV_VALIDATE_FLOAT,(char *)validationRule)) return cvFloat; else if (startsWithWord(CV_VALIDATE_DATE,(char *)validationRule)) return cvDate; else return cvString; } return cvIndeterminant; } const char *cvLabel(const char *type,const char *term) // returns cv label if term found or else just term // If type not supplied, must be a typeOfTerm definition { struct hash *termHash = NULL; if (type == NULL) // Must be a typeOfTerm { // Get the list of term types from thew cv struct hash *termTypeHash = (struct hash *)cvTermTypeHash(); termHash = hashFindVal(termTypeHash,(char *)term); } else termHash = (struct hash *)cvOneTermHash(type,term); if (termHash != NULL) { char *label = hashFindVal(termHash,CV_LABEL); if (label != NULL) return label; } return term; } const char *cvTag(const char *type,const char *term) // returns cv Tag if term found or else NULL { const struct hash *termHash = cvOneTermHash(type,term); if (termHash != NULL) return hashFindVal((struct hash *)termHash,CV_TAG); return NULL; } #ifdef OMIT // may want this someday const char *cvTerm(const char *tag) // returns the cv Term if tag found or else NULL { // Get the list of term types from thew cv struct hash *termTypeHash = (struct hash *)cvTermTypeHash(); struct hashCookie hcTOT = hashFirst(termTypeHash); struct hashEl *helType; while ((helType = hashNext(&hcTOT)) != NULL) // Walk through each type { struct hash *typeHash = cvTermHash(helType->name); struct hashCookie hcType = hashFirst(typeHash); struct hashEl *helTerm; while ((helTerm = hashNext(&hcType)) != NULL) // Walk through each term in this type { struct hash *termHash = (struct hash *)helTerm->val; char *foundTag = hashFindVal(termHash,CV_TAG); if (foundTag != NULL && sameString(tag,foundTag)) { return helTerm->name; } } } return NULL; } #endif///def OMIT boolean cvTermIsHidden(const char *term) // returns TRUE if term is defined as hidden in cv.ra { struct hash *termTypeHash = (struct hash *)cvTermTypeHash(); struct hash *termHash = hashFindVal(termTypeHash,(char *)term); if (termHash != NULL) { char *setting = hashFindVal(termHash,CV_TOT_HIDDEN); return cvHiddenIsTrue(setting); } return FALSE; } boolean cvTermIsCvDefined(const char *term) // returns TRUE if the terms values are defined in the cv.ra // For example anitobody is cv defined but expId isn't { struct hash *termTypeHash = (struct hash *)cvTermTypeHash(); struct hash *termHash = hashFindVal(termTypeHash,(char *)term); if (termHash != NULL) { char *setting = hashFindVal(termHash,CV_TOT_CV_DEFINED); return SETTING_IS_ON(setting); } return FALSE; } boolean cvTermIsEmpty(const char *term,const char *val) // returns TRUE if term has validation of "cv or None" and the val is None { if (val == NULL) return TRUE; // Empty whether it is supposed to be or not struct hash *termTypeHash = (struct hash *)cvTermTypeHash(); struct hash *termHash = hashFindVal(termTypeHash,(char *)term); if (termHash != NULL) { char *validationRule = hashFindVal(termHash,CV_VALIDATE); if (validationRule != NULL) { // Currently only supporting special case for "None" if (sameString(validationRule,CV_VALIDATE_CV_OR_NONE) && sameString(val,MDB_VAL_ENCODE_EDV_NONE)) return TRUE; } } return FALSE; } boolean cvValidateTerm(const char *term,const char *val,char *reason,int len) // returns TRUE if term is valid. Can pass in a reason buffer of len to get reason. { if (reason != NULL) *reason = '\0'; char *validationRule = (char *)cvValidationRule(term); if (validationRule == NULL) { if (reason != NULL) safef(reason,len,"ERROR in %s: Term '%s' in typeOfTerms but has no '%s' setting.",CV_FILE_NAME,(char *)term,CV_VALIDATE); return FALSE; } // Validate should be or start with known word if (startsWithWord(CV_VALIDATE_CV,validationRule)) { struct hash *termTypeHash = (struct hash *)cvTermTypeHash(); struct hash *termHash = hashFindVal(termTypeHash,(char *)term); if (SETTING_NOT_ON(hashFindVal(termHash,CV_TOT_CV_DEFINED))) // Known type of term but no validation to be done { if (reason != NULL) safef(reason,len,"ERROR in %s: Term '%s' says validate in cv but is not '%s'.",CV_FILE_NAME,(char *)term,CV_TOT_CV_DEFINED); return FALSE; } // cvDefined so every val should be in cv struct hash *cvHashForTerm = (struct hash *)cvTermHash((char *)term); if (cvHashForTerm == NULL) { if (reason != NULL) safef(reason,len,"ERROR in %s: Term '%s' says validate in cv but not found as a cv term.",CV_FILE_NAME,(char *)term); return FALSE; } - if (hashFindVal(cvHashForTerm,(char *)val) == NULL) // No cv definition for term so no validation can be done + struct hash *cvHashForVal = hashFindVal(cvHashForTerm,(char *)val); + if (cvHashForVal == NULL || hashFindVal(cvHashForVal, "deprecated")) // No cv definition for term so no validation can be done { if (sameString(validationRule,CV_VALIDATE_CV_OR_NONE) && sameString((char *)val,MDB_VAL_ENCODE_EDV_NONE)) return TRUE; else if (sameString(validationRule,CV_VALIDATE_CV_OR_CONTROL)) { cvHashForTerm = (struct hash *)cvTermHash(CV_TERM_CONTROL); if (cvHashForTerm == NULL) { if (reason != NULL) safef(reason,len,"ERROR in %s: Term '%s' says validate in cv but not found as a cv term.",CV_FILE_NAME,CV_TERM_CONTROL); return FALSE; } if (hashFindVal(cvHashForTerm,(char *)val) != NULL) return TRUE; } if (reason != NULL) - safef(reason,len,"INVALID cv lookup: %s = '%s'",(char *)term,(char *)val); + safef(reason,len,"INVALID cv lookup: %s = '%s' %s",(char *)term,(char *)val,(cvHashForVal?"DEPRECATED.":"not found.")); return FALSE; } } else if (startsWithWord(CV_VALIDATE_DATE,validationRule)) { if (dateToSeconds((char *)val,"%F") == 0) { if (reason != NULL) safef(reason,len,"INVALID date: %s = %s",(char *)term,(char *)val); return FALSE; } } else if (startsWithWord(CV_VALIDATE_EXISTS,validationRule)) { return TRUE; // (e.g. fileName exists) Nothing to be done at this time. } else if (startsWithWord(CV_VALIDATE_FLOAT,validationRule)) { char* end; double notNeeded = strtod((char *)val, &end); // Don't want float, just error (However, casting to void resulted in a compile error on Ubuntu Maveric and Lucid) if ((end == (char *)val) || (*end != '\0')) { if (reason != NULL) safef(reason,len,"INVALID float: %s = %s (resulting double: %g)",(char *)term,(char *)val,notNeeded); return FALSE; } } else if (startsWithWord(CV_VALIDATE_INT,validationRule)) { char *p0 = (char *)val; if (*p0 == '-') p0++; char *p = p0; while ((*p >= '0') && (*p <= '9')) p++; if ((*p != '\0') || (p == p0)) { if (reason != NULL) safef(reason,len,"INVALID integer: %s = %s",(char *)term,(char *)val); return FALSE; } } else if (startsWithWord(CV_VALIDATE_LIST,validationRule)) { validationRule = skipBeyondDelimit(validationRule,' '); if (validationRule == NULL) { if (reason != NULL) safef(reason,len,"ERROR in %s: Invalid '%s' for %s.",CV_FILE_NAME,CV_VALIDATE_LIST,(char *)term); return FALSE; } int count = chopByChar(validationRule, ',', NULL, 0); //////////////////////// if (count == 1) { if (differentString((char *)val,validationRule)) { if (reason != NULL) safef(reason,len,"INVALID list '%s' match: %s = '%s'",validationRule,(char *)term,(char *)val); return FALSE; } } else if (count > 1) { char **array = needMem(count*sizeof(char*)); chopByChar(cloneString(validationRule), ',', array, count); // Want to also trimSpaces()? No if (stringArrayIx((char *)val, array, count) == -1) { if (reason != NULL) safef(reason,len,"INVALID list '%s' match: %s = '%s'",validationRule,(char *)term,(char *)val); return FALSE; } } else { if (reason != NULL) safef(reason,len,"ERROR in %s: Invalid 'validate list: %s' for term %s.",CV_FILE_NAME,validationRule,(char *)term); return FALSE; } } else if (startsWithWord(CV_VALIDATE_NONE,validationRule)) { return TRUE; } else if (startsWithWord(CV_VALIDATE_REGEX,validationRule)) { validationRule = skipBeyondDelimit(validationRule,' '); if (validationRule == NULL) { if (reason != NULL) safef(reason,len,"ERROR in %s: Invalid '%s' for %s.",CV_FILE_NAME,CV_VALIDATE_REGEX,(char *)term); return FALSE; } // Real work ahead interpreting regex regex_t regEx; int err = regcomp(®Ex, validationRule, REG_NOSUB); if(err != 0) // Compile the regular expression so that it can be used. Use: REG_EXTENDED ? { char buffer[128]; regerror(err, ®Ex, buffer, sizeof buffer); if (reason != NULL) safef(reason,len,"ERROR in %s: Invalid regular expression for %s - %s. %s.",CV_FILE_NAME,(char *)term,validationRule,buffer); return FALSE; } err = regexec(®Ex, (char *)val, 0, NULL, 0); if (err != 0) { //char buffer[128]; //regerror(err, ®Ex, buffer, sizeof buffer); if (reason != NULL) safef(reason,len,"INVALID regex '%s' match: %s = '%s'",validationRule,(char *)term,(char *)val); return FALSE; } regfree(®Ex); } else { if (reason != NULL) safef(reason,len,"ERROR in %s: Unknown validationRule rule '%s' for term %s.",CV_FILE_NAME,validationRule,(char *)term); return FALSE; } return TRUE; }