5dcfe2b1322ee5d4c989006b6161fcdac490580e
tdreszer
  Fri Jan 28 16:13:11 2011 -0800
Added -validate option to mdbPrint, which will validate mdb vars against cv.ra.
diff --git src/hg/lib/mdb.c src/hg/lib/mdb.c
index 6baeb4f..693ceda 100644
--- src/hg/lib/mdb.c
+++ src/hg/lib/mdb.c
@@ -1,27 +1,28 @@
 /* mdb.c was originally generated by the autoSql program, which also
  * generated mdb.h and mdb.sql.  This module links the database and
  * the RAM representation of objects. */
 
 #include "common.h"
 #include "linefile.h"
 #include "dystring.h"
 #include "jksql.h"
 #include "hdb.h"
 #include "cheapcgi.h"
 #include "hui.h"
 #include "mdb.h"
+#include <regex.h>
 
 static char const rcsid[] = "$Id: mdb.c,v 1.8 2010/06/11 17:11:28 tdreszer Exp $";
 
 void mdbStaticLoad(char **row, struct mdb *ret)
 /* Load a row from mdb table into ret.  The contents of ret will
  * be replaced at the next call to this function. */
 {
 
 ret->obj = row[0];
 ret->var = row[1];
 ret->varType = row[2];
 ret->val = row[3];
 }
 
 struct mdb *mdbLoadByQuery(struct sqlConnection *conn, char *query)
@@ -2446,31 +2447,35 @@
     dyStringPrintf(dyQuery,"and exists (select l2.obj from %s l2 where l2.obj = l1.obj and l2.var='objType' and l2.val='%s')",
                    tableName,tables?"table":"file");
 dyStringAppend(dyQuery," order by val");
 
 retVal = sqlQuickList(conn, dyStringCannibalize(&dyQuery));
 slNameSortCase(&retVal);
 return retVal;
 }
 
 // TODO: decide to make this public or hide it away inside the one function so far that uses it.
 static struct hash *cvHash = NULL;
 static char *cv_file()
 // return default location of cv.ra
 {
 static char filePath[PATH_LEN];
-safef(filePath, sizeof(filePath), "%s/encode/cv.ra", hCgiRoot());
+char *root = hCgiRoot();
+if (root == NULL || *root == 0)
+    root = "/usr/local/apache/cgi-bin/"; // hard-coded fallback when hCgiRoot() returns NULL or an empty string
+// TODO: decide whether this fallback should also look for cv.ra in a developer sandbox checkout
+safef(filePath, sizeof(filePath), "%s/encode/cv.ra", root); // filePath is static, so every call returns the same buffer
 if(!fileExists(filePath))
     errAbort("Error: can't locate cv.ra; %s doesn't exist\n", filePath);
 return filePath;
 }
 
 struct slPair *mdbValLabelSearch(struct sqlConnection *conn, char *var, int limit, boolean tables, boolean files)
 // Search the metaDb table for vals by var and returns controlled vocabulary (cv) label
 // (if it exists) and val as a pair.  Can impose (non-zero) limit on returned string size of name.
 // Return is case insensitive sorted on name (label or else val).
 {  // TODO: Change this to use normal mdb struct routines?
 if (!tables && !files)
     errAbort("mdbValSearch requests values for neither table nor file objects.\n");
 
 char *tableName = mdbTableName(conn,TRUE); // Look for sandBox name first
 
@@ -2516,34 +2521,58 @@
         if (label != NULL)
             {
             freeMem(pair->name); // Allocated when pair was created
             pair->name = strSwapChar(cloneString(label),'_',' ');  // vestigial _ meaning space
             if (limit > 0 && strlen(pair->name) > limit)
                 pair->name[limit] = '\0';
             }
         }
     slAddHead(&pairs, pair);
     }
 sqlFreeResult(&sr);
 slPairSortCase(&pairs);
 return pairs;
 }
 
+struct hash *mdbCvTermHash(char *term)
+// Returns the hash that raReadWithFilter() builds from cv.ra for one type of term,
+// reading the file only the first time a given type is requested.
+{
+static struct hash *loadedByType = NULL;  // cache: cv.ra type name -> hash read for that type
+if (loadedByType == NULL)
+    loadedByType = hashNew(0);
+
+// mdb vars 'antibody' and 'cell' go by different type names in cv.ra
+if (sameString(term,"antibody"))
+    term = "Antibody";
+else if (sameString(term,"cell"))
+    term = "Cell Line";
+
+struct hash *typeHash = hashFindVal(loadedByType,term);
+if (typeHash != NULL)
+    return typeHash;  // already loaded by an earlier call
+
+typeHash = raReadWithFilter(cv_file(), "term","type",term);
+if (typeHash != NULL)
+    hashAdd(loadedByType,term,typeHash);
+return typeHash;
+}
+
 struct hash *mdbCvTermTypeHash()
 // returns a hash of hashes of mdb and controlled vocabulary (cv) term types
 // Those terms should contain label,description,searchable,cvDefined,hidden
-{
+{ // NOTE: "typeOfTerm" is specialized, so don't use mdbCvTermHash
 static struct hash *cvHashOfTermTypes = NULL;
 
 // Establish cv hash of Term Types if it doesn't already exist
 if (cvHashOfTermTypes == NULL)
     {
     cvHashOfTermTypes = raReadWithFilter(cv_file(), "term","type","typeOfTerm");
     // Patch up an ugly inconsistency with 'cell'
     struct hash *cellHash = hashRemove(cvHashOfTermTypes,"cellType");
     if (cellHash)
         {
         hashAdd(cvHashOfTermTypes,"cell",cellHash);
         hashReplace(cellHash, "term", cloneString("cell")); // spilling memory of 'cellType' val
         }
     struct hash *abHash = hashRemove(cvHashOfTermTypes,"Antibody");
     if (abHash)
@@ -2624,15 +2653,179 @@
 
 const char *cvLabel(char *term)
 // returns cv label if term found or else just term
 {
 // Get the list of term types from thew cv
 struct hash *termTypeHash = mdbCvTermTypeHash();
 struct hash *termHash = hashFindVal(termTypeHash,term);
 if (termHash != NULL)
     {
     char *label = hashFindVal(termHash,"label");
     if (label != NULL)
         return label;
     }
 return term;
 }
+
+int mdbObjsValidate(struct mdbObj *mdbObjs)
+// Validates each var's val in every mdbObj against the 'validate' rule of that var's typeOfTerm
+// stanza in cv.ra, printing one line per invalid val.  Returns count of invalid vals found.
+{
+struct hash *termTypeHash = mdbCvTermTypeHash();
+struct mdbObj *mdbObj = NULL;
+int invalids = 0;
+for( mdbObj=mdbObjs; mdbObj!=NULL; mdbObj=mdbObj->next )
+    {
+    struct mdbVar *mdbVar = NULL;
+    for(mdbVar = mdbObj->vars;mdbVar != NULL;mdbVar=mdbVar->next)
+        {
+        struct hash *termHash = hashFindVal(termTypeHash,mdbVar->var);
+        if (termHash == NULL) // No cv definition for term so no validation can be done
+            continue;
+        char *validationRule = hashFindVal(termHash,"validate");
+        if (validationRule == NULL)
+            {
+            verbose(1,"ERROR in cv.ra: Term %s in typeOfTerms but has no 'validate' setting.\n",mdbVar->var);
+            continue;  // Should we errAbort?
+            }
+
+        // NOTE: Working on memory in hash but we are throwing away a comment and removing trailing spaces so that is okay
+        strSwapChar(validationRule,'#','\0'); // Chop off any comment in the setting
+        validationRule = trimSpaces(validationRule);
+
+        // Validate should be or start with known word
+        if (startsWithWord("cv",validationRule))
+            {
+            if (SETTING_NOT_ON(hashFindVal(termHash,"cvDefined"))) // Known type of term but no validation to be done
+                {
+                verbose(1,"ERROR in cv.ra: Term %s says validate in cv but is not 'cvDefined'.\n",mdbVar->var);
+                continue;
+                }
+
+            // cvDefined so every val should be in cv
+            struct hash *cvTermHash = mdbCvTermHash(mdbVar->var);
+            if (cvTermHash == NULL)
+                {
+                verbose(1,"ERROR in cv.ra: Term %s says validate in cv but not found as a cv term.\n",mdbVar->var);
+                continue;
+                }
+            if (hashFindVal(cvTermHash,mdbVar->val) == NULL) // val is not a cv term of this type; see if the rule allows an alternative
+                {
+                char * orControl = skipBeyondDelimit(validationRule,' ');
+                if (orControl && sameString(orControl,"or None") && sameString(mdbVar->val,"None"))
+                    continue;
+                else if (orControl && sameString(orControl,"or control"))
+                    {
+                    cvTermHash = mdbCvTermHash("control");
+                    if (cvTermHash == NULL)
+                        {
+                        verbose(1,"ERROR in cv.ra: Term control says validate in cv but not found as a cv term.\n");
+                        continue;
+                        }
+                    if (hashFindVal(cvTermHash,mdbVar->val) != NULL)
+                        continue;
+                    }
+                printf("INVALID cv lookup: %s -> %s = %s\n",mdbObj->obj,mdbVar->var,mdbVar->val);
+                invalids++;
+                }
+            }
+        else if (startsWithWord("date",validationRule))
+            {
+            if (dateToSeconds(mdbVar->val,"%F") == 0)
+                {
+                printf("INVALID date: %s -> %s = %s\n",mdbObj->obj,mdbVar->var,mdbVar->val);
+                invalids++;
+                }
+            }
+        else if (startsWithWord("exists",validationRule))
+            continue;  // (e.g. fileName exists) Nothing to be done at this time.
+        else if (startsWithWord("float",validationRule))
+            {
+            char* end;
+            (void)strtod(mdbVar->val, &end); // Don't want float, just error
+            if ((end == mdbVar->val) || (*end != '\0')) // nothing parsed, or trailing characters left over
+                {
+                printf("INVALID float: %s -> %s = %s\n",mdbObj->obj,mdbVar->var,mdbVar->val);
+                invalids++;
+                }
+            }
+        else if (startsWithWord("integer",validationRule))
+            {
+            char *p0 = mdbVar->val;
+            if (*p0 == '-')
+                p0++;
+            char *p = p0;
+            while ((*p >= '0') && (*p <= '9'))
+                p++;
+            if ((*p != '\0') || (p == p0)) // non-digit found, or no digits at all
+                {
+                printf("INVALID integer: %s -> %s = %s\n",mdbObj->obj,mdbVar->var,mdbVar->val);
+                invalids++;
+                }
+            }
+        else if (startsWithWord("list:",validationRule))
+            {
+            validationRule = skipBeyondDelimit(validationRule,' ');
+            if (validationRule == NULL)
+                {
+                verbose(1,"ERROR in cv.ra: Invalid 'list:' for %s.\n",mdbVar->var);
+                continue;
+                }
+            int count = chopByChar(validationRule, ',', NULL, 0);
+            if (count == 1)
+                {
+                if (differentString(mdbVar->val,validationRule))
+                    {
+                    printf("INVALID list '%s' match: %s -> %s = '%s'.\n",validationRule, mdbObj->obj,mdbVar->var,mdbVar->val);
+                    invalids++;
+                    }
+                }
+            else if (count > 1)
+                {
+                char **array = needMem(count*sizeof(char*));
+                char *listCopy = cloneString(validationRule); // chop a copy so the rule held in the hash stays intact
+                chopByChar(listCopy, ',', array, count); // Want to also trimSpaces()? No
+                if (stringArrayIx(mdbVar->val, array, count) == -1)
+                    {
+                    printf("INVALID list '%s' match: %s -> %s = '%s'.\n",validationRule, mdbObj->obj,mdbVar->var,mdbVar->val);
+                    invalids++;
+                    }
+                freeMem(array);    // both allocations are only needed for this one lookup;
+                freeMem(listCopy); // free them so they do not accumulate for every var validated
+                }
+            else
+                verbose(1,"ERROR in cv.ra: Invalid 'validate list: %s' for term %s,\n",validationRule,mdbVar->var);
+            }
+        else if (startsWithWord("none",validationRule))
+            continue;
+        else if (startsWithWord("regex:",validationRule))
+            {
+            validationRule = skipBeyondDelimit(validationRule,' ');
+            if (validationRule == NULL)
+                {
+                verbose(1,"ERROR in cv.ra: Invalid 'regex:' for %s.\n",mdbVar->var);
+                continue;
+                }
+            // Compile the regular expression so that it can be used.  Use: REG_EXTENDED ?
+            regex_t regEx;
+            int err = regcomp(&regEx, validationRule, REG_NOSUB);
+            if(err != 0)  // Pattern in cv.ra would not compile: report it and move on to the next var
+                {
+                char buffer[128];
+                regerror(err, &regEx, buffer, sizeof buffer);
+                verbose(1,"ERROR in cv.ra: Invalid regular expression for %s - %s.  %s\n",mdbVar->var,validationRule,buffer);
+                continue;
+                }
+            err = regexec(&regEx, mdbVar->val, 0, NULL, 0);
+            if (err != 0)
+                {
+                printf("INVALID regex '%s' match: %s -> %s = '%s'.\n",validationRule, mdbObj->obj,mdbVar->var,mdbVar->val);
+                invalids++;
+                }
+            regfree(&regEx);
+            }
+        else
+            verbose(1,"ERROR in cv.ra: Unknown validationRule rule '%s' for term %s.\n",validationRule,mdbVar->var);
+        }
+    }
+return invalids;
+}