2eff21b32f29ea544fc203846bcb58c63c89363f tdreszer Thu Mar 17 17:05:35 2011 -0700 Integrated mdbPrint/mdbUpdate -encodeExp with the encodeExp APIs. diff --git src/hg/lib/mdb.c src/hg/lib/mdb.c index fe4bba8..58cea2d 100644 --- src/hg/lib/mdb.c +++ src/hg/lib/mdb.c @@ -1,27 +1,28 @@ /* mdb.c was originally generated by the autoSql program, which also * generated mdb.h and mdb.sql. This module links the database and * the RAM representation of objects. */ #include "common.h" #include "linefile.h" #include "dystring.h" #include "jksql.h" #include "hdb.h" #include "cheapcgi.h" #include "hui.h" #include "mdb.h" +#include "encode/encodeExp.h" #include <regex.h> static char const rcsid[] = "$Id: mdb.c,v 1.8 2010/06/11 17:11:28 tdreszer Exp $"; void mdbStaticLoad(char **row, struct mdb *ret) /* Load a row from mdb table into ret. The contents of ret will * be replaced at the next call to this function. */ { ret->obj = row[0]; ret->var = row[1]; ret->val = row[2]; } struct mdb *mdbLoadByQuery(struct sqlConnection *conn, char *query) @@ -233,53 +234,79 @@ #define MDB_METADATA_KEY "metadata" #define MDB_METAOBJ_RAKEY "metaObject" #define MDB_METAVAR_RAKEY "metaVariable" #define MDB_OBJ_TYPE "objType" // ------- (static) convert from autoSql ------- static void mdbVarFree(struct mdbVar **mdbVarPtr) // Frees a single mdbVar struct (its val and var strings, then the struct itself) { freeMem((*mdbVarPtr)->val); freeMem((*mdbVarPtr)->var); freez(mdbVarPtr); } +static void mdbVarsFree(struct mdbVar **mdbVarPtr) +// Frees an mdbVars list by popping and freeing each element with mdbVarFree() +{ +struct mdbVar *mdbVar = NULL; +while((mdbVar = slPopHead(mdbVarPtr)) != NULL) + mdbVarFree(&mdbVar); +} + static void mdbLeafObjFree(struct mdbLeafObj **leafObjPtr) // Frees a single mdbLeafObj struct (its obj string, then the struct itself) { freeMem((*leafObjPtr)->obj); freez(leafObjPtr); } static void mdbLimbValFree(struct mdbLimbVal **limbValPtr) // Frees a single mdbLimbVal struct: its objHash, its list of leaf objs and its val string { struct mdbLimbVal *limbVal = *limbValPtr; // Free hash first (shared memory) hashFree(&(limbVal->objHash)); struct mdbLeafObj *leafObj = NULL; while((leafObj 
= slPopHead(&(limbVal->objs))) != NULL) mdbLeafObjFree(&leafObj); freeMem(limbVal->val); freez(limbValPtr); } +static struct mdbVar *mdbVarNew(char *var, void *val) +// Creates a new stand-alone mdbVar holding clones of var and val (it is not added to any list) +{ +struct mdbVar *mdbVar; +AllocVar(mdbVar); +mdbVar->var = cloneString(var); +mdbVar->val = cloneString(val); +return mdbVar; +} + +static struct mdbVar *mdbVarAdd(struct mdbVar **pMdbVars, char *var, void *val) +// Creates a new mdbVar via mdbVarNew(), adds it onto the head of the list and returns it +{ +struct mdbVar *mdbVar = mdbVarNew(var,val); +slAddHead(pMdbVars, mdbVar); +return mdbVar; +} + static struct mdbObj *mdbObjsLoadFromMemory(struct mdb **mdbPtr,boolean buildHashes) // Load all mdbObjs from in memory mdb struct, cannibalize strings. Expects sorted order. { struct mdbObj *mdbObj = NULL; struct mdbObj *mdbObjs = NULL; struct mdbVar *mdbVar; struct mdb *thisRow; while((thisRow = slPopHead(mdbPtr)) != NULL) { if (mdbObj == NULL || differentString(thisRow->obj,mdbObj->obj) ) { // Finish last object before starting next! if(mdbObj!= NULL) slReverse(&(mdbObjs->vars)); // Start new object @@ -736,30 +763,56 @@ mdbObj->obj = cloneString(obj); if(var != NULL) { struct mdbVar * mdbVar; AllocVar(mdbVar); mdbVar->var = cloneString(var); if(val != NULL) mdbVar->val = cloneString(val); mdbObj->vars = mdbVar; // Only one } return mdbObj; } +struct mdbObj *mdbObjNew(char *obj,struct mdbVar *mdbVars) +// Returns a new mdbObj with whatever was passed in. 
+// An mdbObj requires an obj, so if one is not supplied it will be "[unknown]" +{ +struct mdbObj *mdbObj = NULL; +if (obj == NULL) + obj = "[unknown]"; +if (mdbVars == NULL) + { + AllocVar(mdbObj); + mdbObj->obj = cloneString(obj); + return mdbObj; + } +else + { + mdbObj = mdbObjCreate(obj,mdbVars->var,mdbVars->val); + mdbObj->varHash = hashNew(0); + hashAddUnique(mdbObj->varHash, mdbVars->var, mdbObj->vars); // pointer to struct to resolve type + + struct mdbVar *var = mdbVars->next; + for(;var != NULL;var = var->next) // remaining vars; the first one was consumed by mdbObjCreate() above + mdbObjSetVar(mdbObj, var->var,var->val); + } +return mdbObj; +} + struct mdbObj *mdbObjsLoadFromHashes(struct hash *objsHash) // Load all mdbObjs from a file containing metadata formatted lines { struct mdbObj *mdbObjs = NULL; struct hashEl* objEl = NULL; struct hashCookie objCookie = hashFirst(objsHash); while((objEl = hashNext(&objCookie)) != NULL) { struct mdbObj *mdbObj; AllocVar(mdbObj); mdbObj->obj = cloneString(objEl->name); mdbObj->varHash = hashNew(0); struct hash *hashedVars = objEl->val; struct hashCookie varCookie = hashFirst(hashedVars); @@ -2053,33 +2106,31 @@ safef(buf,sizeof(buf),"%d",val); return mdbObjSetVar(mdbObj,var,buf); } void mdbObjSwapVars(struct mdbObj *mdbObjs, char *vars,boolean deleteThis) // Replaces objs' vars with var=val pairs provided, preparing for DB update. { struct mdbObj *mdbObj = NULL; for( mdbObj=mdbObjs; mdbObj!=NULL; mdbObj=mdbObj->next ) { mdbObj->deleteThis = deleteThis; if(mdbObj->varHash != NULL) hashFree(&mdbObj->varHash); - struct mdbVar *mdbVar = NULL; - while((mdbVar = slPopHead(&(mdbObj->vars))) != NULL) - mdbVarFree(&mdbVar); + mdbVarsFree(&(mdbObj->vars)); mdbObjAddVarPairs(mdbObj,vars); } } struct mdbObj *mdbObjsFilter(struct mdbObj **pMdbObjs, char *var, char *val,boolean returnMatches) // Filters mdb objects to only those that include/exclude vars. Optionally checks (case insensitive) val too. 
// Returns matched or unmatched items objects as requested, maintaining sort order { struct mdbObj *mdbObjsReturned = NULL; struct mdbObj *mdbObjs = *pMdbObjs; *pMdbObjs = NULL; struct mdbObj **pMatchTail = returnMatches ? &mdbObjsReturned : pMdbObjs; // Slightly faster than slAddHead/slReverse struct mdbObj **pNoMatchTail = returnMatches ? pMdbObjs : &mdbObjsReturned; // Also known as too clever by half while (mdbObjs!=NULL) @@ -2208,36 +2259,35 @@ return mdbObjsDropped; } void mdbObjTransformToUpdate(struct mdbObj *mdbObjs, char *var, char *val,boolean deleteThis) // Turns one or more mdbObjs into the structure needed to add/update or delete. { struct mdbObj *mdbObj = NULL; for( mdbObj=mdbObjs; mdbObj!=NULL; mdbObj=mdbObj->next ) { mdbObj->deleteThis = deleteThis; if(mdbObj->varHash != NULL) hashFree(&mdbObj->varHash); - struct mdbVar *mdbVar = NULL; - while((mdbVar = slPopHead(&(mdbObj->vars))) != NULL) - mdbVarFree(&mdbVar); + mdbVarsFree(&(mdbObj->vars)); if(var != NULL) { + struct mdbVar *mdbVar; AllocVar(mdbVar); mdbVar->var = cloneString(var); if(val != NULL) mdbVar->val = cloneString(val); mdbObj->vars = mdbVar; // Only one } } } struct mdbObj *mdbObjClone(const struct mdbObj *mdbObj) // Clones a single mdbObj, including hash and maintaining order { if(mdbObj == NULL) return NULL; @@ -2451,106 +2501,105 @@ invalids++; } regfree(&regEx); } else verbose(1,"ERROR in cv.ra: Unknown validationRule rule '%s' for term %s.\n",validationRule,mdbVar->var); } } return invalids; } #define EXPERIMENTS_TABLE "hgFixed.encodeExp" #define EDV_VAR_NAME "expVars" #define EXP_ID_NAME "expId" #define COMPOSITE_VAR "composite" -#define SPECIES_VAR "species" #define DCC_ACCESSION "dccAccession" -struct mdbObj *mdbObjsEncodeExperimentify(struct sqlConnection *conn,char *db,char *tableName,struct mdbObj **pMdbObjs,int warn) +struct mdbObj *mdbObjsEncodeExperimentify(struct sqlConnection *conn,char *db,char *tableName,struct mdbObj **pMdbObjs, + int warn,boolean createExpIfNecessary) // 
Organizes objects into experiments and validates experiment IDs. Will add/update the ids in the structures. // If warn=1, then prints to stdout all the experiments/obs with missing or wrong expIds; // warn=2, then print line for each obj with expId or warning. +// createExpIfNecessary means go ahead and add to the hgFixed.encodeExp table to get an ID // Returns a new set of mdbObjs that is what can (and should) be used to update the mdb via mdbObjsSetToDb(). { if (pMdbObjs == NULL || *pMdbObjs == NULL) return 0; struct mdbObj *mdbObjs = *pMdbObjs; struct mdbObj *mdbProcessedObs = NULL; struct mdbObj *mdbUpdateObjs = NULL; /* Here is what "experimentify" does from "mdbPrint -encodeExp" and "mdbUpdate -encodeExp": - Uses normal selection methods to get a set of objects (e.g. one composite worth) or all objs. (in mdbPrint and mdbUpdate) - This API: - Breaks up and walks through set of objects composite by composite - Looks up EDVs (expiment defining vars) for composite. Currently these are defined in the mdb under objType=composite expVars= (e.g. obj=wgEncodeBroadHistone objType=composite expVars=lab,dataType,cell,antibody) FIXME: Nice to add white-list to cv.ra typeOfTerms - Breaks up and walks through composite objects exp by exp (handle's "None"s gracefully) - Determines what expId should be. - FIXME: This needs APIs to get the id from the hgFixed.encodeExp table - FIXME: Could also use API to set the expId in the hgFixed.encodeExp table - Creates new mdbObjs list of updates needed to put expId and dccAccession into the mdb. - From "mdbPrint", this API warns of mismatches or missing expIds - From "mdbUpdate" (not -test) then that utility will update the mdb from this API's return structs. If -test, will reveal what would be updated. 
*/ // Sort all objects by composite, so that we handle composite by composite mdbObjsSortOnVars(&mdbObjs, COMPOSITE_VAR); struct dyString *dyVars = dyStringNew(256); while(mdbObjs != NULL) { // Work on a composite at a time char *compName = NULL; while(mdbObjs != NULL && compName == NULL) { compName = mdbObjFindValue(mdbObjs,COMPOSITE_VAR); if (compName == NULL) { verbose(1, "Object '%s' has no %s defined.\n",mdbObjs->obj,COMPOSITE_VAR); mdbProcessedObs = slCat(mdbProcessedObs,slPopHead(&mdbObjs)); continue; } } struct mdbObj *mdbCompositeObjs = mdbObjsFilter(&mdbObjs, COMPOSITE_VAR, compName,TRUE); // --- At this point we have nibbled off a composite worth of objects from the full set of objects // Find the composite obj if it exists struct mdbObj *compObj = mdbObjsFilter(&mdbCompositeObjs, "objType", "composite",TRUE); - if (compObj == NULL) + if (compObj == NULL) // May be NULL if mdbObjs passed in was produced by too narrow of selection criteria { dyStringClear(dyVars); dyStringPrintf(dyVars,"composite=%s %s=", compName,EDV_VAR_NAME); struct mdbByVar *mdbByVars = mdbByVarsLineParse(dyStringContents(dyVars)); compObj = mdbObjsQueryByVars(conn,tableName,mdbByVars); } // Obtain experiment defining variables for the composite dyStringClear(dyVars); if (compObj != NULL) { char *expVars = mdbObjFindValue(compObj,EDV_VAR_NAME); if (expVars) dyStringAppend(dyVars, expVars); // expVars in form of "var1 var2 var3" } if (dyStringLen(dyVars) == 0) { // figure them out? - // FIXME: White list of EDVs from the cv + // NOTE: Kate wants white list of EDVs from the cv. Wranglers satisfied with defining them in an mdbObj of objType=composite // Walk through the mdbCompositeObjs looking for matching vars. verbose(1, "There are no experiment defining variables established for this composite. 
Add them to obj %s => var:%s.\n",compName,EDV_VAR_NAME); mdbProcessedObs = slCat(mdbProcessedObs,mdbCompositeObjs); mdbCompositeObjs = NULL; continue; } // Parse into individual Experiment Defining Variables (no vals at the composite level) if (strchr(dyStringContents(dyVars), ',') != NULL) // Tolerate delimit by commas strSwapChar(dyStringContents(dyVars),',',' '); else if (strchr(dyStringContents(dyVars), ';') != NULL) // Tolerate delimit by semicolons strSwapChar(dyStringContents(dyVars),';',' '); struct slName *compositeEdvs = slNameListFromString(dyStringContents(dyVars), ' '); assert(slCount(compositeEdvs) > 0); @@ -2561,95 +2610,101 @@ dyStringPrintf(dyVars, " view replicate "); // Allows for nicer sorted list char *edvSortOrder = cloneString(dyStringContents(dyVars)); // Walk through objs for an exp as defined by EDVs int expCount=0; // Count of experiments in composite int expMissing=0; // Count of experiments with missing expId int expObjsCount=0; // Total of all experimental object accoss the composite int expMax=0; // Largest experiment (in number of objects) int expMin=999; // Smallest experiment (in number of objects) while(mdbCompositeObjs != NULL) { // Must sort each cycle, because sort order is lost during mdbObjs FilterByVars(); mdbObjsSortOnVars(&mdbCompositeObjs, edvSortOrder); // Construct the var=val string for the exp at the top of the stack + struct mdbVar *edvVars = NULL,*edvVar = NULL; dyStringClear(dyVars); struct dyString *filterVars = dyStringNew(256); struct slName *var = compositeEdvs; int valsFound = 0; for(;var!=NULL;var=var->next) { char *val = mdbObjFindValue(mdbCompositeObjs,var->name); // Looking at first obj in queue if (val) { valsFound++; dyStringPrintf(filterVars,"%s=%s ",var->name,val); - dyStringPrintf(dyVars,"%s=%s ",var->name,val); + edvVar = mdbVarAdd(&edvVars, var->name,val); + dyStringPrintf(dyVars,"%s=%s ",edvVar->var,edvVar->val); } else { - if (sameWord(var->name,SPECIES_VAR)) - dyStringPrintf(dyVars,"%s=%s 
",SPECIES_VAR,(startsWith("mm",db)?"Mouse":"Human")); // Can't go into mdbObj FilterVars - else + if (differentWord(var->name,ENCODE_EXP_FIELD_ORGANISM)) // Does not go into EDV's sent to encodeExp table { - dyStringPrintf(dyVars,"%s=None ",var->name); dyStringPrintf(filterVars,"%s=None ",var->name); + edvVar = mdbVarAdd(&edvVars, var->name,"None"); + dyStringPrintf(dyVars,"%s=%s ",edvVar->var,edvVar->val); } } } dyStringContents(dyVars)[dyStringLen(dyVars) -1] = '\0'; // Nicer printing is all if (valsFound == 0) { verbose(1, "There are no experiment defining variables for this object '%s'.\n",mdbCompositeObjs->obj); slAddHead(&mdbProcessedObs,slPopHead(&mdbCompositeObjs)); // We're done with this one dyStringFree(&filterVars); + mdbVarsFree(&edvVars); continue; } // Work on one experiment at a time struct mdbObj *mdbExpObjs = mdbObjsFilterByVars(&mdbCompositeObjs,dyStringContents(filterVars),TRUE,TRUE); dyStringFree(&filterVars); // --- At this point we have nibbled off an experiment worth of objects from the composite set of objects int objsInExp = slCount(mdbExpObjs); assert(objsInExp > 0); expCount++; expObjsCount += objsInExp; // Total of all experimental object across the composite // Look up each exp in EXPERIMENTS_TABLE - // FIXME: Kate. Need the encodeExp lib - // Further FIXME: dyStringContents(dyVars) could have species=hg18 when what would be desired is species=Human - // int expId = encodeExpGetExpId(dyStringContents(dyVars)); - int expId = -1; char experimentId[128]; + int expId = -1; + struct encodeExp *exp = encodeExpGetByMdbVars(db, edvVars); + if (exp == NULL && createExpIfNecessary) + exp = encodeExpGetOrCreateByMdbVars(db, edvVars); + mdbVarsFree(&edvVars); // No longer needed + + if (exp != NULL) + expId = exp->ix; if (expId == -1) { - // FIXME: Kate should provide an API to create an experiment in the hgFixed.encodeExp table. 
This will leave one algorithm for grouping experiments by EDVs safef(experimentId,sizeof(experimentId),"{missing}"); if (warn > 0) printf("Experiment %s EDV: [%s] is not defined in %s table.\n",experimentId,dyStringContents(dyVars),EXPERIMENTS_TABLE); //printf("Experiment %s EDV: [%s] is not defined in %s table. Remaining:%d and %d\n",experimentId,dyStringContents(dyVars),EXPERIMENTS_TABLE,slCount(mdbCompositeObjs),slCount(mdbObjs)); if (warn < 2) // From mdbUpdate (warn=1), just interested in testing waters. From mdbPrint (warn=2) list all objs in exp. { expMissing++; mdbProcessedObs = slCat(mdbProcessedObs,mdbExpObjs); mdbExpObjs = NULL; + encodeExpFree(&exp); continue; } } else { safef(experimentId,sizeof(experimentId),"%d",expId); if (warn > 0) printf("Experiment %s has %d objects based upon %d EDVs: [%s].\n",experimentId,slCount(mdbExpObjs),valsFound,dyStringContents(dyVars)); // Set the stage } // Now we can walk through each obj in experiment and determine if it has the coorect expId int foundId = FALSE; int errors = objsInExp; if (expMax < objsInExp) expMax = objsInExp; @@ -2683,84 +2738,102 @@ printf(" %s obj='%s' has bad %s=%s.\n",experimentId,obj->obj,EXP_ID_NAME,val); } } else { updateObj = (expId != -1); if ((foundId && warn > 0) || warn > 1) printf(" %s obj='%s' has no %s.\n",experimentId,obj->obj,EXP_ID_NAME); } // This object needs to be updated. 
if (updateObj) { mdbObjSetVarInt(obj,EXP_ID_NAME,expId); struct mdbObj *newObj = mdbObjCreate(obj->obj,EXP_ID_NAME, experimentId); - char buf[128]; - safef(buf,sizeof(buf),"wgEncode%c%06d",(startsWith("mm",db)?'M':'H'),expId); - mdbObjSetVar(newObj,DCC_ACCESSION,buf); + assert(exp != NULL); + mdbObjSetVar(newObj,DCC_ACCESSION,exp->accession); slAddHead(&mdbUpdateObjs,newObj); } slAddHead(&mdbProcessedObs,obj); } // Done with one experiment + encodeExpFree(&exp); if (!foundId && errors > 0) { expMissing++; if (warn > 0) printf(" %s all %d objects are missing an %s.\n",experimentId,objsInExp,EXP_ID_NAME); } } // Done with one composite if (expCount > 0) printf("Composite '%s' has %d recognizable experiment%s with %d missing an %s.\n objects/experiment: min:%d max:%d mean:%lf.\n", compName,expCount,(expCount != 1?"s":""),expMissing,EXP_ID_NAME,expMin,expMax,((double)expObjsCount/expCount)); if (edvSortOrder != NULL) freeMem(edvSortOrder); slNameFreeList(compositeEdvs); } // Done with all composites dyStringFree(&dyVars); *pMdbObjs = mdbProcessedObs; return mdbUpdateObjs; } +boolean mdbObjIsEncode(struct mdbObj *mdb) +// Return true if this metaDb object is for ENCODE +{ +char *project = mdbObjFindValue(mdb, "project"); +if (sameOk(project, ENCODE_MDB_PROJECT)) + return TRUE; +return FALSE; + +// Could be more stringent: +//return (mdbObjFindValue(mdbObj, "lab") != NULL && mdbObjFindValue(mdbObj, "dataType") != NULL && mdbObjFindValue(mdbObj, "subId")); +} + +boolean mdbObjInComposite(struct mdbObj *mdb, char *composite) +// Return true if metaDb object is in specified composite. +// If composite is NULL, always return true +{ +if (composite == NULL || sameOk(composite, mdbObjFindValue(mdb, "composite"))) + return TRUE; +return FALSE; +} // --------------- Free at last ---------------- void mdbObjsFree(struct mdbObj **mdbObjsPtr) // Frees one or more metadata objects and any contained mdbVars. Will free any hashes as well. 
{ if(mdbObjsPtr != NULL && *mdbObjsPtr != NULL) { // free all roots struct mdbObj *mdbObj = NULL; while((mdbObj = slPopHead(mdbObjsPtr)) != NULL) { // Free hash first (shared memory) hashFree(&(mdbObj->varHash)); // free all leaves - struct mdbVar *mdbVar = NULL; - while((mdbVar = slPopHead(&(mdbObj->vars))) != NULL) - mdbVarFree(&mdbVar); + mdbVarsFree(&(mdbObj->vars)); // The rest of root freeMem(mdbObj->obj); freeMem(mdbObj); } freez(mdbObjsPtr); } } void mdbByVarsFree(struct mdbByVar **mdbByVarsPtr) // Frees one or more metadata vars and any contained vals and objs. Will free any hashes as well. { if(mdbByVarsPtr != NULL && *mdbByVarsPtr != NULL) { // free all roots @@ -3191,50 +3264,15 @@ const char *cvLabel(char *term) // returns cv label if term found or else just term { // Get the list of term types from thew cv struct hash *termTypeHash = (struct hash *)mdbCvTermTypeHash(); struct hash *termHash = hashFindVal(termTypeHash,term); if (termHash != NULL) { char *label = hashFindVal(termHash,"label"); if (label != NULL) return label; } return term; } - -int mdbObjIsEncode(struct mdbObj *mdb) -/* Return true if this metaDb object is for ENCODE */ -{ -char *project = mdbObjFindValue(mdb, "project"); -if (sameOk(project, ENCODE_MDB_PROJECT)) - return TRUE; -return FALSE; -} - -int mdbObjInComposite(struct mdbObj *mdb, char *composite) -/* Return true if metaDb object is in specified composite. - If composite is NULL, always return true */ -{ -if (composite == NULL || sameOk(composite, mdbObjFindValue(mdb, "composite"))) - return TRUE; -return FALSE; -} - -struct mdbObj *mdbObjNew(char *name, struct mdbVar *vars) -/* Create an mdbObj from a name and var list */ -{ -struct mdbObj *mdb; -struct mdbVar *var; - -AllocVar(mdb); -mdb->obj = name; -mdb->vars = vars; -mdb->varHash = hashNew(0); -for (var = mdb->vars; var != NULL; var = var->next) - hashAdd(mdb->varHash, var->var, var); -return mdb; -} - -//NOTE: Need mdbObjFree