6327f6e56fb4615024546cae265f117c2c151320 tdreszer Fri May 31 17:27:33 2013 -0700 Three changes to attempt to recover from mysql performace degradation (ref #10910). 1. Added index to val. 2. Replaced subqueries with joins. 3. Replaced ORDER BY query with in code sort. diff --git src/hg/lib/mdb.c src/hg/lib/mdb.c index 6fce0a1..e1f9ec2 100644 --- src/hg/lib/mdb.c +++ src/hg/lib/mdb.c @@ -222,30 +222,55 @@ fputc('"',f); fputc('}',f); } /* -------------------------------- End autoSql Generated Code -------------------------------- */ #include "ra.h" #include "hgConfig.h" #include "obscure.h" #define MDB_METADATA_KEY "metadata" #define MDB_METAOBJ_RAKEY "metaObject" #define MDB_METAVAR_RAKEY "metaVariable" +// ------- Sorts for low level mdb structs ------- +int mdbCmp(const void *va, const void *vb) +// Compare mdb to sort on obj,var, case-sensitive. Needed to override ORDER_BY performance hit +{ +const struct mdb *a = *((struct mdb **)va); +const struct mdb *b = *((struct mdb **)vb); +int ret = strcmp(a->obj, b->obj); +if (ret == 0) + ret = strcmp(a->var, b->var); +return ret; +} + +int mdbVarValCmp(const void *va, const void *vb) +// Compare mdb to sort on var,val case-sensitive. Needed to override ORDER_BY performance hit +{ +const struct mdb *a = *((struct mdb **)va); +const struct mdb *b = *((struct mdb **)vb); +int ret = strcmp(a->var, b->var); +if (ret == 0) + ret = strcmp(a->val, b->val); +if (ret == 0) + ret = strcmp(a->obj, b->obj); +return ret; +} + // ------- (static) convert from autoSql ------- static void mdbLeafObjFree(struct mdbLeafObj **leafObjPtr) // Frees a single mdbVar struct { freeMem((*leafObjPtr)->obj); freez(leafObjPtr); } static void mdbLimbValFree(struct mdbLimbVal **limbValPtr) // Frees a single mdbVar struct { struct mdbLimbVal *limbVal = *limbValPtr; // Free hash first (shared memory) hashFree(&(limbVal->objHash)); @@ -905,31 +930,32 @@ } // ------ Table name and creation ------ void mdbReCreate(struct sqlConnection *conn,char *tblName,boolean testOnly) // Creates ore Recreates the named mdb. { char *sqlCreate = "# Contains metadata for a table, file or other objects.\n" "CREATE TABLE %s (\n" " obj varchar(255) not null, # Object name or ID.\n" " var varchar(255) not null, # Metadata variable name.\n" " val varchar(2048) not null, # Metadata value.\n" " #Indices\n" " PRIMARY KEY(obj,var),\n" - " INDEX varKey (var,val(32),obj)\n" + " INDEX varKey (var),\n" + " INDEX valKey (val(64))\n" ")"; if (sqlTableExists(conn,tblName)) verbose(2, "Table '%s' already exists. It will be recreated.\n",tblName); struct dyString *dy = newDyString(512); dyStringPrintf(dy, sqlCreate, tblName); verbose(2, "Requesting table creation:\n%s;\n", dyStringContents(dy)); if (!testOnly) sqlRemakeTable(conn, tblName, dyStringContents(dy)); dyStringFree(&dy); } #define HG_CONF_SANDBOX_MDB "db.metaDb" @@ -1212,34 +1238,34 @@ { if (mdbVar->var != NULL) dyStringPrintf(dy, " and "); dyStringPrintf(dy, "val %s '%s'", (strchr(mdbVar->val,'%') ? "like" : "="), sqlEscapeString(mdbVar->val)); if (mdbVar->var != NULL) dyStringPrintf(dy, ")"); } if (mdbVar->var == NULL && mdbVar->val) errAbort("mdbObjQuery has empty mdbVar struct.\n"); buildHash = FALSE; // too few variables } if (mdbObj->vars != NULL) dyStringPrintf(dy, ")"); } -dyStringPrintf(dy, " order by binary obj, var"); // binary forces case-sensitive sort -verbose(3, "Requesting query:\n\t%s;\n",dyStringContents(dy)); +verbose(2, "Requesting mdbObjQuery query:\n\t%s;\n",dyStringContents(dy)); struct mdb *mdb = mdbLoadByQuery(conn, dyStringCannibalize(&dy)); +slSort(&mdb,mdbCmp); // Use internal sort instead of ORDER BY because of mysql inefficiency struct mdbObj *mdbObjs = mdbObjsLoadFromMemory(&mdb,buildHash); verbose(3, "Returned %d object(s) with %d var(s).\n", mdbObjCount(mdbObjs,TRUE),mdbObjCount(mdbObjs,FALSE)); return mdbObjs; } struct mdbObj *mdbObjQueryByObj(struct sqlConnection *conn,char *table,char *obj,char *var) // Query a single metadata object and optional var from a table (default mdb). { if (obj == NULL) return mdbObjQuery(conn,table,NULL); struct mdbObj *queryObj = mdbObjCreate(obj,var,NULL); struct mdbObj *resultObj = mdbObjQuery(conn,table,queryObj); mdbObjsFree(&queryObj); @@ -1300,196 +1326,176 @@ (strchr(limbVal->val,'%')?"like":"="), sqlEscapeString(limbVal->val)); break; } else dyStringPrintf(dy, "in ("); multiVals=TRUE; } else dyStringPrintf(dy, ","); dyStringPrintf(dy, "'%s'", sqlEscapeString(limbVal->val)); } if (multiVals) dyStringPrintf(dy, ")"); dyStringPrintf(dy, ")"); } -dyStringPrintf(dy, " order by var, val, obj"); -verbose(2, "Requesting query:\n\t%s;\n",dyStringContents(dy)); +verbose(2, "Requesting mdbByVarsQuery query:\n\t%s;\n",dyStringContents(dy)); struct mdb *mdb = mdbLoadByQuery(conn, dyStringCannibalize(&dy)); verbose(3, "rows (vars) returned: %d\n",slCount(mdb)); +slSort(&mdb,mdbVarValCmp); // Use internal sort instead of ORDER BY because of mysql inefficiency struct mdbByVar *mdbByVarsFromMem = mdbByVarsLoadFromMemory(&mdb,TRUE); verbose(3, "Returned %d vars(s) with %d val(s) with %d object(s).\n", mdbByVarCount(mdbByVarsFromMem,TRUE,FALSE), mdbByVarCount(mdbByVarsFromMem,FALSE,TRUE ), mdbByVarCount(mdbByVarsFromMem,FALSE,FALSE)); return mdbByVarsFromMem; } struct mdbByVar *mdbByVarQueryByVar(struct sqlConnection *conn,char *table,char *varName,char *val) // Query a single metadata variable & optional val from a table (default mdb) for searching val->obj { if (varName == NULL) return mdbByVarsQuery(conn,table,NULL); struct mdbByVar *queryVar = mdbByVarCreate(varName,val); struct mdbByVar *resultVar = mdbByVarsQuery(conn,table,queryVar); mdbByVarsFree(&queryVar); return resultVar; } struct mdbObj *mdbObjsQueryByVars(struct sqlConnection *conn,char *table,struct mdbByVar *mdbByVars) // Query the metadata table by one or more var=val pairs // to find the distinct set of objs that satisfy ALL conditions. // Returns new mdbObj struct fully populated and sorted in obj,var order. { // MOST POPULAR WAY TO QUERY MDB. Building example queries like: // "cell=GM12878" or "cell!=GM12878" -// SELECT T1.obj,T1.var,T1.val FROM metaDb T1 -// WHERE EXISTS (SELECT T2.obj FROM metaDb T2 -// WHERE T2.obj = T1.obj AND T2.var = 'cell' AND T2.val = 'GM12878') +// => SELECT T1.obj,T1.var,T1.val FROM metaDb T1 +// JOIN metaDb T2 WHERE T1.obj = T2.obj AND T2.var = 'cell' AND T2.val = 'GM12878' // ORDER BY T1.obj, T1.var; -// SELECT T1.obj,T1.var,T1.val FROM metaDb T1 -// WHERE EXISTS (SELECT T2.obj FROM metaDb T2 -// WHERE T2.obj = T1.obj AND T2.var = 'cell' AND T2.val != 'GM12878') +// => SELECT T1.obj,T1.var,T1.val FROM metaDb T1 +// JOIN metaDb T2 WHERE T1.obj = T2.obj AND T2.var = 'cell' AND T2.val != 'GM12878' // ORDER BY T1.obj, T1.var; // "cell=GM%" or "cell!=GM%" -// SELECT T1.obj,T1.var,T1.val FROM metaDb T1 -// WHERE EXISTS (SELECT T2.obj FROM metaDb T2 -// WHERE T2.obj = T1.obj AND T2.var = 'cell' AND T2.val LIKE 'GM%') +// => SELECT T1.obj,T1.var,T1.val FROM metaDb T1 +// JOIN metaDb T2 WHERE T1.obj = T2.obj AND T2.var = 'cell' AND T2.val LIKE 'GM%' // ORDER BY T1.obj, T1.var; -// SELECT T1.obj,T1.var,T1.val FROM metaDb T1 -// WHERE EXISTS (SELECT T2.obj FROM metaDb T2 -// WHERE T2.obj = T1.obj AND T2.var = 'cell' AND T2.val NOT LIKE 'GM%') +// => SELECT T1.obj,T1.var,T1.val FROM metaDb T1 +// JOIN metaDb T2 WHERE T1.obj = T2.obj AND T2.var = 'cell' AND T2.val NOT LIKE 'GM%' // ORDER BY T1.obj, T1.var; -// "cell=" or "cell!=" -// SELECT T1.obj,T1.var,T1.val FROM metaDb T1 -// WHERE EXISTS (SELECT T2.obj FROM metaDb T2 -// WHERE T2.obj = T1.obj AND T2.var = 'cell') +// "cell=" or "cell!=" NOTE the tricky and inscrutable left join for cell!= +// => SELECT T1.obj,T1.var,T1.val FROM metaDb T1 +// JOIN metaDb T2 WHERE T1.obj = T2.obj AND T2.var = 'cell' // ORDER BY T1.obj, T1.var; -// SELECT T1.obj,T1.var,T1.val FROM metaDb T1 -// WHERE NOT EXISTS (SELECT T2.obj FROM metaDb T2 -// WHERE T2.obj = T1.obj AND T2.var = 'cell') +// => SELECT T1.obj,T1.var,T1.val FROM metaDb T1 +// LEFT JOIN metaDb T2 WHERE T1.obj = T2.obj AND T2.var = 'cell' WHERE T2.obj IS NULL // ORDER BY T1.obj, T1.var; // "cell=GM12878,K562" or "cell!=GM12878,K562" -// SELECT T1.obj,T1.var,T1.val FROM metaDb T1 -// WHERE EXISTS (SELECT T2.obj FROM metaDb T2 -// WHERE T2.obj = T1.obj AND T2.var = 'cell' AND T2.val IN ('GM12878','K562')) +// => SELECT T1.obj,T1.var,T1.val FROM metaDb T1 +// JOIN metaDb T2 WHERE T1.obj = T2.obj +// AND T2.var = 'cell' AND T2.val IN ('GM12878','K562')' // ORDER BY T1.obj, T1.var; -// SELECT T1.obj,T1.var,T1.val FROM metaDb T1 -// WHERE EXISTS (SELECT T2.obj FROM metaDb T2 -// WHERE T2.obj = T1.obj AND T2.var = 'cell' -// AND T2.val NOT IN ('K562','GM12878')) +// => SELECT T1.obj,T1.var,T1.val FROM metaDb T1 +// JOIN metaDb T2 WHERE T1.obj = T2.obj +// AND T2.var = 'cell' AND T2.val NOT IN ('GM12878','K562')' // ORDER BY T1.obj, T1.var; // "cell=GM% cell!=GM12878" (very powerful) // SELECT T1.obj,T1.var,T1.val FROM metaDb T1 -// WHERE EXISTS (SELECT T2.obj FROM metaDb T2 -// WHERE T2.obj = T1.obj AND T2.var = 'cell' AND T2.val LIKE 'GM%') -// AND EXISTS (SELECT T3.obj FROM metaDb T3 -// WHERE T3.obj = T1.obj AND T3.var = 'cell' AND T3.val != 'GM12878') +// JOIN metaDb T2 WHERE T1.obj = T2.obj AND T2.var = 'cell' AND T2.val LIKE 'GM%' +// JOIN metaDb T3 WHERE T1.obj = T3.obj AND T3.var = 'cell' AND T3.val != 'GM12878' // ORDER BY T1.obj, T1.var; if (table == NULL) table = mdbTableName(conn,TRUE); // defaults to sandbox, if exists, else MDB_DEFAULT_NAME else if (!sqlTableExists(conn,table)) return NULL; struct dyString *dy = newDyString(4096); dyStringPrintf(dy, "SELECT T1.obj,T1.var,T1.val FROM %s T1", table); struct mdbByVar *rootVar; -boolean gotVar = FALSE; int tix; for (rootVar=mdbByVars,tix=2;rootVar!=NULL;rootVar=rootVar->next,tix++) { boolean hasVal = (rootVar->vals != NULL); - //boolean hasVal = ( rootVar->vals != NULL - // && rootVar->vals->val != NULL - // && strlen(rootVar->vals->val) > 0); - if (!gotVar) - { - dyStringPrintf(dy, " WHERE "); - gotVar=TRUE; - } - else - dyStringPrintf(dy, " AND "); + boolean varWild = (strchr(rootVar->var,'%') != NULL); + // If you want objects where var='cell' does not exist, then we need the tricky and inscrutable + // LEFT JOIN metaDb T2 ON T2.obj = T1.obj AND T2.var = 'cell' WHERE T2.obj is NULL if (!hasVal && rootVar->notEqual) - dyStringPrintf(dy, "NOT EXISTS "); - else - dyStringPrintf(dy, "EXISTS "); + dyStringAppend(dy, " LEFT"); - dyStringPrintf(dy, "(SELECT T%d.obj FROM %s T%d WHERE T%d.obj = T1.obj AND T%d.var ", - tix,table,tix,tix,tix); + dyStringPrintf(dy, " JOIN %s T%d ON T%d.obj = T1.obj AND T%d.var ",table,tix,tix,tix); + // var = 'x' || var != 'x' || var LIKE 'x%' || var NOT LIKE 'x%' if (hasVal && rootVar->notEqual && rootVar->vals == NULL) - dyStringPrintf(dy, "%s",strchr(rootVar->var,'%')?"NOT ":"!"); + dyStringAppend(dy, (varWild ? "NOT " : "!")); + dyStringPrintf(dy, "%s '%s'",(varWild ? "LIKE" : "="), rootVar->var); - dyStringPrintf(dy, "%s '%s'", - (strchr(rootVar->var,'%') ? "LIKE" : "="), rootVar->var); + // Finish the tricky and inscrutable LEFT JOIN / WHERE NULL + if (!hasVal && rootVar->notEqual) + dyStringPrintf(dy, " WHERE T%d.obj IS NULL",tix); + // Now 1 or more vals. First some booleans struct mdbLimbVal *limbVal; boolean multiVals = (rootVar->vals != NULL && rootVar->vals->next != NULL); boolean wilds = FALSE; - for (limbVal=rootVar->vals;limbVal!=NULL;limbVal=limbVal->next) - { - if (strchr(limbVal->val,'%') != NULL) - wilds = TRUE; - } + for (limbVal=rootVar->vals;wilds == FALSE && limbVal!=NULL; limbVal=limbVal->next) + wilds = (limbVal->val != NULL && strchr(limbVal->val,'%') != NULL); // breaks when true + + // Now walk through vals creating: + // AND (val = 'x' or val = 'y') or val IN ('a','b','c') or val NOT LIKE 'd%' for (limbVal=rootVar->vals;limbVal!=NULL;limbVal=limbVal->next) { if (limbVal->val == NULL || strlen(limbVal->val) < 1) continue; if (limbVal==rootVar->vals) // First val { if (wilds && multiVals) - dyStringPrintf(dy, " AND (T%d.val ",tix); + dyStringAppend(dy, " AND ("); // starts AND (val LIKE 'd%' or ...) else - dyStringPrintf(dy, " AND T%d.val ",tix); + dyStringAppend(dy, " AND "); // starts AND val = 'a' || AND val IN ('a','b'...) + dyStringPrintf(dy, "T%d.val ",tix); } else // successive vals { if (wilds && multiVals) - dyStringPrintf(dy, " or T%d.val ",tix); + dyStringPrintf(dy, " or T%d.val ",tix); // continues LIKE 'd%' else - dyStringPrintf(dy, ","); + dyStringAppend(dy, ","); // continues IN ('a' } if (limbVal==rootVar->vals // First val || (wilds && multiVals)) // and successive if wildcards { + boolean valWild = (strchr(limbVal->val,'%') != NULL); if (rootVar->notEqual) - dyStringPrintf(dy, "%s",(strchr(limbVal->val,'%') || limbVal->next)?"NOT ":"!"); - if (strchr(limbVal->val,'%') != NULL) - dyStringPrintf(dy, "LIKE "); - else if (!multiVals || wilds) - dyStringPrintf(dy, "= "); - else - dyStringPrintf(dy, "IN ("); + dyStringAppend(dy, (valWild || limbVal->next)?"NOT ":"!"); + dyStringAppend(dy, (valWild ? "LIKE " : (!multiVals || wilds ? "= " : "IN ("))); } dyStringPrintf(dy, "'%s'", sqlEscapeString(limbVal->val)); } if (multiVals) - dyStringPrintf(dy, ")"); - dyStringPrintf(dy, ")"); + dyStringPrintf(dy, ")"); // closes IN ('a','b','c') || AND (val LIKE 'd%' or val LIKE 'e%') } -dyStringPrintf(dy, " ORDER BY binary T1.obj, T1.var"); // binary forces case sensitive sort -verbose(2, "Requesting query:\n\t%s;\n",dyStringContents(dy)); +verbose(2, "Requesting mdbObjsQueryByVars query:\n\t%s;\n",dyStringContents(dy)); struct mdb *mdb = mdbLoadByQuery(conn, dyStringCannibalize(&dy)); verbose(3, "rows (vars) returned: %d\n",slCount(mdb)); +slSort(&mdb,mdbCmp); // Use internal sort instead of ORDER BY because of mysql inefficiency struct mdbObj *mdbObjs = mdbObjsLoadFromMemory(&mdb,TRUE); verbose(3, "Returned %d object(s) with %d var(s).\n", mdbObjCount(mdbObjs,TRUE),mdbObjCount(mdbObjs,FALSE)); return mdbObjs; } struct mdbObj *mdbObjsQueryByVarValString(struct sqlConnection *conn,char *tableName,char *varVals) // returns mdbObjs matching varVals in form of: // [var1=val1 var2=val2a,val2b var3=v%3 var4="val 4" var5!=val5 var6=] // var2=val2a,val2b: matches asny of comma separated list // var3=v%3 : matches '%' and '?' wild cards. // var4="val 4" : matches simple double quoted strings. // var5!=val5 : matches not equal. // var6= : matches that var exists (same as var6=%). var6!= also works. { @@ -3312,54 +3318,54 @@ struct mdbObj *mdbObjSearch(struct sqlConnection *conn, char *var, char *val, char *op, int limit) // Search the metaDb table for objs by var and val. // Can restrict by op "is", "like", "in" and accept (non-zero) limited string size // Search is via mysql, so it's case-insensitive. Return is sorted on obj. { if (var == NULL && val == NULL) errAbort("mdbObjSearch requests objects but provides no criteria.\n"); char *tableName = mdbTableName(conn,TRUE); // Look for sandBox name first // Build a query string struct dyString *dyQuery = dyStringNew(512); dyStringPrintf(dyQuery,"select l1.obj, l1.var, l1.val from %s l1",tableName); if (var != NULL || val != NULL) - dyStringPrintf(dyQuery," where exists (select l2.obj from %s l2 where l2.obj = l1.obj and ", + dyStringPrintf(dyQuery," JOIN %s l2 ON l2.obj = l1.obj and ", tableName); if (var != NULL) dyStringPrintf(dyQuery,"l2.var = '%s'", var); if (var != NULL && val != NULL) dyStringAppend(dyQuery," and "); if (val != NULL) { dyStringAppend(dyQuery,"l2.val "); if (sameString(op, "in")) dyStringPrintf(dyQuery,"in (%s)", val); // Note, must be a formatted string already: 'a','b','c' or 1,2,3 else if (sameString(op, "contains") || sameString(op, "like")) dyStringPrintf(dyQuery,"like '%%%s%%'", val); else if (limit > 0 && strlen(val) != limit) dyStringPrintf(dyQuery,"like '%.*s%%'", limit, val); else dyStringPrintf(dyQuery,"= '%s'", val); } -dyStringAppendC(dyQuery,')'); -dyStringAppend(dyQuery," order by obj"); +verbose(2, "Requesting mdbObjSearch query:\n\t%s;\n",dyStringContents(dyQuery)); struct mdb *mdb = mdbLoadByQuery(conn, dyStringCannibalize(&dyQuery)); verbose(3, "rows (vars) returned: %d\n",slCount(mdb)); +slSort(&mdb,mdbCmp); // Use internal sort instead of ORDER BY because of mysql inefficiency struct mdbObj *mdbObjs = mdbObjsLoadFromMemory(&mdb,TRUE); return mdbObjs; } struct mdbObj *mdbObjRepeatedSearch(struct sqlConnection *conn,struct slPair *varValPairs, boolean tables,boolean files) // Search the metaDb table for objs by var,val pairs. Uses mdbCvSearchMethod() if available. // This method will use mdbObjsQueryByVars() { struct slPair *onePair; struct dyString *dyTerms = dyStringNew(256); // Build list of terms as "var1=val1 var2=val2a,val2b,val2c var3=%val3%" for (onePair = varValPairs; onePair != NULL; onePair = onePair->next) {