c8e1dc987ba5532c65be54d8f43af1ec4337fb10 angie Sat Nov 21 09:52:07 2015 -0800 Back-end implementation of Data Integrator's support for related tables and fields using all.joiner. Most joins are implemented using a new module, hashJoin.c; but SQL joins are used in certain cases when hash joins are impractical and SQL joins are actually faster. A new module joinMixer determines which joins should be implemented by hashJoin vs SQL, and computes row indices for hashJoin objects to find keys (from SQL or other hashJoins) and store results. The SQL join info from joinMixer is translated into SQL queries in annoStreamDb. annoStreamDb also generates its own autoSql asObject, adding the fields from related tables after the fields of the main track table. Main changes: - annoStreamDb.c - main table SQL query now uses <table>.<field> instead of just <field> to avoid clashes with same field name in different tables - SQL joins return multiple rows for a single main table row when there are multiple matching rows in a related table; these rows need to be squashed into one row with the multiple matches comma-separated, both to match hgTables behavior and to avoid overflow of rowBuf. (glomSqlDup) - as mentioned above, generate joining SQL queries when necessary and generate own asObj including selected fields from related tables. - parse JSON config object with relatedTables spec from UI via hgi_querySpec hashJoin basically slurps a related table into a big hash of keys to values, perform lookups (possibly of multiple keys), and formats each column's results. It includes a lot of tweaks to match hgTables/joining.c output char-for-char: collapse adjacent duplicate matches, commas at end of matches from multiple key lookups, reversed order of multiple match values. hgTables/joining.c uses arrays of slNames, but in order to avoid all that allocation I'm just glomming into an array of reused dyStrings. joinMixer takes a list of fields to include in output, gets a list of joins to be performed (from joinerRouteThroughAll), applies some simple rough heuristics to guess whether a join is practical in SQL, and decides which joins to do by SQL and which to do by hashJoin. It plans a row format with several groups of fields in this order: main table fields, related table fields to appear in the output, related table fields needed by hashJoins, hashJoin result fields needed by other hashJoins, and hashJoin result fields to appear in output. It initializes hashJoins with precomputed row indexes and also provides a mapping from big-row columns to the columns that appear in output. Thanks to Matt for testing on demo6 during development. refs #15544 diff --git src/hg/lib/joiner.c src/hg/lib/joiner.c index 7ff4883..280f0f1 100644 --- src/hg/lib/joiner.c +++ src/hg/lib/joiner.c @@ -957,59 +957,101 @@ struct joinerDtf *joinerDtfNew(char *database, char *table, char *field) /* Create new joinerDtf. */ { struct joinerDtf *dtf; AllocVar(dtf); dtf->database = cloneString(database); dtf->table = cloneString(table); dtf->field = cloneString(field); return dtf; } struct joinerDtf *joinerDtfClone(struct joinerDtf *dtf) /* Return duplicate (deep copy) of joinerDtf. */ { -return joinerDtfNew(dtf->database, dtf->table, dtf->field); +return dtf ? joinerDtfNew(dtf->database, dtf->table, dtf->field) : NULL; } static void notTriple(char *s) /* Complain that s is not in dotted triple format. */ { errAbort("%s not a dotted triple", s); } struct joinerDtf *joinerDtfFromDottedTriple(char *triple) /* Get joinerDtf from something in db.table.field format. */ { char *s, *e; struct joinerDtf *dtf; AllocVar(dtf); s = triple; e = strchr(s, '.'); if (e == NULL) notTriple(triple); dtf->database = cloneStringZ(s, e-s); s = e+1; e = strrchr(s, '.'); if (e == NULL) notTriple(triple); dtf->table = cloneStringZ(s, e-s); dtf->field = cloneString(e+1); return dtf; } +boolean joinerDtfSame(struct joinerDtf *dtfA, struct joinerDtf *dtfB) +/* Return TRUE if both are NULL or if both have same db, table and field. */ +{ +if (dtfA == NULL && dtfB == NULL) + return TRUE; +else if (dtfA != NULL && dtfB != NULL) + return (sameString(dtfA->database, dtfB->database) && + sameString(dtfA->table, dtfB->table) && + sameString(dtfA->field, dtfB->field)); +return FALSE; +} + +struct joinerDtf *joinerDtfFind(struct joinerDtf *dtfList, struct joinerDtf *dtf) +/* Return the first element of dtfList that is joinerDtfSame as dtf, or NULL if no such. */ +{ +struct joinerDtf *el; +for (el = dtfList; el != NULL; el = el->next) + if (joinerDtfSame(el, dtf)) + return el; +return NULL; +} + +void joinerDtfToSqlFieldString(struct joinerDtf *dtf, char *db, char *buf, size_t bufSize) +/* If dtf->database is different from db (or db is NULL), write database.table.field info buf, + * otherwise just table.field. */ +{ +if (differentString(dtf->database, db)) + safef(buf, bufSize, "%s.%s.%s", dtf->database, dtf->table, dtf->field); +else + safef(buf, bufSize, "%s.%s", dtf->table, dtf->field); +} + +void joinerDtfToSqlTableString(struct joinerDtf *dtf, char *db, char *buf, size_t bufSize) +/* If dtf->database is different from db (or db is NULL), write database.table info buf, + * otherwise just table. */ +{ +if (db == NULL || differentString(dtf->database, db)) + safef(buf, bufSize, "%s.%s", dtf->database, dtf->table); +else + safef(buf, bufSize, "%s", dtf->table); +} + void joinerDtfFree(struct joinerDtf **pDtf) /* Free up resources associated with joinerDtf. */ { struct joinerDtf *dtf = *pDtf; if (dtf != NULL) { freeMem(dtf->database); freeMem(dtf->table); freeMem(dtf->field); freez(pDtf); } } void joinerDtfFreeList(struct joinerDtf **pList) /* Free up memory associated with list of joinerDtfs. */ @@ -1111,30 +1153,56 @@ struct slRef *joinerSetInheritanceChain(struct joinerSet *js) /* Return list of self, children, and parents (but not siblings). * slFreeList result when done. */ { struct slRef *list = NULL; struct joinerSet *parent; /* Add self and parents. */ for (parent = js; parent != NULL; parent = parent->parent) refAdd(&list, parent); addChildren(js, &list); slReverse(&list); return list; } +struct joinerField *joinerSetFindField(struct joinerSet *js, struct joinerDtf *dtf) +/* Find field in set if any that matches dtf */ +{ +struct slRef *chain = joinerSetInheritanceChain(js), *link; +struct joinerField *jf, *ret = NULL; +for (link = chain; link != NULL; link = link->next) + { + js = link->val; + for (jf = js->fieldList; jf != NULL; jf = jf->next) + { + if (sameString(dtf->table, jf->table) && sameString(dtf->field, jf->field)) + { + if (slNameInList(jf->dbList, dtf->database)) + { + ret = jf; + break; + } + } + } + if (ret != NULL) + break; + } +slFreeList(&chain); +return ret; +} + static struct joinerPair *joinerToField( char *aDatabase, struct joinerField *aJf, char *bDatabase, struct joinerField *bJf, struct joinerSet *identifier) /* Construct joiner pair linking from a to b. */ { struct joinerPair *jp; AllocVar(jp); jp->a = joinerDtfNew(aDatabase, aJf->table, aJf->field); jp->b = joinerDtfNew(bDatabase, bJf->table, bJf->field); jp->identifier = identifier; return jp; } @@ -1358,15 +1426,78 @@ if (first->next == NULL) return NULL; for (dtf = first->next; dtf != NULL; dtf = dtf->next) { if (!inRoute(fullRoute, dtf) && !joinerDtfSameTable(first, dtf)) { pairRoute = joinerFindRoute(joiner, first, dtf); fullRoute = slCat(fullRoute, pairRoute); } } joinerPairRemoveDupes(&fullRoute); return fullRoute; } +char *joinerFieldChopKey(struct joinerField *jf, char *key) +/* If jf includes chopBefore and/or chopAfter, apply those to key and return a starting + * offset in key, which may be modified. */ +{ +struct slName *n; +for (n = jf->chopBefore; n != NULL; n = n->next) + { + if (startsWith(n->name, key)) + { + key += strlen(n->name); + break; + } + } +for (n = jf->chopAfter; n!=NULL; n = n->next) + { + char *e = strstr(key, n->name); + if (e != NULL) + { + *e = 0; + break; + } + } +return key; +} + +void joinerFieldIterateKey(struct joinerField *jf, void(*callback)(void *context, char *key), + void *context, char *key) +/* Process key according to jf -- if jf->separator, may result in list of processed keys -- + * and invoke callback with each nonempty processed key and context. */ +{ +if (isNotEmpty(jf->separator) || jf->chopBefore || jf->chopAfter) + { + int len = strlen(key); + char keyClone[len+1]; + safencpy(keyClone, sizeof(keyClone), key, len); + if (isEmpty(jf->separator)) + { + // No separator; just chop. + char *choppedKey = joinerFieldChopKey(jf, keyClone); + if (isNotEmpty(choppedKey)) + callback(context, choppedKey); + } + else + { + // Scan for separator; chop each separated key. + char *s = keyClone, *e; + char sep = jf->separator[0]; + while (isNotEmpty(s)) + { + e = strchr(s, sep); + if (e != NULL) + *e++ = 0; + s = joinerFieldChopKey(jf, s); + if (s[0] != 0) + callback(context, s); + s = e; + } + } + } +else if (isNotEmpty(key)) + // Just use the plain old key. + callback(context, key); +}