c8e1dc987ba5532c65be54d8f43af1ec4337fb10 angie Sat Nov 21 09:52:07 2015 -0800

Back-end implementation of Data Integrator's support for related tables and fields using
all.joiner. Most joins are implemented using a new module, hashJoin.c; but SQL joins are used
in certain cases when hash joins are impractical and SQL joins are actually faster. A new
module, joinMixer, determines which joins should be implemented by hashJoin vs. SQL, and
computes row indices for hashJoin objects to find keys (from SQL or other hashJoins) and store
results. The SQL join info from joinMixer is translated into SQL queries in annoStreamDb.
annoStreamDb also generates its own autoSql asObject, adding the fields from related tables
after the fields of the main track table.

Main changes:
- annoStreamDb.c
  - The main table SQL query now uses <table>.<field> instead of just <field>, to avoid clashes
    with the same field name in different tables.
  - SQL joins return multiple rows for a single main table row when there are multiple matching
    rows in a related table; these rows need to be squashed into one row with the multiple
    matches comma-separated, both to match hgTables behavior and to avoid overflow of rowBuf.
    (glomSqlDup)
  - As mentioned above, generates joining SQL queries when necessary and generates its own asObj
    including selected fields from related tables.
  - Parses the JSON config object with the relatedTables spec from the UI via hgi_querySpec.

hashJoin basically slurps a related table into a big hash of keys to values, performs lookups
(possibly of multiple keys), and formats each column's results. It includes a lot of tweaks to
match hgTables/joining.c output char-for-char: collapsing adjacent duplicate matches, commas at
the end of matches from multiple key lookups, and reversed order of multiple match values.
hgTables/joining.c uses arrays of slNames, but in order to avoid all that allocation I'm just
glomming into an array of reused dyStrings.

joinMixer takes a list of fields to include in the output, gets a list of joins to be performed
(from joinerRouteThroughAll), applies some simple rough heuristics to guess whether a join is
practical in SQL, and decides which joins to do by SQL and which by hashJoin. It plans a row
format with several groups of fields in this order: main table fields, related table fields to
appear in the output, related table fields needed by hashJoins, hashJoin result fields needed
by other hashJoins, and hashJoin result fields to appear in the output. It initializes hashJoins
with precomputed row indexes and also provides a mapping from big-row columns to the columns
that appear in the output.

Thanks to Matt for testing on demo6 during development.

refs #15544

diff --git src/hg/lib/joinMixer.c src/hg/lib/joinMixer.c
new file mode 100644
index 0000000..9b5316a
--- /dev/null
+++ src/hg/lib/joinMixer.c
@@ -0,0 +1,363 @@
+/* joinMixer - implement multiple joins by sql joins and/or hashJoins, depending on which
+ * should work best for each joined table. */
+
+/* Copyright (C) 2015 The Regents of the University of California
+ * See README in this or parent directory for licensing information. */
+
+#include "joinMixer.h"
+#include "hdb.h"
+
+static void addUniqueJoinerDt(struct joinerDtf **pTableList, struct hash *uniqHash,
+                              struct joinerDtf *dt, char *db)
+/* Use a string key into the hash to build up a unique list of cloned joinerDtf's with
+ * ignored (possibly NULL) fields (table-only, hence "dt" not "dtf"). */
+{
+char dbTable[PATH_LEN];
+joinerDtfToSqlTableString(dt, db, dbTable, sizeof(dbTable));
+if (! hashLookup(uniqHash, dbTable))
+    {
+    slAddHead(pTableList, joinerDtfClone(dt));
+    hashAdd(uniqHash, dbTable, NULL);
+    }
+}
+
+static struct joinerDtf *tablesInOutput(char *db, char *mainTable,
+                                        struct joinerDtf *outputFieldList)
+/* Return a uniquified list of tables (NULL for field) that we need for selected output fields
+ * and filters, if applicable. Joining these tables may require additional intermediate tables. */
+{
+struct hash *uniqHash = hashNew(0);
+struct joinerDtf *tableList = NULL;
+// Always include the main table, even if its fields aren't in output.
+struct joinerDtf *mainDt = joinerDtfNew(db, mainTable, NULL);
+addUniqueJoinerDt(&tableList, uniqHash, mainDt, db);
+struct joinerDtf *dtf;
+for (dtf = outputFieldList; dtf != NULL; dtf = dtf->next)
+    addUniqueJoinerDt(&tableList, uniqHash, dtf, db);
+hashFree(&uniqHash);
+slReverse(&tableList);
+return tableList;
+}
+
+static boolean noFancyKeywords(struct joinerField *jfA, struct joinerField *jfB)
+/* Return TRUE unless jfA or jfB has a keyword that is too fancy for sql joins. */
+{
+if (isNotEmpty(jfA->separator) && differentString(jfA->separator, ","))
+    return FALSE;
+if (isNotEmpty(jfB->separator))
+    return FALSE;
+if (jfA->chopBefore || jfA->chopAfter || jfB->chopBefore || jfB->chopAfter)
+    return FALSE;
+return TRUE;
+}
+
+static boolean dependsOn(struct joinerPair *jpA, struct joinerPair *jpBList)
+/* Return TRUE if jpA->a's table is found among the ->b's in jpBList. */
+{
+struct joinerPair *jpB;
+for (jpB = jpBList; jpB != NULL; jpB = jpB->next)
+    {
+    if (joinerDtfSameTable(jpA->a, jpB->b))
+        return TRUE;
+    }
+return FALSE;
+}
+
+static void partitionJoins(char *db, struct joinerPair *routeList, uint mainTableRowCount,
+                           struct joinerPair **retSqlRouteList,
+                           struct joinerPair **retHashJoinRouteList)
+/* For each table pair in routeList, figure out whether a sql join is possible and
+ * perhaps faster than brute force join. If it looks like sql is faster, add to
+ * retSqlRouteList, otherwise to retHashJoinRouteList. */
+{
+struct sqlConnection *conn = hAllocConn(db);
+boolean mysqlFasterForMainTable = mainTableRowCount < 10000;
+struct joinerPair *jp, *jpNext;
+for (jp = routeList; jp != NULL; jp = jpNext)
+    {
+    jpNext = jp->next;
+    boolean useSql = FALSE;
+    struct joinerField *jfA = joinerSetFindField(jp->identifier, jp->a);
+    struct joinerField *jfB = joinerSetFindField(jp->identifier, jp->b);
+    if (noFancyKeywords(jfA, jfB))
+        {
+        char dbTable[PATH_LEN];
+        joinerDtfToSqlTableString(jp->b, db, dbTable, sizeof(dbTable));
+        uint rowCountB = sqlRowCount(conn, dbTable);
+        boolean relatedMuchBigger = mainTableRowCount + 10000000 < rowCountB;
+        if (mysqlFasterForMainTable || relatedMuchBigger)
+            useSql = TRUE;
+        }
+    if (useSql && !dependsOn(jp, *retHashJoinRouteList))
+        slAddHead(retSqlRouteList, jp);
+    else
+        slAddHead(retHashJoinRouteList, jp);
+    }
+slReverse(retSqlRouteList);
+slReverse(retHashJoinRouteList);
+hFreeConn(&conn);
+}
+
+static struct hash *tablesInRouteList(char *db, char *mainTable, struct joinerPair *routeList)
+/* Return a hash of names of tables in routeList to NULL values. */
+{
+struct hash *hash = hashNew(0);
+// Always add mainTable
+char dbTable[PATH_LEN];
+hashAdd(hash, mainTable, NULL);
+// Add all pairs' ->b's
+struct joinerPair *jp;
+for (jp = routeList; jp != NULL; jp = jp->next)
+    {
+    joinerDtfToSqlTableString(jp->b, db, dbTable, sizeof(dbTable));
+    hashAdd(hash, dbTable, NULL);
+    }
+return hash;
+}
+
+static void addField(struct hash *fieldToIx, struct joinerDtf *dtf, char *db, uint *pIx)
+/* If stringified dtf is not already in hash, add it with value=*pIx and increment *pIx. */
+{
+char dtField[PATH_LEN];
+joinerDtfToSqlFieldString(dtf, db, dtField, sizeof(dtField));
+if (!hashLookup(fieldToIx, dtField))
+    {
+    hashAddInt(fieldToIx, dtField, *pIx);
+    *pIx = *pIx + 1;
+    }
+}
+
+static uint mustFindIx(struct hash *fieldToIx, struct joinerDtf *dtf, char *db, char *errMsg)
+/* Look up nonnegative int value of stringified dtf; die if not found, otherwise return it. */
+{
+char dtField[PATH_LEN];
+joinerDtfToSqlFieldString(dtf, db, dtField, sizeof(dtField));
+int bigRowKeyIx = hashIntValDefault(fieldToIx, dtField, -1);
+if (bigRowKeyIx < 0)
+    errAbort("%s '%s'", errMsg, dtField);
+return (uint)bigRowKeyIx;
+}
+
+struct hjTableInfo
+// Info accumulator for constructing a new hashJoin
+{
+    struct hjTableInfo *next;
+    char *dbTable;
+    struct joinerPair *jp;
+    struct joinerDtf *hashValCols;
+};
+
+static struct hjTableInfo *hjTableInfoNew(char *dbTable, struct joinerPair *jp,
+                                          struct joinerDtf *hashValCol)
+// Return a new hjTableInfo for dbTable; if jp is non-NULL, jp->b is the hash key field.
+{
+struct hjTableInfo *self;
+AllocVar(self);
+self->dbTable = cloneString(dbTable);
+self->jp = jp;
+self->hashValCols = joinerDtfClone(hashValCol);
+return self;
+}
+
+static struct hjTableInfo *hjTableInfoFind(struct hjTableInfo *list, char *dbTable)
+// Just use slPair find because the first two fields are next and char *name.
+{
+return (struct hjTableInfo *)slPairFind((struct slPair *)list, dbTable);
+}
+
+static void hjTableInfoAddKey(struct hjTableInfo **pList, struct joinerPair *jp, char *db)
+// jp->b is the hash key field for its table; look up jp->b's table in pList, add if necessary,
+// setting the table's jp.
+{
+char dbTable[PATH_LEN];
+joinerDtfToSqlTableString(jp->b, db, dbTable, sizeof(dbTable));
+struct hjTableInfo *infoForTable = hjTableInfoFind(*pList, dbTable);
+if (infoForTable == NULL)
+    slAddHead(pList, hjTableInfoNew(dbTable, jp, NULL));
+else if (infoForTable->jp != NULL)
+    errAbort("Multiple keys for %s: %s and %s", dbTable, infoForTable->jp->b->field, jp->b->field);
+else
+    infoForTable->jp = jp;
+}
+
+static void hjTableInfoAddVal(struct hjTableInfo **pList, struct joinerDtf *dtf, char *db)
+// Find dtf in pList -- it should be there already if hjRouteList is traversed in order --
+// and add dtf to its hash value column list if not already there.
+{
+char dbTable[PATH_LEN];
+joinerDtfToSqlTableString(dtf, db, dbTable, sizeof(dbTable));
+struct hjTableInfo *infoForTable = hjTableInfoFind(*pList, dbTable);
+if (infoForTable == NULL)
+    {
+    errAbort("hjTableInfoAddVal: can't find table '%s' in list", dbTable);
+    }
+else if (! joinerDtfFind(infoForTable->hashValCols, dtf))
+    slAddTail(&infoForTable->hashValCols, joinerDtfClone(dtf));
+}
+
+static void hjTableInfoFree(struct hjTableInfo **pHjti)
+// Free up *pHjti & contents if not NULL.
+{
+if (! pHjti || ! *pHjti)
+    return;
+struct hjTableInfo *hjti = *pHjti;
+freeMem(hjti->dbTable);
+joinerDtfFreeList(&hjti->hashValCols);
+freez(pHjti);
+}
+
+static void hjTableInfoFreeList(struct hjTableInfo **pHjtiList)
+// Free up every member of *pHjtiList if not NULL.
+{
+if (! pHjtiList || ! *pHjtiList)
+    return;
+struct hjTableInfo *hjti = *pHjtiList;
+while (hjti != NULL)
+    {
+    struct hjTableInfo *hjtiNext = hjti->next;
+    hjTableInfoFree(&hjti);
+    hjti = hjtiNext;
+    }
+*pHjtiList = NULL;
+}
+
+struct joinMixer *joinMixerNew(struct joiner *joiner, char *db, char *mainTable,
+                               struct joinerDtf *outputFieldList,
+                               uint mainTableRowCount, boolean naForMissing)
+/* If outputFieldList contains fields from more than one table, use joiner to figure
+ * out the route of table joins to relate all fields; for each table, predict whether
+ * it would be more efficient to join by sql or by hashJoin, taking into account
+ * anticipated row counts. Return info sufficient for building a sql query, a list
+ * of hashJoins, bigRow size, and indexes in bigRow for each output.
+ * If naForMissing is TRUE then the hashJoiner result columns will contain "n/a" when
+ * there is no match in the hash. */
+{
+struct joinMixer *jmOut;
+AllocVar(jmOut);
+// Figure out what tables we need to query, and whether each one should be joined
+// by sql or hash:
+struct joinerDtf *tablesNeeded = tablesInOutput(db, mainTable, outputFieldList);
+struct joinerPair *routeList = joinerFindRouteThroughAll(joiner, tablesNeeded);
+struct joinerPair *hjRouteList = NULL;
+partitionJoins(db, routeList, mainTableRowCount, &jmOut->sqlRouteList, &hjRouteList);
+// routeList was clobbered, make sure we don't try to use it again:
+routeList = NULL;
+char dbTable[PATH_LEN];
+// Split output fields into those that come from sql and those that come from hashJoins:
+// Assign indices in external row to each output from mysql.
+struct hash *fieldToIx = hashNew(0);
+uint bigRowIx = 0;
+struct hash *sqlTables = tablesInRouteList(db, mainTable, jmOut->sqlRouteList);
+struct joinerDtf *hjFieldList = NULL;
+struct joinerDtf *dtf;
+for (dtf = outputFieldList; dtf != NULL; dtf = dtf->next)
+    {
+    joinerDtfToSqlTableString(dtf, db, dbTable, sizeof(dbTable));
+    if (hashLookup(sqlTables, dbTable) || sameString(dbTable, mainTable))
+        {
+        slAddHead(&jmOut->sqlFieldList, joinerDtfClone(dtf));
+        addField(fieldToIx, dtf, db, &bigRowIx);
+        }
+    else
+        {
+        slAddHead(&hjFieldList, joinerDtfClone(dtf));
+        // Don't add to bigRow/fieldToIx because we might need to tack on some sql fields
+        // for hashJoin keys.
+        }
+    }
+slReverse(&jmOut->sqlFieldList);
+slReverse(&hjFieldList);
+
+// Add hashJoin jp->a keys that come from sqlTables to sqlFieldList and bigRow if not already
+// in sqlFieldList
+struct joinerPair *jp;
+for (jp = hjRouteList; jp != NULL; jp = jp->next)
+    {
+    joinerDtfToSqlTableString(jp->a, db, dbTable, sizeof(dbTable));
+    if (hashLookup(sqlTables, dbTable) && !joinerDtfFind(jmOut->sqlFieldList, jp->a))
+        {
+        slAddTail(&jmOut->sqlFieldList, joinerDtfClone(jp->a));
+        addField(fieldToIx, jp->a, db, &bigRowIx);
+        }
+    }
+
+// Now that sqlFieldList is complete, add hashJoin output fields to bigRow.
+for (dtf = hjFieldList; dtf != NULL; dtf = dtf->next)
+    addField(fieldToIx, dtf, db, &bigRowIx);
+
+// Add hashJoin key info (jp->b) to hjTableCols. If any hashJoin key fields take values
+// produced by other hashJoins (jp->a), but those columns don't appear in output, add them
+// to bigRow.
+struct hjTableInfo *hjTableCols = NULL;
+for (jp = hjRouteList; jp != NULL; jp = jp->next)
+    {
+    hjTableInfoAddKey(&hjTableCols, jp, db);
+    joinerDtfToSqlTableString(jp->a, db, dbTable, sizeof(dbTable));
+    if (! hashLookup(sqlTables, dbTable))
+        hjTableInfoAddVal(&hjTableCols, jp->a, db);
+    addField(fieldToIx, jp->a, db, &bigRowIx);
+    }
+
+// Done assigning bigRow indices; set bigRowSize.
+jmOut->bigRowSize = bigRowIx;
+
+// Add hash output fields to hjTableCols
+for (dtf = hjFieldList; dtf != NULL; dtf = dtf->next)
+    hjTableInfoAddVal(&hjTableCols, dtf, db);
+
+// Build up jmOut->hashJoins
+slReverse(&hjTableCols);
+struct hjTableInfo *hjti;
+int i;
+for (i = 0, hjti = hjTableCols; hjti != NULL; hjti = hjti->next, i++)
+    {
+    if (hjti->jp == NULL)
+        errAbort("hjTableInfo for %s has NULL jp", hjti->dbTable);
+    if (hjti->hashValCols == NULL)
+        errAbort("hjTableInfo for %s has NULL hashValCols", hjti->dbTable);
+    uint bigRowKeyIx = mustFindIx(fieldToIx, hjti->jp->a, db,
+                                  "joinMixerNew: Can't find index of hashJoin index");
+    struct joinerField *jfA = joinerSetFindField(hjti->jp->identifier, hjti->jp->a);
+    struct joinerField *jfB = joinerSetFindField(hjti->jp->identifier, hjti->jp->b);
+    uint valCount = slCount(hjti->hashValCols);
+    uint bigRowColIxs[valCount];
+    struct joinerDtf *col;
+    int colIx;
+    for (colIx = 0, col = hjti->hashValCols; col != NULL; col = col->next, colIx++)
+        bigRowColIxs[colIx] = mustFindIx(fieldToIx, col, db,
+                                         "joinMixerNew: Missing bigRow ix for");
+    slAddHead(&jmOut->hashJoins,
+              hashJoinNew(hjti->jp->b, bigRowKeyIx,
+                          hjti->hashValCols, bigRowColIxs,
+                          jfA, jfB, naForMissing));
+    }
+slReverse(&jmOut->hashJoins);
+
+// Fill in each output field's index into bigRow
+AllocArray(jmOut->outIxs, slCount(outputFieldList));
+uint outRowIx;
+for (outRowIx = 0, dtf = outputFieldList; dtf != NULL; dtf = dtf->next, outRowIx++)
+    jmOut->outIxs[outRowIx] = mustFindIx(fieldToIx, dtf, db,
+                                         "joinMixerNew: no bigRowIx for output field");
+
+joinerPairFreeList(&hjRouteList);
+joinerDtfFreeList(&hjFieldList);
+hjTableInfoFreeList(&hjTableCols);
+hashFree(&fieldToIx);
+hashFree(&sqlTables);
+joinerDtfFreeList(&tablesNeeded);
+return jmOut;
+}
+
+void joinMixerFree(struct joinMixer **pJm)
+/* Free joinMixer's holdings unless already NULL. */
+{
+if (!pJm || !*pJm)
+    return;
+struct joinMixer *jm = *pJm;
+joinerPairFreeList(&jm->sqlRouteList);
+joinerDtfFreeList(&jm->sqlFieldList);
+hashJoinFreeList(&jm->hashJoins);
+freeMem(jm->outIxs);
+freez(pJm);
+}
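
Illustrative sketch (not part of the commit above): a rough picture of how a caller along the
lines of annoStreamDb could consume a joinMixer plan. It sticks to the functions and struct
fields visible in this commit (joinMixerNew, joinMixerFree, sqlFieldList, hashJoins, bigRowSize,
outIxs, joinerDtfToSqlFieldString) plus joinerRead from the existing joiner library;
sketchJoinPlan is a hypothetical helper and the printed summary stands in for the real
annoStreamDb query-building and row-handling code.

/* Sketch only: plan joins for a set of requested output fields and report where each
 * output column would come from. Assumes kent common.h/joiner.h/joinMixer.h. */

#include "common.h"
#include "joiner.h"
#include "joinMixer.h"

void sketchJoinPlan(char *db, char *mainTable, struct joinerDtf *outputFields,
                    uint mainTableRowCount)
/* Plan the joins for outputFields and report where each output column will come from. */
{
struct joiner *joiner = joinerRead("all.joiner");
struct joinMixer *jm = joinMixerNew(joiner, db, mainTable, outputFields, mainTableRowCount, TRUE);
// jm->sqlFieldList holds the columns for the sql side of the plan: selected fields from the
// main table and sql-joined related tables, plus any sql columns needed only as hashJoin keys.
// They occupy the first slots of a bigRow of jm->bigRowSize columns, in list order.
struct joinerDtf *dtf;
for (dtf = jm->sqlFieldList; dtf != NULL; dtf = dtf->next)
    {
    char dtField[PATH_LEN];
    joinerDtfToSqlFieldString(dtf, db, dtField, sizeof(dtField));
    printf("sql column: %s\n", dtField);
    }
// Each hashJoin in jm->hashJoins was constructed knowing which bigRow column holds its key and
// which bigRow columns receive its results, so once a sql result row has been copied into the
// bigRow, the hashJoins fill in the remaining columns ("n/a" on misses, since naForMissing=TRUE).
printf("bigRow has %u columns; %d hashJoins fill the rest\n",
       jm->bigRowSize, slCount(jm->hashJoins));
// Output column i of the user's request is bigRow[jm->outIxs[i]]:
uint i;
for (i = 0, dtf = outputFields; dtf != NULL; dtf = dtf->next, i++)
    printf("output %u (%s.%s) comes from bigRow[%u]\n", i, dtf->table, dtf->field, jm->outIxs[i]);
joinMixerFree(&jm);
}

The point is the division of labor: the sql part of the plan fills the front of the bigRow, the
hashJoins fill the rest, and outIxs maps the user's requested column order onto bigRow positions.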
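
Illustrative sketch (not part of the commit above): the comma-glomming convention described for
glomSqlDup and hashJoin -- one output row per main-table row, with multiple matches from a
related table squashed into a single comma-separated value and adjacent duplicate matches
collapsed -- boiled down to plain standalone C. The helper name and sample values are made up;
this is not the glomSqlDup or hashJoin.c implementation, and it ignores the trailing-comma and
reversed-order details of multi-key lookups.

/* Standalone illustration of comma-glomming multiple related-table matches into one value. */

#include <stdio.h>
#include <string.h>

#define GLOM_MAX 256

void glomValue(char *dest, size_t destSize, const char *match)
/* Append match to dest, comma-separated, skipping a match identical to the one just added. */
{
size_t len = strlen(dest), matchLen = strlen(match);
// Collapse adjacent duplicates: skip if dest already ends with match as its last list element.
if (len >= matchLen && strcmp(dest + len - matchLen, match) == 0 &&
    (len == matchLen || dest[len - matchLen - 1] == ','))
    return;
if (len > 0)
    strncat(dest, ",", destSize - strlen(dest) - 1);
strncat(dest, match, destSize - strlen(dest) - 1);
}

int main()
{
// Three matches from a related table for one main-table row, two of them adjacent duplicates:
const char *matches[] = { "uc001aaa.3", "uc001aaa.3", "uc010nxq.1" };
char glommed[GLOM_MAX] = "";
int i;
for (i = 0; i < 3; i++)
    glomValue(glommed, sizeof(glommed), matches[i]);
printf("%s\n", glommed);   // prints: uc001aaa.3,uc010nxq.1
return 0;
}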