src/hg/lib/hashJoin.c c8e1dc987ba5532c65be54d8f43af1ec4337fb10

c8e1dc987ba5532c65be54d8f43af1ec4337fb10
angie
  Sat Nov 21 09:52:07 2015 -0800
Back-end implementation of Data Integrator's support for related tables and
fields using all.joiner.  Most joins are implemented using a new module,
hashJoin.c; but SQL joins are used in certain cases when hash joins are
impractical and SQL joins are actually faster.  A new module joinMixer
determines which joins should be implemented by hashJoin vs SQL, and computes
row indices for hashJoin objects to find keys (from SQL or other hashJoins)
and store results.  The SQL join info from joinMixer is translated into SQL
queries in annoStreamDb.  annoStreamDb also generates its own autoSql asObject,
adding the fields from related tables after the fields of the main track table.

Main changes:

- annoStreamDb.c
- main table SQL query now uses <table>.<field> instead of just <field> to
avoid clashes with same field name in different tables
- SQL joins return multiple rows for a single main table row when there are
multiple matching rows in a related table; these rows need to be squashed
into one row with the multiple matches comma-separated, both to match
hgTables behavior and to avoid overflow of rowBuf.  (glomSqlDup)
- as mentioned above, generate joining SQL queries when necessary and
generate own asObj including selected fields from related tables.
- parse JSON config object with relatedTables spec from UI via hgi_querySpec

hashJoin basically slurps a related table into a big hash of keys to values,
perform lookups (possibly of multiple keys), and formats each column's results.
It includes a lot of tweaks to match hgTables/joining.c output char-for-char:
collapse adjacent duplicate matches, commas at end of matches from multiple
key lookups, reversed order of multiple match values.  hgTables/joining.c uses
arrays of slNames, but in order to avoid all that allocation I'm just glomming
into an array of reused dyStrings.

joinMixer takes a list of fields to include in output, gets a list of joins
to be performed (from joinerRouteThroughAll), applies some simple rough
heuristics to guess whether a join is practical in SQL, and decides which joins
to do by SQL and which to do by hashJoin.  It plans a row format with several
groups of fields in this order: main table fields, related table fields to
appear in the output, related table fields needed by hashJoins, hashJoin
result fields needed by other hashJoins, and hashJoin result fields to appear
in output.  It initializes hashJoins with precomputed row indexes and also
provides a mapping from big-row columns to the columns that appear in output.

Thanks to Matt for testing on demo6 during development.

refs #15544

diff --git src/hg/lib/hashJoin.c src/hg/lib/hashJoin.c
new file mode 100644
index 0000000..06ca10b
--- /dev/null
+++ src/hg/lib/hashJoin.c
@@ -0,0 +1,312 @@
+/* hashJoin - join one or more columns of a hashed database table to an externally provided
+ * char **row that contains the key and empty slot(s) for the column value(s) */
+
+/* Copyright (C) 2015 The Regents of the University of California 
+ * See README in this or parent directory for licensing information. */
+
+#include "hashJoin.h"
+#include "hdb.h"
+#include "obscure.h"
+
+struct hashJoin
+// Implements table join as a hash lookup: the key is taken from some column of an externally
+// provided row, and one or more values are retrieved and then stored in specified columns
+// of the external row.  If a key has more than one set of matching columns, then each
+// column's values are glommed into a comma-separated list for that column in the external row.
+{
+    struct hashJoin *next;
+    struct hash *hash;			// Hash some kind of key to char **row of column values
+    uint extRowKeyIx;			// Index of hash key to take from external row
+    uint valCount;			// Number of columns in hash value rows
+    uint *extRowValIxs;			// Index of each hash value column to store in external row
+    struct dyString **colValues;	// Accumulators for hash value columns -- multiple
+					// results from hash lookup become comma-sep strings
+    struct lm *lm;			// For storing hash values, misc strings & arrays
+    struct joinerField *jfA;		// If non-NULL, its separator, chopBefore and chopAfter
+					// are applied to each key accessed by hashJoinOneRow.
+    struct joinerField *jfB;		// If non-NULL, its chopBefore and chopAfter
+					// are applied to each key passed to hashJoinAddMapping.
+    char *db;				// Database from which to load hash
+    char *table;			// Table from which to load hash
+    char *query;			// SQL query to execute when loading hash
+    boolean loaded;			// TRUE when table contents have been loaded into hash
+    boolean naForMissing;		// If TRUE, then output "n/a" when there's no match
+};
+
+struct hashJoin *hashJoinNew(struct joinerDtf *keyDtf, uint extRowKeyIx,
+                             struct joinerDtf *valDtfs, uint *extRowValIxs,
+                             struct joinerField *jfA, struct joinerField *jfB,
+                             boolean naForMissing)
+/* Return a new hashJoin.  extRowKeyIx is the index in an external row of the key
+ * to use in the join.  extRowValIxs[valCount] contains each hash val column's index
+ * into an external row.  jfA and jfB are optional; if given, then jfA's separator,
+ * chopBefore and chopAfter will be applied to each key retrieved from the external row
+ * and jfB's separator, chopBefore and chopAfter will be applied to each hash key.
+ * If naForMissing is TRUE then the result columns will contain "n/a" when there is
+ * no match in the hash. */
+{
+struct hashJoin *self;
+AllocVar(self);
+self->extRowKeyIx = extRowKeyIx;
+int valCount = slCount(valDtfs);
+self->valCount = valCount;
+// Save some inner-loop tests if no separating or chopping will be required:
+if (jfA && (jfA->separator || jfA->chopBefore || jfA->chopAfter))
+    self->jfA = jfA;
+if (jfB && (jfB->separator || jfB->chopBefore || jfB->chopAfter))
+    self->jfB = jfB;
+self->lm = lmInit(0);
+lmAllocArray(self->lm, self->extRowValIxs, valCount);
+CopyArray(extRowValIxs, self->extRowValIxs, valCount);
+lmAllocArray(self->lm, self->colValues, valCount);
+int i;
+for (i = 0;  i < valCount;  i++)
+    self->colValues[i] = dyStringNew(0);
+self->db = lmCloneString(self->lm, keyDtf->database);
+self->table = lmCloneString(self->lm, keyDtf->table);
+struct dyString *query = sqlDyStringCreate("select %s", keyDtf->field);
+struct joinerDtf *dtf;
+for (dtf = valDtfs;  dtf != NULL;  dtf = dtf->next)
+    {
+    if (differentString(dtf->database, self->db) ||
+        differentString(dtf->table, self->table))
+        errAbort("hashJoinNew: inconsistent key field (%s.%s.%s) and value field (%s.%s.%s)",
+                 keyDtf->database, keyDtf->table, keyDtf->field,
+                 dtf->database, dtf->table, dtf->field);
+    dyStringAppendC(query, ',');
+    dyStringAppend(query, dtf->field);
+    }
+dyStringPrintf(query, " from %s", self->table);
+self->query = dyStringCannibalize(&query);
+self->naForMissing = naForMissing;
+return self;
+}
+
+struct hashJoin *hashJoinNext(struct hashJoin *el)
+/* Get the next hashJoin in a list of hashJoins. */
+{
+return el->next;
+}
+
+struct hjAddOneContext
+// joinerFieldIterateKey context for use by hashJoinAddOne
+{
+    struct hash *hash;
+    char **clonedValues;
+};
+
+static void hashJoinAddOne(void *context, char *key)
+/* Add values from context to hash from context for key.
+ * This is a callback for joinerFieldIterateKey; context is struct hjAddOneContext *. */
+{
+struct hjAddOneContext *ctx = context;
+hashAdd(ctx->hash, key, ctx->clonedValues);
+}
+
+static void hashJoinLoad(struct hashJoin *self)
+/* Load table contents into hash. */
+{
+if (self->loaded)
+    errAbort("hashJoinLoad: loaded flag already set");
+struct sqlConnection *conn = hAllocConn(self->db);
+int rowCount = sqlRowCount(conn, self->table);
+int hashSize = min(digitsBaseTwo(rowCount), hashMaxSize);
+self->hash = hashNew(hashSize);
+char **row;
+struct sqlResult *sr = sqlGetResult(conn, self->query);
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    char **clonedValues = lmCloneRow(self->lm, row+1, self->valCount);
+    struct hjAddOneContext context = { self->hash, clonedValues };
+    // If necessary, process key according to self->jfA.
+    if (self->jfB)
+        joinerFieldIterateKey(self->jfB, hashJoinAddOne, &context, row[0]);
+    else
+        hashAdd(self->hash, row[0], clonedValues);
+    }
+self->loaded = TRUE;
+hFreeConn(&conn);
+}
+
+struct hjKeyContext
+{
+    struct hashJoin *self;
+    boolean includeEmpties;
+    boolean matchCount;
+};
+
+static void hashJoinOneKey(void *context, char *key)
+/* Look up some processed key in hash and accumulate results for each column.
+ * This is a callback for joinerFieldIterateKey; context is struct hashJoin *. */
+{
+struct hjKeyContext *ctx = context;
+struct hashJoin *self = ctx->self;
+struct hashEl *helFirst = hashLookup(self->hash, key);
+// hgTables accumulates multiple match values with slAddHead so they are
+// printed in reverse.  Use arrays to accumulate multiple matched rows; we'll step
+// through them backwards in hashJoinGlomMultipleMatches to match hgTables' order.
+int helMaxCount = slCount(helFirst);
+char **matchRows[helMaxCount];
+struct hashEl *hel;
+int matchIx;
+for (matchIx = 0, hel = helFirst;  hel != NULL;  hel = hashLookupNext(hel), matchIx++)
+    {
+    char **row = hel->val;
+    matchRows[matchIx] = row;
+    }
+int matchCount = matchIx;
+ctx->matchCount += matchCount;
+// When there are multiple matches, hgTables includes empty vals and prints a comma after each item.
+boolean includeEmpties = ctx->includeEmpties || (matchCount > 1);
+// Step through matchRows in reverse order to match hgTables.
+for (matchIx = matchCount - 1;  matchIx >= 0;  matchIx--)
+    {
+    char **row = matchRows[matchIx];
+    int valIx;
+    for (valIx = 0;  valIx < self->valCount;  valIx++)
+        {
+        char *val = row[valIx];
+        if (isNotEmpty(val) || includeEmpties)
+            {
+            // Skip over adjacent duplicate values
+            struct dyString *colDy = self->colValues[valIx];
+            int colDyLen = dyStringLen(colDy);
+            boolean isDup = FALSE;
+            if (matchIx < matchCount - 1)
+                {
+                char **prevRow = matchRows[matchIx+1];
+                char *prevVal = (prevRow == NULL) ? NULL : prevRow[valIx];
+                isDup = sameOk(val, prevVal);
+                }
+            else
+                // If there's no previous row to compare to from this key, but colDy already
+                // ends with the same value, consider this a duplicate:
+                isDup = colDyLen > 0 && endsWithWordComma(colDy->string, val);
+            if (! isDup)
+                {
+                if (includeEmpties)
+                    {
+                    if (isNotEmpty(val))
+                        dyStringAppend(colDy, val);
+                    dyStringAppendC(colDy, ',');
+                    }
+                else
+                    {
+                    if (colDyLen > 0)
+                        dyStringAppendC(colDy, ',');
+                    dyStringAppend(colDy, val);
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void hashJoinChopCommaKey(struct hjKeyContext *context, struct joinerField *jfA, char *key)
+/* Chop key by comma, regardless of jfA->separator; for each item, apply jfA's chopBefore
+ * and chopAfter if applicable, and try to join the result. */
+{
+context->includeEmpties = TRUE;
+int len = strlen(key);
+char keyClone[len+1];
+safencpy(keyClone, sizeof(keyClone), key, len);
+char *s = keyClone, *e;
+while (isNotEmpty(s))
+    {
+    e = strchr(s, ',');
+    if (e != NULL)
+        *e++ = 0;
+    if (jfA)
+        s = joinerFieldChopKey(jfA, s);
+    if (s[0] != 0)
+        hashJoinOneKey(context, s);
+    s = e;
+    }
+}
+
+void hashJoinOneRow(struct hashJoin *self, char **extRow)
+/* Look up some column of extRow in hash and place result(s) in other columns of extRow.
+ * Don't call this again until done with extRow -- column value storage is reused. */
+{
+if (!self->loaded)
+    hashJoinLoad(self);
+char *key = extRow[self->extRowKeyIx];
+if (isNotEmpty(key))
+    {
+    // Clear accumulators
+    uint i;
+    for (i = 0;  i < self->valCount;  i++)
+        dyStringClear(self->colValues[i]);
+    // If necessary, process key according to self->jfA. Look up key(s) and accumulate results.
+    struct joinerField *jfA = self->jfA;
+    struct hjKeyContext context = { self, FALSE, FALSE };
+    if (jfA)
+        {
+        context.includeEmpties = TRUE;
+        joinerFieldIterateKey(jfA, hashJoinOneKey, &context, key);
+        }
+    else
+        hashJoinOneKey(&context, key);
+    // In case we're processing comma-glommed results from some other hash join --
+    // if there were no results, but the key contains commas and wasn't already comma-chopped
+    // by joinerFieldIterateKey, try comma-chopping it and looking up the pieces.
+    if (context.matchCount == 0 &&
+        ! (jfA && sameOk(jfA->separator, ",")) &&
+        strchr(key, ','))
+        {
+        hashJoinChopCommaKey(&context, jfA, key);
+        }
+    // When includeEmpties is set, we assume we're going to have multiple outputs.
+    // However, there might be only one match among multiple keys.  If so, remove trailing commas.
+    if (context.includeEmpties && context.matchCount == 1)
+        {
+        int valIx;
+        for (valIx = 0;  valIx < self->valCount;  valIx++)
+            {
+            struct dyString *colDy = self->colValues[valIx];
+            char *end = colDy->string + dyStringLen(colDy) - 1;
+            if (*end == ',')
+                *end = '\0';
+            }
+        }
+    // Set the external row result columns to point to accumulated values.
+    for (i = 0;  i < self->valCount;  i++)
+        {
+        struct dyString *colDy = self->colValues[i];
+        if (self->naForMissing && context.matchCount == 0)
+            dyStringAppend(colDy, "n/a");
+        uint extRowValIx = self->extRowValIxs[i];
+        extRow[extRowValIx] = colDy->string;
+        }
+    }
+}
+
+void hashJoinFree(struct hashJoin **pSelf)
+/* Free hashJoin (if necessary). */
+{
+if (pSelf == NULL || *pSelf == NULL)
+    return;
+struct hashJoin *self = *pSelf;
+hashFree(&self->hash);
+uint i;
+for (i = 0;  i < self->valCount;  i++)
+    dyStringFree(&self->colValues[i]);
+freeMem(self->query);
+lmCleanup(&self->lm);
+freez(pSelf);
+}
+
+void hashJoinFreeList(struct hashJoin **pList)
+/* Free a list of hashJoins. */
+{
+if (pList == NULL || *pList == NULL)
+    return;
+struct hashJoin *el = *pList;
+while (el != NULL)
+    {
+    struct hashJoin *elNext = el->next;
+    hashJoinFree(&el);
+    el = elNext;
+    }
+*pList = NULL;
+}