533112afe2a2005e80cdb1f82904ea65032d4302 braney Sat Oct 2 11:37:34 2021 -0700 split hg/lib into two separate libaries, one only used by the cgis diff --git src/hg/cgilib/hashJoin.c src/hg/cgilib/hashJoin.c new file mode 100644 index 0000000..06ca10b --- /dev/null +++ src/hg/cgilib/hashJoin.c @@ -0,0 +1,312 @@ +/* hashJoin - join one or more columns of a hashed database table to an externally provided + * char **row that contains the key and empty slot(s) for the column value(s) */ + +/* Copyright (C) 2015 The Regents of the University of California + * See README in this or parent directory for licensing information. */ + +#include "hashJoin.h" +#include "hdb.h" +#include "obscure.h" + +struct hashJoin +// Implements table join as a hash lookup: the key is taken from some column of an externally +// provided row, and one or more values are retrieved and then stored in specified columns +// of the external row. If a key has more than one set of matching columns, then each +// column's values are glommed into a comma-separated list for that column in the external row. +{ + struct hashJoin *next; + struct hash *hash; // Hash some kind of key to char **row of column values + uint extRowKeyIx; // Index of hash key to take from external row + uint valCount; // Number of columns in hash value rows + uint *extRowValIxs; // Index of each hash value column to store in external row + struct dyString **colValues; // Accumulators for hash value columns -- multiple + // results from hash lookup become comma-sep strings + struct lm *lm; // For storing hash values, misc strings & arrays + struct joinerField *jfA; // If non-NULL, its separator, chopBefore and chopAfter + // are applied to each key accessed by hashJoinOneRow. + struct joinerField *jfB; // If non-NULL, its chopBefore and chopAfter + // are applied to each key passed to hashJoinAddMapping. + char *db; // Database from which to load hash + char *table; // Table from which to load hash + char *query; // SQL query to execute when loading hash + boolean loaded; // TRUE when table contents have been loaded into hash + boolean naForMissing; // If TRUE, then output "n/a" when there's no match +}; + +struct hashJoin *hashJoinNew(struct joinerDtf *keyDtf, uint extRowKeyIx, + struct joinerDtf *valDtfs, uint *extRowValIxs, + struct joinerField *jfA, struct joinerField *jfB, + boolean naForMissing) +/* Return a new hashJoin. extRowKeyIx is the index in an external row of the key + * to use in the join. extRowValIxs[valCount] contains each hash val column's index + * into an external row. jfA and jfB are optional; if given, then jfA's separator, + * chopBefore and chopAfter will be applied to each key retrieved from the external row + * and jfB's separator, chopBefore and chopAfter will be applied to each hash key. + * If naForMissing is TRUE then the result columns will contain "n/a" when there is + * no match in the hash. */ +{ +struct hashJoin *self; +AllocVar(self); +self->extRowKeyIx = extRowKeyIx; +int valCount = slCount(valDtfs); +self->valCount = valCount; +// Save some inner-loop tests if no separating or chopping will be required: +if (jfA && (jfA->separator || jfA->chopBefore || jfA->chopAfter)) + self->jfA = jfA; +if (jfB && (jfB->separator || jfB->chopBefore || jfB->chopAfter)) + self->jfB = jfB; +self->lm = lmInit(0); +lmAllocArray(self->lm, self->extRowValIxs, valCount); +CopyArray(extRowValIxs, self->extRowValIxs, valCount); +lmAllocArray(self->lm, self->colValues, valCount); +int i; +for (i = 0; i < valCount; i++) + self->colValues[i] = dyStringNew(0); +self->db = lmCloneString(self->lm, keyDtf->database); +self->table = lmCloneString(self->lm, keyDtf->table); +struct dyString *query = sqlDyStringCreate("select %s", keyDtf->field); +struct joinerDtf *dtf; +for (dtf = valDtfs; dtf != NULL; dtf = dtf->next) + { + if (differentString(dtf->database, self->db) || + differentString(dtf->table, self->table)) + errAbort("hashJoinNew: inconsistent key field (%s.%s.%s) and value field (%s.%s.%s)", + keyDtf->database, keyDtf->table, keyDtf->field, + dtf->database, dtf->table, dtf->field); + dyStringAppendC(query, ','); + dyStringAppend(query, dtf->field); + } +dyStringPrintf(query, " from %s", self->table); +self->query = dyStringCannibalize(&query); +self->naForMissing = naForMissing; +return self; +} + +struct hashJoin *hashJoinNext(struct hashJoin *el) +/* Get the next hashJoin in a list of hashJoins. */ +{ +return el->next; +} + +struct hjAddOneContext +// joinerFieldIterateKey context for use by hashJoinAddOne +{ + struct hash *hash; + char **clonedValues; +}; + +static void hashJoinAddOne(void *context, char *key) +/* Add values from context to hash from context for key. + * This is a callback for joinerFieldIterateKey; context is struct hjAddOneContext *. */ +{ +struct hjAddOneContext *ctx = context; +hashAdd(ctx->hash, key, ctx->clonedValues); +} + +static void hashJoinLoad(struct hashJoin *self) +/* Load table contents into hash. */ +{ +if (self->loaded) + errAbort("hashJoinLoad: loaded flag already set"); +struct sqlConnection *conn = hAllocConn(self->db); +int rowCount = sqlRowCount(conn, self->table); +int hashSize = min(digitsBaseTwo(rowCount), hashMaxSize); +self->hash = hashNew(hashSize); +char **row; +struct sqlResult *sr = sqlGetResult(conn, self->query); +while ((row = sqlNextRow(sr)) != NULL) + { + char **clonedValues = lmCloneRow(self->lm, row+1, self->valCount); + struct hjAddOneContext context = { self->hash, clonedValues }; + // If necessary, process key according to self->jfA. + if (self->jfB) + joinerFieldIterateKey(self->jfB, hashJoinAddOne, &context, row[0]); + else + hashAdd(self->hash, row[0], clonedValues); + } +self->loaded = TRUE; +hFreeConn(&conn); +} + +struct hjKeyContext +{ + struct hashJoin *self; + boolean includeEmpties; + boolean matchCount; +}; + +static void hashJoinOneKey(void *context, char *key) +/* Look up some processed key in hash and accumulate results for each column. + * This is a callback for joinerFieldIterateKey; context is struct hashJoin *. */ +{ +struct hjKeyContext *ctx = context; +struct hashJoin *self = ctx->self; +struct hashEl *helFirst = hashLookup(self->hash, key); +// hgTables accumulates multiple match values with slAddHead so they are +// printed in reverse. Use arrays to accumulate multiple matched rows; we'll step +// through them backwards in hashJoinGlomMultipleMatches to match hgTables' order. +int helMaxCount = slCount(helFirst); +char **matchRows[helMaxCount]; +struct hashEl *hel; +int matchIx; +for (matchIx = 0, hel = helFirst; hel != NULL; hel = hashLookupNext(hel), matchIx++) + { + char **row = hel->val; + matchRows[matchIx] = row; + } +int matchCount = matchIx; +ctx->matchCount += matchCount; +// When there are multiple matches, hgTables includes empty vals and prints a comma after each item. +boolean includeEmpties = ctx->includeEmpties || (matchCount > 1); +// Step through matchRows in reverse order to match hgTables. +for (matchIx = matchCount - 1; matchIx >= 0; matchIx--) + { + char **row = matchRows[matchIx]; + int valIx; + for (valIx = 0; valIx < self->valCount; valIx++) + { + char *val = row[valIx]; + if (isNotEmpty(val) || includeEmpties) + { + // Skip over adjacent duplicate values + struct dyString *colDy = self->colValues[valIx]; + int colDyLen = dyStringLen(colDy); + boolean isDup = FALSE; + if (matchIx < matchCount - 1) + { + char **prevRow = matchRows[matchIx+1]; + char *prevVal = (prevRow == NULL) ? NULL : prevRow[valIx]; + isDup = sameOk(val, prevVal); + } + else + // If there's no previous row to compare to from this key, but colDy already + // ends with the same value, consider this a duplicate: + isDup = colDyLen > 0 && endsWithWordComma(colDy->string, val); + if (! isDup) + { + if (includeEmpties) + { + if (isNotEmpty(val)) + dyStringAppend(colDy, val); + dyStringAppendC(colDy, ','); + } + else + { + if (colDyLen > 0) + dyStringAppendC(colDy, ','); + dyStringAppend(colDy, val); + } + } + } + } + } +} + +static void hashJoinChopCommaKey(struct hjKeyContext *context, struct joinerField *jfA, char *key) +/* Chop key by comma, regardless of jfA->separator; for each item, apply jfA's chopBefore + * and chopAfter if applicable, and try to join the result. */ +{ +context->includeEmpties = TRUE; +int len = strlen(key); +char keyClone[len+1]; +safencpy(keyClone, sizeof(keyClone), key, len); +char *s = keyClone, *e; +while (isNotEmpty(s)) + { + e = strchr(s, ','); + if (e != NULL) + *e++ = 0; + if (jfA) + s = joinerFieldChopKey(jfA, s); + if (s[0] != 0) + hashJoinOneKey(context, s); + s = e; + } +} + +void hashJoinOneRow(struct hashJoin *self, char **extRow) +/* Look up some column of extRow in hash and place result(s) in other columns of extRow. + * Don't call this again until done with extRow -- column value storage is reused. */ +{ +if (!self->loaded) + hashJoinLoad(self); +char *key = extRow[self->extRowKeyIx]; +if (isNotEmpty(key)) + { + // Clear accumulators + uint i; + for (i = 0; i < self->valCount; i++) + dyStringClear(self->colValues[i]); + // If necessary, process key according to self->jfA. Look up key(s) and accumulate results. + struct joinerField *jfA = self->jfA; + struct hjKeyContext context = { self, FALSE, FALSE }; + if (jfA) + { + context.includeEmpties = TRUE; + joinerFieldIterateKey(jfA, hashJoinOneKey, &context, key); + } + else + hashJoinOneKey(&context, key); + // In case we're processing comma-glommed results from some other hash join -- + // if there were no results, but the key contains commas and wasn't already comma-chopped + // by joinerFieldIterateKey, try comma-chopping it and looking up the pieces. + if (context.matchCount == 0 && + ! (jfA && sameOk(jfA->separator, ",")) && + strchr(key, ',')) + { + hashJoinChopCommaKey(&context, jfA, key); + } + // When includeEmpties is set, we assume we're going to have multiple outputs. + // However, there might be only one match among multiple keys. If so, remove trailing commas. + if (context.includeEmpties && context.matchCount == 1) + { + int valIx; + for (valIx = 0; valIx < self->valCount; valIx++) + { + struct dyString *colDy = self->colValues[valIx]; + char *end = colDy->string + dyStringLen(colDy) - 1; + if (*end == ',') + *end = '\0'; + } + } + // Set the external row result columns to point to accumulated values. + for (i = 0; i < self->valCount; i++) + { + struct dyString *colDy = self->colValues[i]; + if (self->naForMissing && context.matchCount == 0) + dyStringAppend(colDy, "n/a"); + uint extRowValIx = self->extRowValIxs[i]; + extRow[extRowValIx] = colDy->string; + } + } +} + +void hashJoinFree(struct hashJoin **pSelf) +/* Free hashJoin (if necessary). */ +{ +if (pSelf == NULL || *pSelf == NULL) + return; +struct hashJoin *self = *pSelf; +hashFree(&self->hash); +uint i; +for (i = 0; i < self->valCount; i++) + dyStringFree(&self->colValues[i]); +freeMem(self->query); +lmCleanup(&self->lm); +freez(pSelf); +} + +void hashJoinFreeList(struct hashJoin **pList) +/* Free a list of hashJoins. */ +{ +if (pList == NULL || *pList == NULL) + return; +struct hashJoin *el = *pList; +while (el != NULL) + { + struct hashJoin *elNext = el->next; + hashJoinFree(&el); + el = elNext; + } +*pList = NULL; +}