533112afe2a2005e80cdb1f82904ea65032d4302 braney Sat Oct 2 11:37:34 2021 -0700 split hg/lib into two separate libaries, one only used by the cgis diff --git src/hg/cgilib/joinMixer.c src/hg/cgilib/joinMixer.c new file mode 100644 index 0000000..9b5316a --- /dev/null +++ src/hg/cgilib/joinMixer.c @@ -0,0 +1,363 @@ +/* joinMixer - implement multiple joins by sql joins and/or hashJoins, depending on which + * should work best for each joined table. */ + +/* Copyright (C) 2015 The Regents of the University of California + * See README in this or parent directory for licensing information. */ + +#include "joinMixer.h" +#include "hdb.h" + +static void addUniqueJoinerDt(struct joinerDtf **pTableList, struct hash *uniqHash, + struct joinerDtf *dt, char *db) +/* Use a string key into the hash to build up a unique list of cloned joinerDtf's with + * ignored (possibly NULL) fields (table-only, hence "dt" not "dtf"). */ +{ +char dbTable[PATH_LEN]; +joinerDtfToSqlTableString(dt, db, dbTable, sizeof(dbTable)); +if (! hashLookup(uniqHash, dbTable)) + { + slAddHead(pTableList, joinerDtfClone(dt)); + hashAdd(uniqHash, dbTable, NULL); + } +} + +static struct joinerDtf *tablesInOutput(char *db, char *mainTable, + struct joinerDtf *outputFieldList) +/* Return a uniquified list of tables (NULL for field) that we need for selected output fields + * and filters, if applicable. Joining these tables may require additional intermediate tables. */ +{ +struct hash *uniqHash = hashNew(0); +struct joinerDtf *tableList = NULL; +// Always include the main table, even if its fields aren't in output. +struct joinerDtf *mainDt = joinerDtfNew(db, mainTable, NULL); +addUniqueJoinerDt(&tableList, uniqHash, mainDt, db); +struct joinerDtf *dtf; +for (dtf = outputFieldList; dtf != NULL; dtf = dtf->next) + addUniqueJoinerDt(&tableList, uniqHash, dtf, db); +hashFree(&uniqHash); +slReverse(&tableList); +return tableList; +} + +static boolean noFancyKeywords(struct joinerField *jfA, struct joinerField *jfB) +/* Return TRUE unless jfA or jfB has a keyword that is too fancy for sql joins. */ +{ +if (isNotEmpty(jfA->separator) && differentString(jfA->separator, ",")) + return FALSE; +if (isNotEmpty(jfB->separator)) + return FALSE; +if (jfA->chopBefore || jfA->chopAfter || jfB->chopBefore || jfB->chopAfter) + return FALSE; +return TRUE; +} + +static boolean dependsOn(struct joinerPair *jpA, struct joinerPair *jpBList) +/* Return TRUE if jpA->a's table is found among the ->b's in jpBList. */ +{ +struct joinerPair *jpB; +for (jpB = jpBList; jpB != NULL; jpB = jpB->next) + { + if (joinerDtfSameTable(jpA->a, jpB->b)) + return TRUE; + } +return FALSE; +} + +static void partitionJoins(char *db, struct joinerPair *routeList, uint mainTableRowCount, + struct joinerPair **retSqlRouteList, + struct joinerPair **retHashJoinRouteList) +/* For each table pair in routeList, figure out whether a sql join is possible and + * perhaps faster than brute force join. If it looks like sql is faster, add to + * retSqlRouteList, otherwise to retHashJoinRouteList. */ +{ +struct sqlConnection *conn = hAllocConn(db); +boolean mysqlFasterForMainTable = mainTableRowCount < 10000; +struct joinerPair *jp, *jpNext; +for (jp = routeList; jp != NULL; jp = jpNext) + { + jpNext = jp->next; + boolean useSql = FALSE; + struct joinerField *jfA = joinerSetFindField(jp->identifier, jp->a); + struct joinerField *jfB = joinerSetFindField(jp->identifier, jp->b); + if (noFancyKeywords(jfA, jfB)) + { + char dbTable[PATH_LEN]; + joinerDtfToSqlTableString(jp->b, db, dbTable, sizeof(dbTable)); + uint rowCountB = sqlRowCount(conn, dbTable); + boolean relatedMuchBigger = mainTableRowCount + 10000000 < rowCountB; + if (mysqlFasterForMainTable || relatedMuchBigger) + useSql = TRUE; + } + if (useSql && !dependsOn(jp, *retHashJoinRouteList)) + slAddHead(retSqlRouteList, jp); + else + slAddHead(retHashJoinRouteList, jp); + } +slReverse(retSqlRouteList); +slReverse(retHashJoinRouteList); +hFreeConn(&conn); +} + +static struct hash *tablesInRouteList(char *db, char *mainTable, struct joinerPair *routeList) +/* Return a hash of names of tables in routeList to NULL values. */ +{ +struct hash *hash = hashNew(0); +// Always add mainTable +char dbTable[PATH_LEN]; +hashAdd(hash, mainTable, NULL); +// Add all pairs' ->b's +struct joinerPair *jp; +for (jp = routeList; jp != NULL; jp = jp->next) + { + joinerDtfToSqlTableString(jp->b, db, dbTable, sizeof(dbTable)); + hashAdd(hash, dbTable, NULL); + } +return hash; +} + +static void addField(struct hash *fieldToIx, struct joinerDtf *dtf, char *db, uint *pIx) +/* If stringified dtf is not already in hash, add it with value=*pIx and increment *pIx. */ +{ +char dtField[PATH_LEN]; +joinerDtfToSqlFieldString(dtf, db, dtField, sizeof(dtField)); +if (!hashLookup(fieldToIx, dtField)) + { + hashAddInt(fieldToIx, dtField, *pIx); + *pIx = *pIx + 1; + } +} + +static uint mustFindIx(struct hash *fieldToIx, struct joinerDtf *dtf, char *db, char *errMsg) +/* Look up nonnegative int value of stringified dtf; die if not found, otherwise return it. */ +{ +char dtField[PATH_LEN]; +joinerDtfToSqlFieldString(dtf, db, dtField, sizeof(dtField)); +int bigRowKeyIx = hashIntValDefault(fieldToIx, dtField, -1); +if (bigRowKeyIx < 0) + errAbort("%s '%s'", errMsg, dtField); +return (uint)bigRowKeyIx; +} + +struct hjTableInfo +// Info accumulator for constructing a new hashJoin +{ + struct hjTableInfo *next; + char *dbTable; + struct joinerPair *jp; + struct joinerDtf *hashValCols; +}; + +static struct hjTableInfo *hjTableInfoNew(char *dbTable, struct joinerPair *jp, + struct joinerDtf *hashValCol) +// Return a new hjTableInfo for dbTable; if jp is non-NULL, jp->b is the hash key field. +{ +struct hjTableInfo *self; +AllocVar(self); +self->dbTable = cloneString(dbTable); +self->jp = jp; +self->hashValCols = joinerDtfClone(hashValCol); +return self; +} + +static struct hjTableInfo *hjTableInfoFind(struct hjTableInfo *list, char *dbTable) +// Just use slPair find because the first two fields are next and char *name. +{ +return (struct hjTableInfo *)slPairFind((struct slPair *)list, dbTable); +} + +static void hjTableInfoAddKey(struct hjTableInfo **pList, struct joinerPair *jp, char *db) +// jp->b is the hash key field for its table; look up jp->b's table in pList, add if necessary, +// setting the table's jp. +{ +char dbTable[PATH_LEN]; +joinerDtfToSqlTableString(jp->b, db, dbTable, sizeof(dbTable)); +struct hjTableInfo *infoForTable = hjTableInfoFind(*pList, dbTable); +if (infoForTable == NULL) + slAddHead(pList, hjTableInfoNew(dbTable, jp, NULL)); +else if (infoForTable->jp != NULL) + errAbort("Multiple keys for %s: %s and %s", dbTable, infoForTable->jp->b->field, jp->b->field); +else + infoForTable->jp = jp; +} + +static void hjTableInfoAddVal(struct hjTableInfo **pList, struct joinerDtf *dtf, char *db) +// Find dtf in pList -- it should be there already if hjRouteList is traversed in order -- +// and add dtf to its hash value column list if not already there. +{ +char dbTable[PATH_LEN]; +joinerDtfToSqlTableString(dtf, db, dbTable, sizeof(dbTable)); +struct hjTableInfo *infoForTable = hjTableInfoFind(*pList, dbTable); +if (infoForTable == NULL) + { + errAbort("hjTableInfoAddVal: can't find table '%s' in list", dbTable); + } +else if (! joinerDtfFind(infoForTable->hashValCols, dtf)) + slAddTail(&infoForTable->hashValCols, joinerDtfClone(dtf)); +} + +static void hjTableInfoFree(struct hjTableInfo **pHjti) +// Free up *pHjti & contents if not NULL. +{ +if (! pHjti || ! *pHjti) + return; +struct hjTableInfo *hjti = *pHjti; +freeMem(hjti->dbTable); +joinerDtfFreeList(&hjti->hashValCols); +freez(pHjti); +} + +static void hjTableInfoFreeList(struct hjTableInfo **pHjtiList) +// Free up every member of *pHjtiList if not NULL. +{ +if (! pHjtiList || ! *pHjtiList) + return; +struct hjTableInfo *hjti = *pHjtiList; +while (hjti != NULL) + { + struct hjTableInfo *hjtiNext = hjti->next; + hjTableInfoFree(&hjti); + hjti = hjtiNext; + } +*pHjtiList = NULL; +} + +struct joinMixer *joinMixerNew(struct joiner *joiner, char *db, char *mainTable, + struct joinerDtf *outputFieldList, + uint mainTableRowCount, boolean naForMissing) +/* If outputFieldList contains fields from more than one table, use joiner to figure + * out the route of table joins to relate all fields; for each table, predict whether + * it would be more efficient to join by sql or by hashJoin, taking into account + * anticipated row counts. Return info sufficient for building a sql query, a list + * of hashJoins, bigRow size, and indexes in bigRow for each output. + * If naForMissing is TRUE then the hashJoiner result columns will contain "n/a" when + * there is no match in the hash. */ +{ +struct joinMixer *jmOut; +AllocVar(jmOut); +// Figure out what tables we need to query, and whether each one should be joined +// by sql or hash: +struct joinerDtf *tablesNeeded = tablesInOutput(db, mainTable, outputFieldList); +struct joinerPair *routeList = joinerFindRouteThroughAll(joiner, tablesNeeded); +struct joinerPair *hjRouteList = NULL; +partitionJoins(db, routeList, mainTableRowCount, &jmOut->sqlRouteList, &hjRouteList); +// routeList was clobbered, make sure we don't try to use it again: +routeList = NULL; +char dbTable[PATH_LEN]; +// Split output fields into those that come from sql and those that come from hashJoins: +// Assign indices in external row to each output from mysql. +struct hash *fieldToIx = hashNew(0); +uint bigRowIx = 0; +struct hash *sqlTables = tablesInRouteList(db, mainTable, jmOut->sqlRouteList); +struct joinerDtf *hjFieldList = NULL; +struct joinerDtf *dtf; +for (dtf = outputFieldList; dtf != NULL; dtf = dtf->next) + { + joinerDtfToSqlTableString(dtf, db, dbTable, sizeof(dbTable)); + if (hashLookup(sqlTables, dbTable) || sameString(dbTable, mainTable)) + { + slAddHead(&jmOut->sqlFieldList, joinerDtfClone(dtf)); + addField(fieldToIx, dtf, db, &bigRowIx); + } + else + { + slAddHead(&hjFieldList, joinerDtfClone(dtf)); + // Don't add to bigRow/fieldToIx because we might need to tack on some sql fields + // for hashJoin keys. + } + } +slReverse(&jmOut->sqlFieldList); +slReverse(&hjFieldList); + +// Add hashJoin jp->a keys that come from sqlTables to sqlFieldList and bigRow if not already +// in sqlFieldList +struct joinerPair *jp; +for (jp = hjRouteList; jp != NULL; jp = jp->next) + { + joinerDtfToSqlTableString(jp->a, db, dbTable, sizeof(dbTable)); + if (hashLookup(sqlTables, dbTable) && !joinerDtfFind(jmOut->sqlFieldList, jp->a)) + { + slAddTail(&jmOut->sqlFieldList, joinerDtfClone(jp->a)); + addField(fieldToIx, jp->a, db, &bigRowIx); + } + } + +// Now that sqlFieldList is complete, add hashJoin output fields to bigRow. +for (dtf = hjFieldList; dtf != NULL; dtf = dtf->next) + addField(fieldToIx, dtf, db, &bigRowIx); + +// Add hashJoin key info (jp->b) to hjTableCols. If any hashJoin key fields take values +// produced by other hashJoins (jp->a), but those columns don't appear in output, add them +// to bigRow. +struct hjTableInfo *hjTableCols = NULL; +for (jp = hjRouteList; jp != NULL; jp = jp->next) + { + hjTableInfoAddKey(&hjTableCols, jp, db); + joinerDtfToSqlTableString(jp->a, db, dbTable, sizeof(dbTable)); + if (! hashLookup(sqlTables, dbTable)) + hjTableInfoAddVal(&hjTableCols, jp->a, db); + addField(fieldToIx, jp->a, db, &bigRowIx); + } + +// Done assigning bigRow indices; set bigRowSize. +jmOut->bigRowSize = bigRowIx; + +// Add hash output fields to hjTableCols +for (dtf = hjFieldList; dtf != NULL; dtf = dtf->next) + hjTableInfoAddVal(&hjTableCols, dtf, db); + +// Build up jmOut->hashJoins +slReverse(&hjTableCols); +struct hjTableInfo *hjti; +int i; +for (i = 0, hjti = hjTableCols; hjti != NULL; hjti = hjti->next, i++) + { + if (hjti->jp == NULL) + errAbort("hjTableInfo for %s has NULL jp", hjti->dbTable); + if (hjti->hashValCols == NULL) + errAbort("hjTableInfo for %s has NULL hashValCols", hjti->dbTable); + uint bigRowKeyIx = mustFindIx(fieldToIx, hjti->jp->a, db, + "joinMixerNew: Can't find index of hashJoin index"); + struct joinerField *jfA = joinerSetFindField(hjti->jp->identifier, hjti->jp->a); + struct joinerField *jfB = joinerSetFindField(hjti->jp->identifier, hjti->jp->b); + uint valCount = slCount(hjti->hashValCols); + uint bigRowColIxs[valCount]; + struct joinerDtf *col; + int colIx; + for (colIx = 0, col = hjti->hashValCols; col != NULL; col = col->next, colIx++) + bigRowColIxs[colIx] = mustFindIx(fieldToIx, col, db, + "joinMixerNew: Missing bigRow ix for"); + slAddHead(&jmOut->hashJoins, + hashJoinNew(hjti->jp->b, bigRowKeyIx, + hjti->hashValCols, bigRowColIxs, + jfA, jfB, naForMissing)); + } +slReverse(&jmOut->hashJoins); + +// Fill in each output field's index into bigRow +AllocArray(jmOut->outIxs, slCount(outputFieldList)); +uint outRowIx; +for (outRowIx = 0, dtf = outputFieldList; dtf != NULL; dtf = dtf->next, outRowIx++) + jmOut->outIxs[outRowIx] = mustFindIx(fieldToIx, dtf, db, + "joinMixerNew: no bigRowIx for output field"); + +joinerPairFreeList(&hjRouteList); +joinerDtfFreeList(&hjFieldList); +hjTableInfoFreeList(&hjTableCols); +hashFree(&fieldToIx); +hashFree(&sqlTables); +joinerDtfFreeList(&tablesNeeded); +return jmOut; +} + +void joinMixerFree(struct joinMixer **pJm) +/* Free joinMixer's holdings unless already NULL. */ +{ +if (!pJm || !*pJm) + return; +struct joinMixer *jm = *pJm; +joinerPairFreeList(&jm->sqlRouteList); +joinerDtfFreeList(&jm->sqlFieldList); +hashJoinFreeList(&jm->hashJoins); +freeMem(jm->outIxs); +freez(pJm); +}