533112afe2a2005e80cdb1f82904ea65032d4302 braney Sat Oct 2 11:37:34 2021 -0700 split hg/lib into two separate libaries, one only used by the cgis diff --git src/hg/lib/joinMixer.c src/hg/lib/joinMixer.c deleted file mode 100644 index 9b5316a..0000000 --- src/hg/lib/joinMixer.c +++ /dev/null @@ -1,363 +0,0 @@ -/* joinMixer - implement multiple joins by sql joins and/or hashJoins, depending on which - * should work best for each joined table. */ - -/* Copyright (C) 2015 The Regents of the University of California - * See README in this or parent directory for licensing information. */ - -#include "joinMixer.h" -#include "hdb.h" - -static void addUniqueJoinerDt(struct joinerDtf **pTableList, struct hash *uniqHash, - struct joinerDtf *dt, char *db) -/* Use a string key into the hash to build up a unique list of cloned joinerDtf's with - * ignored (possibly NULL) fields (table-only, hence "dt" not "dtf"). */ -{ -char dbTable[PATH_LEN]; -joinerDtfToSqlTableString(dt, db, dbTable, sizeof(dbTable)); -if (! hashLookup(uniqHash, dbTable)) - { - slAddHead(pTableList, joinerDtfClone(dt)); - hashAdd(uniqHash, dbTable, NULL); - } -} - -static struct joinerDtf *tablesInOutput(char *db, char *mainTable, - struct joinerDtf *outputFieldList) -/* Return a uniquified list of tables (NULL for field) that we need for selected output fields - * and filters, if applicable. Joining these tables may require additional intermediate tables. */ -{ -struct hash *uniqHash = hashNew(0); -struct joinerDtf *tableList = NULL; -// Always include the main table, even if its fields aren't in output. -struct joinerDtf *mainDt = joinerDtfNew(db, mainTable, NULL); -addUniqueJoinerDt(&tableList, uniqHash, mainDt, db); -struct joinerDtf *dtf; -for (dtf = outputFieldList; dtf != NULL; dtf = dtf->next) - addUniqueJoinerDt(&tableList, uniqHash, dtf, db); -hashFree(&uniqHash); -slReverse(&tableList); -return tableList; -} - -static boolean noFancyKeywords(struct joinerField *jfA, struct joinerField *jfB) -/* Return TRUE unless jfA or jfB has a keyword that is too fancy for sql joins. */ -{ -if (isNotEmpty(jfA->separator) && differentString(jfA->separator, ",")) - return FALSE; -if (isNotEmpty(jfB->separator)) - return FALSE; -if (jfA->chopBefore || jfA->chopAfter || jfB->chopBefore || jfB->chopAfter) - return FALSE; -return TRUE; -} - -static boolean dependsOn(struct joinerPair *jpA, struct joinerPair *jpBList) -/* Return TRUE if jpA->a's table is found among the ->b's in jpBList. */ -{ -struct joinerPair *jpB; -for (jpB = jpBList; jpB != NULL; jpB = jpB->next) - { - if (joinerDtfSameTable(jpA->a, jpB->b)) - return TRUE; - } -return FALSE; -} - -static void partitionJoins(char *db, struct joinerPair *routeList, uint mainTableRowCount, - struct joinerPair **retSqlRouteList, - struct joinerPair **retHashJoinRouteList) -/* For each table pair in routeList, figure out whether a sql join is possible and - * perhaps faster than brute force join. If it looks like sql is faster, add to - * retSqlRouteList, otherwise to retHashJoinRouteList. */ -{ -struct sqlConnection *conn = hAllocConn(db); -boolean mysqlFasterForMainTable = mainTableRowCount < 10000; -struct joinerPair *jp, *jpNext; -for (jp = routeList; jp != NULL; jp = jpNext) - { - jpNext = jp->next; - boolean useSql = FALSE; - struct joinerField *jfA = joinerSetFindField(jp->identifier, jp->a); - struct joinerField *jfB = joinerSetFindField(jp->identifier, jp->b); - if (noFancyKeywords(jfA, jfB)) - { - char dbTable[PATH_LEN]; - joinerDtfToSqlTableString(jp->b, db, dbTable, sizeof(dbTable)); - uint rowCountB = sqlRowCount(conn, dbTable); - boolean relatedMuchBigger = mainTableRowCount + 10000000 < rowCountB; - if (mysqlFasterForMainTable || relatedMuchBigger) - useSql = TRUE; - } - if (useSql && !dependsOn(jp, *retHashJoinRouteList)) - slAddHead(retSqlRouteList, jp); - else - slAddHead(retHashJoinRouteList, jp); - } -slReverse(retSqlRouteList); -slReverse(retHashJoinRouteList); -hFreeConn(&conn); -} - -static struct hash *tablesInRouteList(char *db, char *mainTable, struct joinerPair *routeList) -/* Return a hash of names of tables in routeList to NULL values. */ -{ -struct hash *hash = hashNew(0); -// Always add mainTable -char dbTable[PATH_LEN]; -hashAdd(hash, mainTable, NULL); -// Add all pairs' ->b's -struct joinerPair *jp; -for (jp = routeList; jp != NULL; jp = jp->next) - { - joinerDtfToSqlTableString(jp->b, db, dbTable, sizeof(dbTable)); - hashAdd(hash, dbTable, NULL); - } -return hash; -} - -static void addField(struct hash *fieldToIx, struct joinerDtf *dtf, char *db, uint *pIx) -/* If stringified dtf is not already in hash, add it with value=*pIx and increment *pIx. */ -{ -char dtField[PATH_LEN]; -joinerDtfToSqlFieldString(dtf, db, dtField, sizeof(dtField)); -if (!hashLookup(fieldToIx, dtField)) - { - hashAddInt(fieldToIx, dtField, *pIx); - *pIx = *pIx + 1; - } -} - -static uint mustFindIx(struct hash *fieldToIx, struct joinerDtf *dtf, char *db, char *errMsg) -/* Look up nonnegative int value of stringified dtf; die if not found, otherwise return it. */ -{ -char dtField[PATH_LEN]; -joinerDtfToSqlFieldString(dtf, db, dtField, sizeof(dtField)); -int bigRowKeyIx = hashIntValDefault(fieldToIx, dtField, -1); -if (bigRowKeyIx < 0) - errAbort("%s '%s'", errMsg, dtField); -return (uint)bigRowKeyIx; -} - -struct hjTableInfo -// Info accumulator for constructing a new hashJoin -{ - struct hjTableInfo *next; - char *dbTable; - struct joinerPair *jp; - struct joinerDtf *hashValCols; -}; - -static struct hjTableInfo *hjTableInfoNew(char *dbTable, struct joinerPair *jp, - struct joinerDtf *hashValCol) -// Return a new hjTableInfo for dbTable; if jp is non-NULL, jp->b is the hash key field. -{ -struct hjTableInfo *self; -AllocVar(self); -self->dbTable = cloneString(dbTable); -self->jp = jp; -self->hashValCols = joinerDtfClone(hashValCol); -return self; -} - -static struct hjTableInfo *hjTableInfoFind(struct hjTableInfo *list, char *dbTable) -// Just use slPair find because the first two fields are next and char *name. -{ -return (struct hjTableInfo *)slPairFind((struct slPair *)list, dbTable); -} - -static void hjTableInfoAddKey(struct hjTableInfo **pList, struct joinerPair *jp, char *db) -// jp->b is the hash key field for its table; look up jp->b's table in pList, add if necessary, -// setting the table's jp. -{ -char dbTable[PATH_LEN]; -joinerDtfToSqlTableString(jp->b, db, dbTable, sizeof(dbTable)); -struct hjTableInfo *infoForTable = hjTableInfoFind(*pList, dbTable); -if (infoForTable == NULL) - slAddHead(pList, hjTableInfoNew(dbTable, jp, NULL)); -else if (infoForTable->jp != NULL) - errAbort("Multiple keys for %s: %s and %s", dbTable, infoForTable->jp->b->field, jp->b->field); -else - infoForTable->jp = jp; -} - -static void hjTableInfoAddVal(struct hjTableInfo **pList, struct joinerDtf *dtf, char *db) -// Find dtf in pList -- it should be there already if hjRouteList is traversed in order -- -// and add dtf to its hash value column list if not already there. -{ -char dbTable[PATH_LEN]; -joinerDtfToSqlTableString(dtf, db, dbTable, sizeof(dbTable)); -struct hjTableInfo *infoForTable = hjTableInfoFind(*pList, dbTable); -if (infoForTable == NULL) - { - errAbort("hjTableInfoAddVal: can't find table '%s' in list", dbTable); - } -else if (! joinerDtfFind(infoForTable->hashValCols, dtf)) - slAddTail(&infoForTable->hashValCols, joinerDtfClone(dtf)); -} - -static void hjTableInfoFree(struct hjTableInfo **pHjti) -// Free up *pHjti & contents if not NULL. -{ -if (! pHjti || ! *pHjti) - return; -struct hjTableInfo *hjti = *pHjti; -freeMem(hjti->dbTable); -joinerDtfFreeList(&hjti->hashValCols); -freez(pHjti); -} - -static void hjTableInfoFreeList(struct hjTableInfo **pHjtiList) -// Free up every member of *pHjtiList if not NULL. -{ -if (! pHjtiList || ! *pHjtiList) - return; -struct hjTableInfo *hjti = *pHjtiList; -while (hjti != NULL) - { - struct hjTableInfo *hjtiNext = hjti->next; - hjTableInfoFree(&hjti); - hjti = hjtiNext; - } -*pHjtiList = NULL; -} - -struct joinMixer *joinMixerNew(struct joiner *joiner, char *db, char *mainTable, - struct joinerDtf *outputFieldList, - uint mainTableRowCount, boolean naForMissing) -/* If outputFieldList contains fields from more than one table, use joiner to figure - * out the route of table joins to relate all fields; for each table, predict whether - * it would be more efficient to join by sql or by hashJoin, taking into account - * anticipated row counts. Return info sufficient for building a sql query, a list - * of hashJoins, bigRow size, and indexes in bigRow for each output. - * If naForMissing is TRUE then the hashJoiner result columns will contain "n/a" when - * there is no match in the hash. */ -{ -struct joinMixer *jmOut; -AllocVar(jmOut); -// Figure out what tables we need to query, and whether each one should be joined -// by sql or hash: -struct joinerDtf *tablesNeeded = tablesInOutput(db, mainTable, outputFieldList); -struct joinerPair *routeList = joinerFindRouteThroughAll(joiner, tablesNeeded); -struct joinerPair *hjRouteList = NULL; -partitionJoins(db, routeList, mainTableRowCount, &jmOut->sqlRouteList, &hjRouteList); -// routeList was clobbered, make sure we don't try to use it again: -routeList = NULL; -char dbTable[PATH_LEN]; -// Split output fields into those that come from sql and those that come from hashJoins: -// Assign indices in external row to each output from mysql. -struct hash *fieldToIx = hashNew(0); -uint bigRowIx = 0; -struct hash *sqlTables = tablesInRouteList(db, mainTable, jmOut->sqlRouteList); -struct joinerDtf *hjFieldList = NULL; -struct joinerDtf *dtf; -for (dtf = outputFieldList; dtf != NULL; dtf = dtf->next) - { - joinerDtfToSqlTableString(dtf, db, dbTable, sizeof(dbTable)); - if (hashLookup(sqlTables, dbTable) || sameString(dbTable, mainTable)) - { - slAddHead(&jmOut->sqlFieldList, joinerDtfClone(dtf)); - addField(fieldToIx, dtf, db, &bigRowIx); - } - else - { - slAddHead(&hjFieldList, joinerDtfClone(dtf)); - // Don't add to bigRow/fieldToIx because we might need to tack on some sql fields - // for hashJoin keys. - } - } -slReverse(&jmOut->sqlFieldList); -slReverse(&hjFieldList); - -// Add hashJoin jp->a keys that come from sqlTables to sqlFieldList and bigRow if not already -// in sqlFieldList -struct joinerPair *jp; -for (jp = hjRouteList; jp != NULL; jp = jp->next) - { - joinerDtfToSqlTableString(jp->a, db, dbTable, sizeof(dbTable)); - if (hashLookup(sqlTables, dbTable) && !joinerDtfFind(jmOut->sqlFieldList, jp->a)) - { - slAddTail(&jmOut->sqlFieldList, joinerDtfClone(jp->a)); - addField(fieldToIx, jp->a, db, &bigRowIx); - } - } - -// Now that sqlFieldList is complete, add hashJoin output fields to bigRow. -for (dtf = hjFieldList; dtf != NULL; dtf = dtf->next) - addField(fieldToIx, dtf, db, &bigRowIx); - -// Add hashJoin key info (jp->b) to hjTableCols. If any hashJoin key fields take values -// produced by other hashJoins (jp->a), but those columns don't appear in output, add them -// to bigRow. -struct hjTableInfo *hjTableCols = NULL; -for (jp = hjRouteList; jp != NULL; jp = jp->next) - { - hjTableInfoAddKey(&hjTableCols, jp, db); - joinerDtfToSqlTableString(jp->a, db, dbTable, sizeof(dbTable)); - if (! hashLookup(sqlTables, dbTable)) - hjTableInfoAddVal(&hjTableCols, jp->a, db); - addField(fieldToIx, jp->a, db, &bigRowIx); - } - -// Done assigning bigRow indices; set bigRowSize. -jmOut->bigRowSize = bigRowIx; - -// Add hash output fields to hjTableCols -for (dtf = hjFieldList; dtf != NULL; dtf = dtf->next) - hjTableInfoAddVal(&hjTableCols, dtf, db); - -// Build up jmOut->hashJoins -slReverse(&hjTableCols); -struct hjTableInfo *hjti; -int i; -for (i = 0, hjti = hjTableCols; hjti != NULL; hjti = hjti->next, i++) - { - if (hjti->jp == NULL) - errAbort("hjTableInfo for %s has NULL jp", hjti->dbTable); - if (hjti->hashValCols == NULL) - errAbort("hjTableInfo for %s has NULL hashValCols", hjti->dbTable); - uint bigRowKeyIx = mustFindIx(fieldToIx, hjti->jp->a, db, - "joinMixerNew: Can't find index of hashJoin index"); - struct joinerField *jfA = joinerSetFindField(hjti->jp->identifier, hjti->jp->a); - struct joinerField *jfB = joinerSetFindField(hjti->jp->identifier, hjti->jp->b); - uint valCount = slCount(hjti->hashValCols); - uint bigRowColIxs[valCount]; - struct joinerDtf *col; - int colIx; - for (colIx = 0, col = hjti->hashValCols; col != NULL; col = col->next, colIx++) - bigRowColIxs[colIx] = mustFindIx(fieldToIx, col, db, - "joinMixerNew: Missing bigRow ix for"); - slAddHead(&jmOut->hashJoins, - hashJoinNew(hjti->jp->b, bigRowKeyIx, - hjti->hashValCols, bigRowColIxs, - jfA, jfB, naForMissing)); - } -slReverse(&jmOut->hashJoins); - -// Fill in each output field's index into bigRow -AllocArray(jmOut->outIxs, slCount(outputFieldList)); -uint outRowIx; -for (outRowIx = 0, dtf = outputFieldList; dtf != NULL; dtf = dtf->next, outRowIx++) - jmOut->outIxs[outRowIx] = mustFindIx(fieldToIx, dtf, db, - "joinMixerNew: no bigRowIx for output field"); - -joinerPairFreeList(&hjRouteList); -joinerDtfFreeList(&hjFieldList); -hjTableInfoFreeList(&hjTableCols); -hashFree(&fieldToIx); -hashFree(&sqlTables); -joinerDtfFreeList(&tablesNeeded); -return jmOut; -} - -void joinMixerFree(struct joinMixer **pJm) -/* Free joinMixer's holdings unless already NULL. */ -{ -if (!pJm || !*pJm) - return; -struct joinMixer *jm = *pJm; -joinerPairFreeList(&jm->sqlRouteList); -joinerDtfFreeList(&jm->sqlFieldList); -hashJoinFreeList(&jm->hashJoins); -freeMem(jm->outIxs); -freez(pJm); -}