c8e1dc987ba5532c65be54d8f43af1ec4337fb10 angie Sat Nov 21 09:52:07 2015 -0800 Back-end implementation of Data Integrator's support for related tables and fields using all.joiner. Most joins are implemented using a new module, hashJoin.c; but SQL joins are used in certain cases when hash joins are impractical and SQL joins are actually faster. A new module joinMixer determines which joins should be implemented by hashJoin vs SQL, and computes row indices for hashJoin objects to find keys (from SQL or other hashJoins) and store results. The SQL join info from joinMixer is translated into SQL queries in annoStreamDb. annoStreamDb also generates its own autoSql asObject, adding the fields from related tables after the fields of the main track table. Main changes: - annoStreamDb.c - main table SQL query now uses table.field instead of just field to avoid clashes with the same field name in different tables - SQL joins return multiple rows for a single main table row when there are multiple matching rows in a related table; these rows need to be squashed into one row with the multiple matches comma-separated, both to match hgTables behavior and to avoid overflow of rowBuf. (glomSqlDup) - as mentioned above, generate joining SQL queries when necessary and generate its own asObj including selected fields from related tables. - parse JSON config object with relatedTables spec from UI via hgi_querySpec. hashJoin basically slurps a related table into a big hash of keys to values, performs lookups (possibly of multiple keys), and formats each column's results. It includes a lot of tweaks to match hgTables/joining.c output char-for-char: collapse adjacent duplicate matches, commas at end of matches from multiple key lookups, reversed order of multiple match values. hgTables/joining.c uses arrays of slNames, but in order to avoid all that allocation I'm just glomming into an array of reused dyStrings. 
joinMixer takes a list of fields to include in output, gets a list of joins to be performed (from joinerRouteThroughAll), applies some simple rough heuristics to guess whether a join is practical in SQL, and decides which joins to do by SQL and which to do by hashJoin. It plans a row format with several groups of fields in this order: main table fields, related table fields to appear in the output, related table fields needed by hashJoins, hashJoin result fields needed by other hashJoins, and hashJoin result fields to appear in output. It initializes hashJoins with precomputed row indexes and also provides a mapping from big-row columns to the columns that appear in output. Thanks to Matt for testing on demo6 during development. refs #15544 diff --git src/lib/obscure.c src/lib/obscure.c index b707da3..5208b76 100644 --- src/lib/obscure.c +++ src/lib/obscure.c @@ -1,831 +1,846 @@ /* Obscure stuff that is handy every now and again. * * This file is copyright 2002 Jim Kent, but license is hereby * granted for all use - public, private or commercial. * * NOTE(review): in this copy of the diff one of the include directives below has no header name after it; the name appears to have been lost in extraction and should be restored from the repository. */ #include "common.h" #include #include "portable.h" #include "localmem.h" #include "hash.h" #include "obscure.h" #include "linefile.h" static int _dotForUserMod = 100; /* dotForUser() prints one dot per this many calls; changed by dotForUserInit(). */ long incCounterFile(char *fileName) /* Increment the counter stored in fileName and return the new value. * If the file cannot be opened for update it is opened for writing instead and the count starts from 0, so 1 is written and returned; if that open fails too, 1 is returned and nothing is written. * NOTE(review): the original comment called this a 32 bit value, but the code writes a C long (sizeof(val) bytes, raw host representation) and presumably reads it back the same way via mustReadOne(); the fwrite() result is not checked. */ { long val = 0; FILE *f = fopen(fileName, "r+b"); if (f != NULL) { mustReadOne(f, val); rewind(f); } else { f = fopen(fileName, "wb"); } ++val; if (f != NULL) { fwrite(&val, sizeof(val), 1, f); if (fclose(f) != 0) errnoAbort("fclose failed"); } return val; } int digitsBaseTwo(unsigned long x) /* Return base two # of digits (0 when x is 0). */ { int digits = 0; while (x) { digits += 1; x >>= 1; } return digits; } int digitsBaseTen(int x) /* Return number of digits base 10. A negative x counts its minus sign as one more digit, so -5 gives 2. NOTE(review): x = -x overflows when x is INT_MIN. 
*/ { int digCount = 1; if (x < 0) { digCount = 2; x = -x; } while (x >= 10) { digCount += 1; x /= 10; } return digCount; } void writeGulp(char *file, char *buf, int size) /* Write out a bunch of memory: size bytes from buf go to file, which is opened in w mode. */ { FILE *f = mustOpen(file, "w"); mustWrite(f, buf, size); carefulClose(&f); } void readInGulp(char *fileName, char **retBuf, size_t *retSize) /* Read whole file in one big gulp. *retBuf receives a buffer of file size + 1 bytes with a zero stored after the data; *retSize, when not NULL, receives the file size. NOTE(review): the fclose() result is not checked. */ { size_t size = (size_t)fileSize(fileName); char *buf; FILE *f = mustOpen(fileName, "rb"); *retBuf = buf = needLargeMem(size+1); mustRead(f, buf, size); buf[size] = 0; /* Just in case it needs zero termination. */ fclose(f); if (retSize != NULL) *retSize = size; } void readAllWords(char *fileName, char ***retWords, int *retWordCount, char **retBuf) /* Read in whole file and break it into words. You need to freeMem both * *retWords and *retBuf when done. *retWords is left NULL when the file * contains no words. (The earlier wording said to free *retWordCount, but * that is the int count; the allocated array is *retWords.) */ { int wordCount; char *buf = NULL; char **words = NULL; size_t bufSize; readInGulp(fileName, &buf, &bufSize); wordCount = chopByWhite(buf, NULL, 0); if (wordCount != 0) { words = needMem(wordCount * sizeof(words[0])); chopByWhite(buf, words, wordCount); } *retWords = words; *retWordCount = wordCount; *retBuf = buf; } int countWordsInFile(char *fileName) /* Count number of words in file, summing the per-line counts from chopByWhite(). */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line; int wordCount = 0; while (lineFileNext(lf, &line, NULL)) wordCount += chopByWhite(line, NULL, 0); lineFileClose(&lf); return wordCount; } struct hash *hashWordsInFile(char *fileName, int hashSize) /* Create a hash of space delimited words in file. Each word is added with a NULL value. 
*/ { struct hash *hash = newHash(hashSize); struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line, *word; while (lineFileNext(lf, &line, NULL)) { while ((word = nextWord(&line)) != NULL) hashAdd(hash, word, NULL); } lineFileClose(&lf); return hash; } struct hash *hashNameIntFile(char *fileName) /* Given a two column file (name, integer value) return a * hash keyed by name with integer values */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[2]; struct hash *hash = hashNew(16); while (lineFileRow(lf, row)) hashAddInt(hash, row[0], lineFileNeedNum(lf, row, 1)); lineFileClose(&lf); return hash; } struct hash *hashTwoColumnFile(char *fileName) /* Given a two column file (key, value) return a hash. Each value is cloned into hash->lm before being added. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[2]; struct hash *hash = hashNew(16); while (lineFileRow(lf, row)) { char *name = row[0]; char *value = lmCloneString(hash->lm, row[1]); hashAdd(hash, name, value); } lineFileClose(&lf); return hash; } struct slName *readAllLines(char *fileName) /* Read all lines of file into a list. (Removes trailing carriage return.) */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); struct slName *list = NULL, *el; char *line; while (lineFileNext(lf, &line, NULL)) { el = newSlName(line); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } void copyFile(char *source, char *dest) /* Copy file from source to dest, going through a 64k buffer. dest is created with mode 0777. * NOTE(review): only a negative write() result is treated as an error, so a short write is not noticed, and a read() error simply ends the copy loop. */ { int bufSize = 64*1024; char *buf = needMem(bufSize); int bytesRead; int s, d; s = open(source, O_RDONLY); if (s < 0) errAbort("Couldn't open %s. %s\n", source, strerror(errno)); d = creat(dest, 0777); if (d < 0) { close(s); errAbort("Couldn't open %s. %s\n", dest, strerror(errno)); } while ((bytesRead = read(s, buf, bufSize)) > 0) { if (write(d, buf, bytesRead) < 0) errAbort("Write error on %s. 
%s\n", dest, strerror(errno)); } close(s); if (close(d) != 0) errnoAbort("close failed"); freeMem(buf); } void copyOpenFile(FILE *inFh, FILE *outFh) /* Copy an open stdio file: everything left in inFh is written to outFh one character at a time, then both streams are checked with ferror(). */ { int c; while ((c = fgetc(inFh)) != EOF) fputc(c, outFh); if (ferror(inFh)) errnoAbort("file read failed"); if (ferror(outFh)) errnoAbort("file write failed"); } void cpFile(int s, int d) /* Copy from source file to dest until reach end of file. A negative read() result ends the loop the same way end of file does. */ { int bufSize = 64*1024, readSize; char *buf = needMem(bufSize); for (;;) { readSize = read(s, buf, bufSize); if (readSize > 0) mustWriteFd(d, buf, readSize); if (readSize <= 0) break; } freeMem(buf); } void *intToPt(int i) /* Convert integer to pointer. Use when really want to store an * int in a pointer field. Done by offsetting a NULL char pointer by i. */ { char *pt = NULL; return pt+i; } int ptToInt(void *pt) /* Convert pointer to integer. Use when really want to store a * pointer in an int. The result is the char distance of pt from NULL, narrowed to int. */ { char *a = NULL, *b = pt; return b - a; } void *sizetToPt(size_t i) /* Convert size_t to pointer. Use when really want to store a * size_t in a pointer. */ { char *pt = NULL; return pt+i; } size_t ptToSizet(void *pt) /* Convert pointer to size_t. Use when really want to store a * pointer in a size_t. */ { char *a = NULL, *b = pt; return b - a; } boolean parseQuotedStringNoEscapes( char *in, char *out, char **retNext) /* Read quoted string from in (which should begin with first quote). * Write unquoted string to out, which may be the same as in. * Return pointer to character past end of string in *retNext. * Return FALSE if can't find end. * Unlike parseQuotedString() do not treat backslash as an escape * character, merely pass it on through. The quote character is whatever in[0] holds; warn() is called when no closing quote is found. 
*/ { char c, *s = in; int quoteChar = *s++; for (;;) { c = *s++; if (c == 0) { warn("Unmatched %c", quoteChar); return FALSE; } else if (c == quoteChar) break; else *out++ = c; } *out = 0; if (retNext != NULL) *retNext = s; return TRUE; } boolean parseQuotedString( char *in, char *out, char **retNext) /* Read quoted string from in (which should begin with first quote). * Write unquoted string to out, which may be the same as in. * Return pointer to character past end of string in *retNext. * Return FALSE if can't find end. * A backslash followed by a backslash or by the quote character yields just * that character; a backslash before anything else is kept together with the * character that follows it. */ { char c, *s = in; int quoteChar = *s++; boolean escaped = FALSE; for (;;) { c = *s++; if (c == 0) { warn("Unmatched %c", quoteChar); return FALSE; } if (escaped) { if (c == '\\' || c == quoteChar) *out++ = c; else { *out++ = '\\'; *out++ = c; } escaped = FALSE; } else { if (c == '\\') escaped = TRUE; else if (c == quoteChar) break; else *out++ = c; } } *out = 0; if (retNext != NULL) *retNext = s; return TRUE; } char *nextQuotedWord(char **pLine) /* Generalization of nextWord. Returns next quoted * string or if no quotes next word. Updates *pLine * to point past word that is returned. Does not return * quotes. Returns NULL when nothing is left on the line or when a * quoted string has no closing quote. */ { char *line, c; line = skipLeadingSpaces(*pLine); if (line == NULL || line[0] == 0) return NULL; c = *line; if (c == '"' || c == '\'') { if (!parseQuotedString(line, line, pLine)) return NULL; return line; } else { return nextWord(pLine); } } void escCopy(char *in, char *out, char toEscape, char escape) /* Copy in to out, escaping as needed. Out better be big enough. * (Worst case is strlen(in)*2 + 1.) The terminating zero is copied as well. */ { char c; for (;;) { c = *in++; if (c == toEscape) *out++ = escape; *out++ = c; if (c == 0) break; } } char *makeEscapedString(char *in, char toEscape) /* Return string that is a copy of in, but with all * toEscape characters preceded by '\' * When done freeMem result. 
*/ { int newSize = strlen(in) + countChars(in, toEscape); char *out = needMem(newSize+1); escCopy(in, out, toEscape, '\\'); return out; } char *makeQuotedString(char *in, char quoteChar) /* Create a string surrounded by quoteChar, with internal * quoteChars escaped. freeMem result when done. * NOTE(review): the zero written by escCopy() lands at out[newSize-1] and is * then overwritten by the closing quote, so the final terminator at * out[newSize] presumably relies on needMem() returning zeroed memory -- confirm. */ { int newSize = 2 + strlen(in) + countChars(in, quoteChar); char *out = needMem(newSize+1); out[0] = quoteChar; escCopy(in, out+1, quoteChar, '\\'); out[newSize-1] = quoteChar; return out; } struct hash *hashThisEqThatLine(char *line, int lineIx, boolean firstStartsWithLetter) /* Return a symbol table from a line of form: * 1-this1=val1 2-this='quoted val2' var3="another val" * If firstStartsWithLetter is true, then the left side of the equals must start with * a letter. Parsing is done on a private copy of line, and each value is * stored in the hash as a cloneString() copy. lineIx is only used in error messages. */ { char *dupe = cloneString(line); char *s = dupe, c; char *var, *val; struct hash *hash = newHash(8); for (;;) { if ((var = skipLeadingSpaces(s)) == NULL) break; if ((c = *var) == 0) break; if (firstStartsWithLetter && !isalpha(c)) errAbort("line %d of custom input: variable needs to start with letter '%s'", lineIx, var); val = strchr(var, '='); if (val == NULL) { errAbort("line %d of var %s in custom input: %s \n missing = in var/val pair", lineIx, var, line); } *val++ = 0; c = *val; if (c == '\'' || c == '"') { if (!parseQuotedString(val, val, &s)) errAbort("line %d of input: missing closing %c", lineIx, c); } else { s = skipToSpaces(val); if (s != NULL) *s++ = 0; } hashAdd(hash, var, cloneString(val)); } freez(&dupe); return hash; } struct hash *hashVarLine(char *line, int lineIx) /* Return a symbol table from a line of form: * var1=val1 var2='quoted val2' var3="another val" * Same as hashThisEqThatLine() with firstStartsWithLetter set to TRUE. */ { return hashThisEqThatLine(line, lineIx, TRUE); } struct slName *stringToSlNames(char *string) /* Convert string to a list of slNames separated by * white space, but allowing multiple words in quotes. * Quotes if any are stripped. The input string itself is not modified; a private copy is parsed and then freed. 
*/ { struct slName *list = NULL, *name; char *dupe = cloneString(string); char c, *s = dupe, *e; for (;;) { if ((s = skipLeadingSpaces(s)) == NULL) break; if ((c = *s) == 0) break; if (c == '\'' || c == '"') { if (!parseQuotedString(s, s, &e)) errAbort("missing closing %c in %s", c, string); } else { e = skipToSpaces(s); if (e != NULL) *e++ = 0; } name = slNameNew(s); slAddHead(&list, name); s = e; } freeMem(dupe); slReverse(&list); return list; } struct slName *charSepToSlNames(char *string, char c) /* Convert character-separated list of items to slName list. * Note that the last occurrence of c is optional. (That * is, for a comma-separated list, a,b,c and a,b,c, are * equivalent.) A NULL or empty string gives a NULL list. */ { struct slName *list = NULL, *el; char *s, *e; s = string; while (s != NULL && s[0] != 0) { e = strchr(s, c); if (e == NULL) { el = slNameNew(s); slAddHead(&list, el); break; } else { el = slNameNewN(s, e - s); slAddHead(&list, el); s = e+1; } } slReverse(&list); return list; } struct slName *commaSepToSlNames(char *commaSep) /* Convert comma-separated list of items to slName list. */ { return charSepToSlNames(commaSep, ','); } void sprintLongWithCommas(char *s, long long l) /* Print out a long number with commas at thousands, millions, etc. The text is written to s with sprintf(), so s must be large enough for it. NOTE(review): a negative l passes none of the >= tests and is printed without commas, and nothing above the trillions group gets further commas. 
*/ { long long trillions, billions, millions, thousands; if (l >= 1000000000000LL) { trillions = l/1000000000000LL; l -= trillions * 1000000000000LL; billions = l/1000000000; l -= billions * 1000000000; millions = l/1000000; l -= millions * 1000000; thousands = l/1000; l -= thousands * 1000; sprintf(s, "%lld,%03lld,%03lld,%03lld,%03lld", trillions, billions, millions, thousands, l); } else if (l >= 1000000000) { billions = l/1000000000; l -= billions * 1000000000; millions = l/1000000; l -= millions * 1000000; thousands = l/1000; l -= thousands * 1000; sprintf(s, "%lld,%03lld,%03lld,%03lld", billions, millions, thousands, l); } else if (l >= 1000000) { millions = l/1000000; l -= millions * (long long)1000000; thousands = l/1000; l -= thousands * 1000; sprintf(s, "%lld,%03lld,%03lld", millions, thousands, l); } else if (l >= 1000) { thousands = l/1000; l -= thousands * 1000; sprintf(s, "%lld,%03lld", thousands, l); } else sprintf(s, "%lld", l); } void printLongWithCommas(FILE *f, long long l) /* Print out a long number with commas at thousands, millions, etc. Formats into a 32 byte local buffer via sprintLongWithCommas() and writes that to f. */ { char ascii[32]; sprintLongWithCommas(ascii, l); fprintf(f, "%s", ascii); } void sprintWithGreekByte(char *s, int slength, long long size) /* Numbers formatted with PB, TB, GB, MB, KB, B. The unit is found by repeatedly * multiplying the divisor by 1024; one decimal is shown when the scaled value is below 10, none otherwise. * NOTE(review): the unit index is not capped at the six entries of greek[], so a * size of 1024 to the 6th power or more would index past the end of the table. */ { char *greek[] = {"B", "KB", "MB", "GB", "TB", "PB"}; int i = 0; long long d = 1; while ((size/d) >= 1024) { ++i; d *= 1024; } double result = ((double)size)/d; if (result < 10) safef(s,slength,"%3.1f %s",((double)size)/d, greek[i]); else safef(s,slength,"%3.0f %s",((double)size)/d, greek[i]); } void printWithGreekByte(FILE *f, long long l) /* Print with formatting in gigabyte, terabyte, etc. Formats into a 32 byte local buffer via sprintWithGreekByte() and writes that to f. */ { char buf[32]; sprintWithGreekByte(buf, sizeof(buf), l); fprintf(f, "%s", buf); } void shuffleArrayOfChars(char *array, int arraySize) /* Shuffle array of characters of given size given number of times. NOTE(review): the body of this function is incomplete in this copy of the diff, see the note inside it. 
*/ { char c; int i, randIx; /* Randomly permute an array using the method from Cormen, et al. * NOTE(review): in this copy the source text right after this comment is truncated. The loop header that starts with for (i=0; i runs straight into the tail of a later list-shuffling function (the one that ends by storing through *pL), so the rest of shuffleArrayOfChars, any definitions in between, and the start of that later function are missing here. A second, smaller gap follows where for (i=0; i runs into next = list. The text appears to have been lost between a less-than and a greater-than sign during extraction; restore it from the repository before relying on this code. */ for (i=0; i 1) { struct slList *el; struct slList **array; int i; array = needLargeMem(count * sizeof(*array)); for (el = list, i=0; el != NULL; el = el->next, i++) array[i] = el; for (i=0; i<4; ++i) shuffleArrayOfPointers(array, count); list = NULL; for (i=0; inext = list; list = array[i]; } freeMem(array); slReverse(&list); *pL = list; } } void *slListRandomReduce(void *list, double reduceRatio) /* Reduce list to approximately reduceRatio times original size. Destroys original list. * A ratio of 1.0 or more returns list untouched. Otherwise each element is kept when * rand() is at most RAND_MAX * reduceRatio, and kept elements are added to the new list with slAddHead(). */ { if (reduceRatio >= 1.0) return list; int threshold = RAND_MAX * reduceRatio; struct slList *newList = NULL, *next, *el; for (el = list; el != NULL; el = next) { next = el->next; if (rand() <= threshold) { slAddHead(&newList, el); } } return newList; } void *slListRandomSample(void *list, int maxCount) /* Return a sublist of list with at most maxCount. Destroy list in process. * Lists already within maxCount are returned as is. Longer lists are first thinned with * slListRandomReduce() using a ratio 5 percent above the target (only when the target ratio is below 0.9), * then shuffled and cut off after maxCount elements if still too long. * NOTE(review): the visible tail of the list-shuffling function stores its result through *pL, * which suggests shuffleList() expects the address of the list pointer, yet the list pointer * itself is passed below -- confirm against the full definition. */ { if (list == NULL) return list; int initialCount = slCount(list); if (initialCount <= maxCount) return list; double reduceRatio = (double)maxCount/initialCount; if (reduceRatio < 0.9) { double conservativeReduceRatio = reduceRatio * 1.05; list = slListRandomReduce(list, conservativeReduceRatio); } int midCount = slCount(list); if (midCount <= maxCount) return list; shuffleList(list); struct slList *lastEl = slElementFromIx(list, maxCount-1); lastEl->next = NULL; return list; } char *stripCommas(char *position) /* Make a new string with commas stripped out. Returns NULL when position is NULL. * NOTE(review): cloneString(position) is called before the NULL check, so this relies on cloneString() accepting NULL -- confirm. */ { char *newPos = cloneString(position); char *nPtr = newPos; if (position == NULL) return NULL; while((*nPtr = *position++)) if (*nPtr != ',') nPtr++; return newPos; } void dotForUserInit(int dotMod) /* Set how often dotForUser() outputs a dot. dotMod must be positive (checked with assert). */ { assert(dotMod > 0); _dotForUserMod = dotMod; } void dotForUser() /* Write out a dot every _dotForUserMod times this is called. The dot goes to stderr, which is flushed right away. */ { static int dot = -10; /* Check to see if dot has been initialized; -10 is the start value that marks it as not yet set. 
*/ if(dot == - 10) dot = _dotForUserMod; if (--dot <= 0) { putc('.', stderr); fflush(stderr); dot = _dotForUserMod; } } void spaceToUnderbar(char *s) /* Convert white space to underbar, in place; every character for which isspace() is true becomes an underbar. */ { char c; while ((c = *s) != 0) { if (isspace(c)) *s = '_'; ++s; } } void printVmPeak() /* print to stderr peak Vm memory usage (if /proc/ business exists). * Prints the first line of the status file of this process that contains VmPeak, or a * not-available message when that file cannot be opened. * NOTE(review): the first fprintf passes pid (a pid_t) to a d conversion without the (int) cast that the safef call uses. */ { pid_t pid = getpid(); char temp[256]; safef(temp, sizeof(temp), "/proc/%d/status", (int) pid); struct lineFile *lf = lineFileMayOpen(temp, TRUE); if (lf) { char *line; while (lineFileNextReal(lf, &line)) { if (stringIn("VmPeak", line)) { fprintf(stderr, "# pid=%d: %s\n", pid, line); break; } } lineFileClose(&lf); } else fprintf(stderr, "# printVmPeak: %s - not available\n", temp); fflush(stderr); } boolean nameInCommaList(char *name, char *commaList) /* Return TRUE if name is in comma separated list. A NULL commaList gives FALSE. * An item matches only when it equals name and is followed by a comma or the end of the list. * NOTE(review): memcmp() is asked to compare nameLen bytes even when fewer remain in commaList, * which can read past its terminator; strncmp() would stop at the terminator. */ { if (commaList == NULL) return FALSE; int nameLen = strlen(name); for (;;) { char c = *commaList; if (c == 0) return FALSE; if (memcmp(name, commaList, nameLen) == 0) { c = commaList[nameLen]; if (c == 0 || c == ',') return TRUE; } commaList = strchr(commaList, ','); if (commaList == NULL) return FALSE; commaList += 1; } } +boolean endsWithWordComma(char *string, char *word) +/* Return TRUE if string ends with word possibly followed by a comma, and the beginning + * of word within string is the beginning of string or follows a comma. + * For example, with word b the strings b and a,b and a,b, all match, while ab and b,c + * do not (assuming sameStringN() compares the first wordLen characters for equality). + * A trailing comma is only treated as optional when string is longer than word. */ +{ +int stringLen = strlen(string); +int wordLen = strlen(word); +int commaSize = (stringLen > wordLen && string[stringLen-1] == ',') ? 1 : 0; +if (stringLen < wordLen + commaSize) + return FALSE; +int wordOffset = stringLen - commaSize - wordLen; +if (sameStringN(string + wordOffset, word, wordLen) && + (wordOffset == 0 || string[wordOffset-1] == ',')) + return TRUE; +return FALSE; +}