24f85378dd074f3ea364a40b95b6ee27a72012dc angie Fri Oct 2 16:29:25 2015 -0700 On very long queries like selected fields of snp144 chr1 (>11M rows), hgIntegrator was spending about half of the time repetitively building up sourceName.columnName strings and looking them up in a hash to tell whether or not to include each column's data in the output. To avoid the waste, now it builds up an array (one for each source, primary + grators) of arrays of flags (one flag for each column) so it can do an array lookup instead of string-munging and hash lookup. ~2x speedup for selected fields of snp144 chr1 (4 minutes -> 2 minutes). mysql time is about half that; much of the other half is copying data in annoStreamDb.c:bufferRowsFromSqlQuery. diff --git src/lib/annoFormatTab.c src/lib/annoFormatTab.c index bbd84b6..35aadfb 100644 --- src/lib/annoFormatTab.c +++ src/lib/annoFormatTab.c @@ -2,110 +2,162 @@ /* Copyright (C) 2013 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "annoFormatTab.h" #include "annoGratorQuery.h" #include "dystring.h" struct annoFormatTab { struct annoFormatter formatter; // External interface char *fileName; // Output file name, can be "stdout" FILE *f; // Output file handle struct hash *columnVis; // Hash of columns that have been explicitly selected // or deselected by user. + boolean **columnFlags; // 2D array[sources][columns] of flags for whether each + // column of each source is to be included in output. + int *columnFlagLengths; // array[sources] of column counts (lengths of subarrays), + // for bounds-checking + int sourceCount; // Number of sources (primary + grators) boolean needHeader; // TRUE if we should print out the header }; static void makeFullColumnName(char *fullName, size_t size, char *sourceName, char *colName) /* If sourceName is non-empty, make fullName sourceName.colName, otherwise just colName. */ { if (isNotEmpty(sourceName)) safef(fullName, size, "%s.%s", sourceName, colName); else safecpy(fullName, size, colName); } void annoFormatTabSetColumnVis(struct annoFormatter *vSelf, char *sourceName, char *colName, boolean enabled) /* Explicitly include or exclude column in output. sourceName must be the same * as the corresponding annoStreamer source's name. */ { struct annoFormatTab *self = (struct annoFormatTab *)vSelf; if (! self->columnVis) self->columnVis = hashNew(0); char fullName[PATH_LEN]; makeFullColumnName(fullName, sizeof(fullName), sourceName, colName); hashAddInt(self->columnVis, fullName, enabled); } -static boolean columnIsIncluded(struct annoFormatTab *self, char *sourceName, char *colName) +static boolean columnNameIsIncluded(struct annoFormatTab *self, char *sourceName, char *colName) // Return TRUE if column has not been explicitly deselected. { if (self->columnVis) { char fullName[PATH_LEN]; makeFullColumnName(fullName, sizeof(fullName), sourceName, colName); int vis = hashIntValDefault(self->columnVis, fullName, 1); if (vis == 0) return FALSE; } return TRUE; } +static void addColumnFlagsForSource(struct annoFormatTab *self, struct annoStreamer *streamer) +/* Make an array of flags for whether to include each column from streamer. */ +{ +uint sourceIx = self->sourceCount; +int colCount = slCount(streamer->asObj->columnList); +AllocArray(self->columnFlags[sourceIx], colCount); +self->columnFlagLengths[sourceIx] = colCount; +struct asColumn *col; +uint colIx; +for (colIx = 0, col = streamer->asObj->columnList; col != NULL; col = col->next, colIx++) + self->columnFlags[sourceIx][colIx] = columnNameIsIncluded(self, streamer->name, col->name); +self->sourceCount++; +} + +static void makeColumnFlags(struct annoFormatTab *self, struct annoStreamer *primary, + struct annoStreamer *integrators) +/* Build arrays of flags for whether each column from each source is to be included + * in the output. */ +{ +int sourceCount = 1 + slCount(integrators); +AllocArray(self->columnFlags, sourceCount); +AllocArray(self->columnFlagLengths, sourceCount); +// self->sourceCount is incremented by each call to addColumnFlags +self->sourceCount = 0; +addColumnFlagsForSource(self, primary); +struct annoStreamer *grator; +for (grator = integrators; grator != NULL; grator = grator->next) + addColumnFlagsForSource(self, grator); +assert(self->sourceCount == sourceCount); +} + +INLINE boolean columnIsIncluded(struct annoFormatTab *self, uint sourceIx, uint colIx) +/* Look up flag for whether this source's column is included in output. */ +{ +if (sourceIx >= self->sourceCount) + errAbort("annoFormatTab: illegal sourceIx %d (count is %d)", sourceIx, self->sourceCount); +if (colIx >= self->columnFlagLengths[sourceIx]) + errAbort("annoFormatTab: illegal colIx %d (lengths[%d] is %d) -- out-of-order sources?", + colIx, sourceIx, self->columnFlagLengths[sourceIx]); +return self->columnFlags[sourceIx][colIx]; +} + static void printHeaderColumns(struct annoFormatTab *self, struct annoStreamer *source, - boolean *pIsFirst) + uint sourceIx, boolean *pIsFirst) /* Print names of included columns from this source. */ { FILE *f = self->f; char *sourceName = source->name; boolean isFirst = (pIsFirst && *pIsFirst); +uint colIx; struct asColumn *col; -for (col = source->asObj->columnList; col != NULL; col = col->next) +for (colIx = 0, col = source->asObj->columnList; col != NULL; col = col->next, colIx++) { - if (columnIsIncluded(self, sourceName, col->name)) + if (columnIsIncluded(self, sourceIx, colIx)) { if (isFirst) isFirst = FALSE; else fputc('\t', f); char fullName[PATH_LEN]; makeFullColumnName(fullName, sizeof(fullName), sourceName, col->name); fputs(fullName, f); } } if (pIsFirst != NULL) *pIsFirst = isFirst; } static void aftInitialize(struct annoFormatter *vSelf, struct annoStreamer *primary, struct annoStreamer *integrators) -/* Print header, regardless of whether we get any data after this. */ +/* Print header, regardless of whether we get any data after this. + * Also build arrays of flags for whether each column from each source is to be included + * in the output. */ { struct annoFormatTab *self = (struct annoFormatTab *)vSelf; +makeColumnFlags(self, primary, integrators); if (self->needHeader) { char *primaryHeader = primary->getHeader(primary); boolean isFirst = TRUE; if (isNotEmpty(primaryHeader)) fprintf(self->f, "# Header from primary input:\n%s", primaryHeader); fputc('#', self->f); - printHeaderColumns(self, primary, &isFirst); + printHeaderColumns(self, primary, 0, &isFirst); + uint sourceIx; struct annoStreamer *grator; - for (grator = integrators; grator != NULL; grator = grator->next) - printHeaderColumns(self, grator, &isFirst); + for (sourceIx = 1, grator = integrators; grator != NULL; grator = grator->next, sourceIx++) + printHeaderColumns(self, grator, sourceIx, &isFirst); fputc('\n', self->f); self->needHeader = FALSE; } } static char **bed4WordsFromAnnoRow(struct annoRow *row, char *fourth) /* Return an array of 4 words with row's chrom, chromStart, and chromEnd, and cloned fourth. */ { char **words; AllocArray(words, 4); words[0] = cloneString(row->chrom); char buf[PATH_LEN]; safef(buf, sizeof(buf), "%u", row->start); words[1] = cloneString(buf); safef(buf, sizeof(buf), "%u", row->end); @@ -157,93 +209,92 @@ else if (source->rowType == arWigVec || source->rowType == arWigSingle) { words = wordsFromWigRow(row, source->rowType); if (words != NULL) freeWhenDone = TRUE; } else errAbort("annoFormatTab: unrecognized row type %d from source %s", source->rowType, source->name); if (retFreeWhenDone != NULL) *retFreeWhenDone = freeWhenDone; return words; } static void printColumns(struct annoFormatTab *self, struct annoStreamer *streamer, - struct annoRow *row, boolean isFirst) + uint sourceIx, struct annoRow *row, boolean isFirst) /* Print columns in streamer's row (if NULL, print the right number of empty fields). */ { FILE *f = self->f; -char *sourceName = streamer->name; boolean freeWhenDone = FALSE; char **words = wordsFromRow(row, streamer, &freeWhenDone); struct asColumn *col; -int i; -for (col = streamer->asObj->columnList, i = 0; col != NULL; col = col->next, i++) +uint colIx; +for (col = streamer->asObj->columnList, colIx = 0; col != NULL; col = col->next, colIx++) { - if (columnIsIncluded(self, sourceName, col->name)) + if (columnIsIncluded(self, sourceIx, colIx)) { if (isFirst) isFirst = FALSE; else fputc('\t', f); if (words != NULL) - fputs((words[i] ? words[i] : ""), f); + fputs((words[colIx] ? words[colIx] : ""), f); } } -int wordCount = i; +int wordCount = colIx; if (freeWhenDone) { - for (i = 0; i < wordCount; i++) - freeMem(words[i]); + for (colIx = 0; colIx < wordCount; colIx++) + freeMem(words[colIx]); freeMem(words); } } static void aftComment(struct annoFormatter *fSelf, char *content) /* Print out a comment line. */ { if (strchr(content, '\n')) errAbort("aftComment: no multi-line input"); struct annoFormatTab *self = (struct annoFormatTab *)fSelf; fprintf(self->f, "# %s\n", content); } static void aftFormatOne(struct annoFormatter *vSelf, struct annoStreamRows *primaryData, struct annoStreamRows *gratorData, int gratorCount) /* Print out tab-separated columns that we have gathered in prior calls to aftCollect, * and start over fresh for the next line of output. */ { struct annoFormatTab *self = (struct annoFormatTab *)vSelf; // Got one row from primary; what's the largest # of rows from any grator? int maxRows = 1; -int iG; +uint iG; for (iG = 0; iG < gratorCount; iG++) { int gratorRowCount = slCount(gratorData[iG].rowList); if (gratorRowCount > maxRows) maxRows = gratorRowCount; } // Print out enough rows to make sure that all grator rows are included. int iR; for (iR = 0; iR < maxRows; iR++) { - printColumns(self, primaryData->streamer, primaryData->rowList, TRUE); + printColumns(self, primaryData->streamer, 0, primaryData->rowList, TRUE); for (iG = 0; iG < gratorCount; iG++) { struct annoRow *gratorRow = slElementFromIx(gratorData[iG].rowList, iR); - printColumns(self, gratorData[iG].streamer, gratorRow, FALSE); + printColumns(self, gratorData[iG].streamer, 1+iG, gratorRow, FALSE); } fputc('\n', self->f); } } static void aftClose(struct annoFormatter **pVSelf) /* Close file handle, free self. */ { if (pVSelf == NULL) return; struct annoFormatTab *self = *(struct annoFormatTab **)pVSelf; freeMem(self->fileName); carefulClose(&(self->f)); annoFormatterFree(pVSelf); }