a1dcdda03cad5b931d17066789091a64f8dceca6 max Tue Nov 22 16:05:21 2016 -0800 adding bigDataIndex support to the table browser, refs #18420 diff --git src/hg/hgTables/vcf.c src/hg/hgTables/vcf.c index 4200a97..1c68406 100644 --- src/hg/hgTables/vcf.c +++ src/hg/hgTables/vcf.c @@ -244,32 +244,35 @@ struct region *region, *regionList = getRegions(); int maxOut = bigFileMaxOutput(); // Include the header, absolutely necessary for VCF parsing. boolean printedHeader = FALSE; // Temporary storage for row-ification: struct dyString *dyAlt = newDyString(1024); struct dyString *dyFilter = newDyString(1024); struct dyString *dyInfo = newDyString(1024); struct dyString *dyGt = newDyString(1024); struct vcfRecord *rec; for (region = regionList; region != NULL && (maxOut > 0); region = region->next) { char *fileName = vcfFileName(conn, table, region->chrom); struct vcfFile *vcff; if (isTabix) - vcff = vcfTabixFileMayOpen(fileName, region->chrom, region->start, region->end, + { + char *indexUrl = bigDataIndexFromCtOrHub(table, conn); + vcff = vcfTabixFileAndIndexMayOpen(fileName, indexUrl, region->chrom, region->start, region->end, 100, maxOut); + } else vcff = vcfFileMayOpen(fileName, region->chrom, region->start, region->end, 100, maxOut, TRUE); if (vcff == NULL) noWarnAbort(); // If we are outputting all fields, but this VCF has no genotype info, omit the // genotype columns from output: if (allFields && vcff->genotypeCount == 0) fieldCount = VCFDATALINE_NUM_COLS - 2; if (!printedHeader) { fprintf(f, "%s", vcff->headerString); if (filter) fprintf(f, "# Filtering on %d columns\n", slCount(filter->columnList)); if (!allFields) @@ -305,39 +308,39 @@ } } vcfFileFree(&vcff); freeMem(fileName); } if (maxOut == 0) errAbort("Reached output limit of %d data values, please make region smaller,\n\tor set a higher output line limit with the filter settings.", bigFileMaxOutput()); /* Clean up and exit. */ dyStringFree(&dyAlt); dyStringFree(&dyFilter); dyStringFree(&dyInfo); dyStringFree(&dyGt); hashFree(&fieldHash); freeMem(fieldArray); freeMem(columnArray); } -static void addFilteredBedsOnRegion(char *fileName, struct region *region, char *table, +static void addFilteredBedsOnRegion(char *fileName, char *indexUrl, struct region *region, char *table, struct asFilter *filter, struct lm *bedLm, struct bed **pBedList, struct hash *idHash, int *pMaxOut, boolean isTabix) /* Add relevant beds in reverse order to pBedList */ { struct vcfFile *vcff; if (isTabix) - vcff = vcfTabixFileMayOpen(fileName, region->chrom, region->start, region->end, + vcff = vcfTabixFileAndIndexMayOpen(fileName, indexUrl, region->chrom, region->start, region->end, 100, *pMaxOut); else vcff = vcfFileMayOpen(fileName, region->chrom, region->start, region->end, 100, *pMaxOut, TRUE); if (vcff == NULL) noWarnAbort(); struct lm *lm = lmInit(0); char *row[VCFDATALINE_NUM_COLS]; char numBuf[VCF_NUM_BUF_SIZE]; // Temporary storage for row-ification: struct dyString *dyAlt = newDyString(1024); struct dyString *dyFilter = newDyString(1024); struct dyString *dyInfo = newDyString(1024); struct dyString *dyGt = newDyString(1024); struct vcfRecord *rec; @@ -372,49 +375,52 @@ { int maxOut = bigFileMaxOutput(); /* Figure out vcf file name get column info and filter. */ struct asObject *as = vcfAsObj(); struct asFilter *filter = asFilterFromCart(cart, db, table, as); struct hash *idHash = identifierHash(db, table); /* Get beds a region at a time. */ struct bed *bedList = NULL; struct region *region; for (region = regionList; region != NULL; region = region->next) { char *fileName = vcfFileName(conn, table, region->chrom); if (fileName == NULL) continue; - addFilteredBedsOnRegion(fileName, region, table, filter, lm, &bedList, idHash, &maxOut, + char *indexUrl = bigDataIndexFromCtOrHub(table, conn); + addFilteredBedsOnRegion(fileName, indexUrl, region, table, filter, lm, &bedList, idHash, &maxOut, isTabix); freeMem(fileName); if (maxOut <= 0) { errAbort("Reached output limit of %d data values, please make region smaller,\n" "\tor set a higher output line limit with the filter settings.", bigFileMaxOutput()); } } slReverse(&bedList); return bedList; } struct slName *randomVcfIds(char *table, struct sqlConnection *conn, int count, boolean isTabix) /* Return some semi-random IDs from a VCF file. */ { /* Read 10000 items from vcf file, or if they ask for a big list, then 4x what they ask for. */ char *fileName = vcfFileName(conn, table, hDefaultChrom(database)); -struct lineFile *lf = isTabix ? lineFileTabixMayOpen(fileName, TRUE) : +char *indexUrl = bigDataIndexFromCtOrHub(table, conn); + +struct lineFile *lf = isTabix ? lineFileTabixAndIndexMayOpen(fileName, indexUrl, TRUE) : lineFileMayOpen(fileName, TRUE); if (lf == NULL) noWarnAbort(); int orderedCount = count * 4; if (orderedCount < 100) orderedCount = 100; struct slName *idList = NULL; char *words[4]; int i; for (i = 0; i < orderedCount && lineFileChop(lf, words); i++) { // compress runs of identical ID, in case most are placeholder if (i == 0 || !sameString(words[2], idList->name)) slAddHead(&idList, slNameNew(words[2])); } @@ -459,31 +465,32 @@ struct asColumn *col; int colCount = 0; for (col = as->columnList; col != NULL; col = col->next) { hPrintf("<TR><TD><TT>%s</TT></TD>", col->name); hPrintf("<TD>%s</TD></TR>", col->comment); ++colCount; } hTableEnd(); /* Put up another section with sample rows. */ webNewSection("Sample Rows"); hTableStart(); /* Fetch sample rows. */ -struct lineFile *lf = isTabix ? lineFileTabixMayOpen(fileName, TRUE) : +char *indexUrl = bigDataIndexFromCtOrHub(table, conn); +struct lineFile *lf = isTabix ? lineFileTabixAndIndexMayOpen(fileName, indexUrl, TRUE) : lineFileMayOpen(fileName, TRUE); if (lf == NULL) noWarnAbort(); char *row[VCF_MAX_SCHEMA_COLS]; int i; for (i = 0; i < 10; i++) { int colCount = lineFileChop(lf, row); int colIx; if (i == 0) { // Print field names as column headers, using colCount to compute genotype span hPrintf("<TR>"); for (colIx = 0, col = as->columnList; col != NULL && colIx < colCount; colIx++, col = col->next)