8a0946dd6870f10cde056ba243f1fb4ec1fd16b4 angie Thu Feb 27 11:58:33 2014 -0800 Adding support for plain VCF custom tracks (as opposed to VCF+tabix),since users seem to want to upload VCF, and as long as the file is not too big it will work OK. This means adding a new track type vcf (as opposed to vcfTabix) and supporting it in hgTracks, hgTrackUi, hgc, hgTables and hgVai. (and others I've forgotten?) refs #12416 diff --git src/hg/hgTables/vcf.c src/hg/hgTables/vcf.c index 524fe02..d1e4b1e 100644 --- src/hg/hgTables/vcf.c +++ src/hg/hgTables/vcf.c @@ -7,55 +7,67 @@ #include "asParse.h" #include "hgBam.h" #include "linefile.h" #include "localmem.h" #include "obscure.h" #include "vcf.h" #include "web.h" #if (defined USE_TABIX && defined KNETFILE_HOOKS) #include "knetUdc.h" #include "udc.h" #endif//def USE_TABIX && KNETFILE_HOOKS #define VCFDATALINE_NUM_COLS 10 -boolean isVcfTable(char *table) -/* Return TRUE if table corresponds to a VCF file. */ +boolean isVcfTable(char *table, boolean *retIsTabix) +/* Return TRUE if table corresponds to a VCF file. + * If retIsTabix is non-NULL, set *retIsTabix to TRUE if this is vcfTabix (not just vcf). */ { +boolean isVcfTabix = FALSE, isVcf = FALSE; struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table); if (tdb) - return tdbIsVcf(tdb); + { + isVcfTabix = startsWithWord("vcfTabix", tdb->type); + isVcf = startsWithWord("vcf", tdb->type); + } else - return trackIsType(database, table, curTrack, "vcfTabix", ctLookupName); + { + isVcfTabix = trackIsType(database, table, curTrack, "vcfTabix", ctLookupName); + if (!isVcfTabix) + isVcf = trackIsType(database, table, curTrack, "vcf", ctLookupName); + } +if (retIsTabix) + *retIsTabix = isVcfTabix; +return (isVcfTabix || isVcf); } -struct hTableInfo *vcfToHti(char *table) +struct hTableInfo *vcfToHti(char *table, boolean isTabix) /* Get standard fields of VCF into hti structure. */ { struct hTableInfo *hti; AllocVar(hti); hti->rootName = cloneString(table); hti->isPos= TRUE; strcpy(hti->chromField, "chrom"); strcpy(hti->startField, "pos"); strcpy(hti->nameField, "id"); -hti->type = cloneString("vcfTabix"); +hti->type = cloneString(isTabix ? "vcfTabix" : "vcf"); return hti; } -struct slName *vcfGetFields(char *table) +struct slName *vcfGetFields() /* Get fields of vcf as simple name list. */ { struct asObject *as = vcfAsObj(); struct slName *names = asColNames(as); return names; } struct sqlFieldType *vcfListFieldsAndTypes() /* Get fields of bigBed as list of sqlFieldType. */ { struct asObject *as = vcfAsObj(); struct sqlFieldType *list = sqlFieldTypesFromAs(as); return list; } @@ -155,56 +167,75 @@ row[5] = rec->qual; dyJoin(dyFilter, ";", rec->filters, rec->filterCount); row[6] = dyFilter->string; vcfInfoElsToString(dyInfo, vcff, rec); row[7] = dyInfo->string; if (vcff->genotypeCount > 0) { row[8] = rec->format; dyJoin(dyGt, "\t", rec->genotypeUnparsedStrings, vcff->genotypeCount); row[9] = dyGt->string; } else row[8] = row[9] = ""; // compatible with localmem usage } -void vcfTabOut(char *db, char *table, struct sqlConnection *conn, char *fields, FILE *f) +static char *vcfFileName(struct trackDb *tdb, struct sqlConnection *conn, char *table, char *chrom) +// Look up the vcf or vcfTabix file name, using CUSTOM_TRASH if necessary. +{ +boolean isCt = isCustomTrack(table); +char *dbTable = table; +struct sqlConnection *dbConn = conn; +if (isCt) + { + dbConn = hAllocConn(CUSTOM_TRASH); + struct customTrack *ct = ctLookupName(table); + dbTable = ct->dbTableName; + } +char *fileName = bbiNameFromSettingOrTableChrom(tdb, dbConn, dbTable, chrom); +if (isCt) + hFreeConn(&dbConn); +return fileName; +} + +void vcfTabOut(char *db, char *table, struct sqlConnection *conn, char *fields, FILE *f, + boolean isTabix) /* Print out selected fields from VCF. If fields is NULL, then print out all fields. */ { struct hTableInfo *hti = NULL; hti = getHti(db, table, conn); struct hash *idHash = NULL; char *idField = getIdField(db, curTrack, table, hti); int idFieldNum = 0; /* if we know what field to use for the identifiers, get the hash of names */ if (idField != NULL) idHash = identifierHash(db, table); if (f == NULL) f = stdout; /* Convert comma separated list of fields to array. */ int fieldCount = chopByChar(fields, ',', NULL, 0); char **fieldArray; AllocArray(fieldArray, fieldCount); chopByChar(fields, ',', fieldArray, fieldCount); /* Get list of all fields in big bed and turn it into a hash of column indexes keyed by * column name. */ struct hash *fieldHash = hashNew(0); -struct slName *bb, *bbList = vcfGetFields(table); +struct slName *bb, *bbList = vcfGetFields(); int i; for (bb = bbList, i=0; bb != NULL; bb = bb->next, ++i) { /* if we know the field for identifiers, save it away */ if ((idField != NULL) && sameString(idField, bb->name)) idFieldNum = i; hashAddInt(fieldHash, bb->name, i); } /* Create an array of column indexes corresponding to the selected field list. */ int *columnArray; AllocArray(columnArray, fieldCount); for (i=0; i 0); region = region->next) { - char *fileName = bbiNameFromSettingOrTableChrom(tdb, conn, table, region->chrom); - struct vcfFile *vcff = vcfTabixFileMayOpen(fileName, region->chrom, region->start, region->end, + char *fileName = vcfFileName(tdb, conn, table, region->chrom); + struct vcfFile *vcff; + if (isTabix) + vcff = vcfTabixFileMayOpen(fileName, region->chrom, region->start, region->end, 100, maxOut); + else + vcff = vcfFileMayOpen(fileName, region->chrom, region->start, region->end, + 100, maxOut, TRUE); if (vcff == NULL) noWarnAbort(); // If we are outputting all fields, but this VCF has no genotype info, omit the // genotype columns from output: if (allFields && vcff->genotypeCount == 0) fieldCount = VCFDATALINE_NUM_COLS - 2; if (!printedHeader) { fprintf(f, "%s", vcff->headerString); if (filter) fprintf(f, "# Filtering on %d columns\n", slCount(filter->columnList)); if (!allFields) { fprintf(f, "#%s", fieldArray[0]); for (i=1; ichrom, region->start, region->end, +struct vcfFile *vcff; +if (isTabix) + vcff = vcfTabixFileMayOpen(fileName, region->chrom, region->start, region->end, 100, *pMaxOut); +else + vcff = vcfFileMayOpen(fileName, region->chrom, region->start, region->end, + 100, *pMaxOut, TRUE); if (vcff == NULL) noWarnAbort(); struct lm *lm = lmInit(0); char *row[VCFDATALINE_NUM_COLS]; char numBuf[VCF_NUM_BUF_SIZE]; // Temporary storage for row-ification: struct dyString *dyAlt = newDyString(1024); struct dyString *dyFilter = newDyString(1024); struct dyString *dyInfo = newDyString(1024); struct dyString *dyGt = newDyString(1024); struct vcfRecord *rec; for (rec = vcff->records; rec != NULL; rec = rec->next) { vcfRecordToRow(rec, region->chrom, numBuf, dyAlt, dyFilter, dyInfo, dyGt, row); if (asFilterOnRow(filter, row)) @@ -325,68 +367,70 @@ bed->chromEnd = rec->chromEnd; bed->name = lmCloneString(bedLm, rec->name); slAddHead(pBedList, bed); } (*pMaxOut)--; if (*pMaxOut <= 0) break; } dyStringFree(&dyAlt); dyStringFree(&dyFilter); dyStringFree(&dyInfo); dyStringFree(&dyGt); lmCleanup(&lm); vcfFileFree(&vcff); } struct bed *vcfGetFilteredBedsOnRegions(struct sqlConnection *conn, char *db, char *table, struct region *regionList, struct lm *lm, - int *retFieldCount) + int *retFieldCount, boolean isTabix) /* Get list of beds from VCF, in all regions, that pass filtering. */ { int maxOut = bigFileMaxOutput(); /* Figure out vcf file name get column info and filter. */ struct asObject *as = vcfAsObj(); struct asFilter *filter = asFilterFromCart(cart, db, table, as); struct hash *idHash = identifierHash(db, table); /* Get beds a region at a time. */ struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table); struct bed *bedList = NULL; struct region *region; for (region = regionList; region != NULL; region = region->next) { - char *fileName = bbiNameFromSettingOrTableChrom(tdb, conn, table, region->chrom); + char *fileName = vcfFileName(tdb, conn, table, region->chrom); if (fileName == NULL) continue; - addFilteredBedsOnRegion(fileName, region, table, filter, lm, &bedList, idHash, &maxOut); + addFilteredBedsOnRegion(fileName, region, table, filter, lm, &bedList, idHash, &maxOut, + isTabix); freeMem(fileName); if (maxOut <= 0) { warn("Reached output limit of %d data values, please make region smaller,\n" "\tor set a higher output line limit with the filter settings.", bigFileMaxOutput()); break; } } slReverse(&bedList); return bedList; } -struct slName *randomVcfIds(char *table, struct sqlConnection *conn, int count) +struct slName *randomVcfIds(char *table, struct sqlConnection *conn, int count, boolean isTabix) /* Return some semi-random IDs from a VCF file. */ { /* Read 10000 items from vcf file, or if they ask for a big list, then 4x what they ask for. */ struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table); -char *fileName = bbiNameFromSettingOrTableChrom(tdb, conn, table, hDefaultChrom(database)); -struct lineFile *lf = lineFileTabixMayOpen(fileName, TRUE); +char *fileName = vcfFileName(tdb, conn, table, hDefaultChrom(database)); +struct lineFile *lf = isTabix ? lineFileTabixMayOpen(fileName, TRUE) : + lineFileMayOpen(fileName, TRUE); if (lf == NULL) noWarnAbort(); int orderedCount = count * 4; if (orderedCount < 100) orderedCount = 100; struct slName *idList = NULL; char *words[4]; int i; for (i = 0; i < orderedCount && lineFileChop(lf, words); i++) { // compress runs of identical ID, in case most are placeholder if (i == 0 || !sameString(words[2], idList->name)) slAddHead(&idList, slNameNew(words[2])); } lineFileClose(&lf); @@ -395,66 +439,67 @@ struct slName *sl; for (sl = idList, i = 0; sl != NULL; sl = sl->next, i++) { if (i+1 >= count) { slNameFreeList(&(sl->next)); break; } } freez(&fileName); return idList; } #define VCF_MAX_SCHEMA_COLS 20 -void showSchemaVcf(char *table, struct trackDb *tdb) +void showSchemaVcf(char *table, struct trackDb *tdb, boolean isTabix) /* Show schema on vcf. */ { struct sqlConnection *conn = hAllocConn(database); -char *fileName = bbiNameFromSettingOrTableChrom(tdb, conn, table, hDefaultChrom(database)); +char *fileName = vcfFileName(tdb, conn, table, hDefaultChrom(database)); struct asObject *as = vcfAsObj(); hPrintf("Database: %s", database); hPrintf("    Primary Table: %s
", table); hPrintf("VCF File: %s", fileName); hPrintf("
\n"); hPrintf("Format description: %s
", as->comment); hPrintf("See the Variant Call Format specification for more details
\n", "http://www.1000genomes.org/wiki/analysis/vcf4.0"); /* Put up table that describes fields. */ hTableStart(); hPrintf("field"); hPrintf("description "); puts("\n"); struct asColumn *col; int colCount = 0; for (col = as->columnList; col != NULL; col = col->next) { hPrintf("%s", col->name); hPrintf("%s", col->comment); ++colCount; } hTableEnd(); /* Put up another section with sample rows. */ webNewSection("Sample Rows"); hTableStart(); /* Fetch sample rows. */ -struct lineFile *lf = lineFileTabixMayOpen(fileName, TRUE); +struct lineFile *lf = isTabix ? lineFileTabixMayOpen(fileName, TRUE) : + lineFileMayOpen(fileName, TRUE); if (lf == NULL) noWarnAbort(); char *row[VCF_MAX_SCHEMA_COLS]; int i; for (i = 0; i < 10; i++) { int colCount = lineFileChop(lf, row); int colIx; if (i == 0) { // Print field names as column headers, using colCount to compute genotype span hPrintf(""); for (colIx = 0, col = as->columnList; col != NULL && colIx < colCount; colIx++, col = col->next) {