7a0635be123f35cc0b5063af03911a70ecbc9841 angie Fri Jun 1 17:30:41 2012 -0700 Track #7964 (1000 Genomes Phase 1 Variant Calls): adding trackDb entryand code support for database table that has per-chromosome fileNames because 1000 Genomes provides enormous per-chromosome files. This is very similar to code support for per-chromosome BAM. To-do: refactor out bam-specific code and trackDbCustom.c's tdbBigFileName to use bbiNameFromSettingOrTable(Chrom). diff --git src/hg/hgTables/vcf.c src/hg/hgTables/vcf.c index bc7148d..8b23909 100644 --- src/hg/hgTables/vcf.c +++ src/hg/hgTables/vcf.c @@ -17,40 +17,30 @@ #include "udc.h" #endif//def USE_TABIX && KNETFILE_HOOKS #define VCFDATALINE_NUM_COLS 10 boolean isVcfTable(char *table) /* Return TRUE if table corresponds to a VCF file. */ { struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table); if (tdb) return tdbIsVcf(tdb); else return trackIsType(database, table, curTrack, "vcfTabix", ctLookupName); } -char *vcfFileName(char *table, struct sqlConnection *conn, char *seqName) -/* Return file name associated with VCF. This handles differences whether it's - * a custom or built-in track. Do a freeMem on returned string when done. */ -{ -char *fileName = bigFileNameFromCtOrHub(table, conn); -if (fileName == NULL) - fileName = bamFileNameFromTable(conn, table, seqName); -return fileName; -} - struct hTableInfo *vcfToHti(char *table) /* Get standard fields of VCF into hti structure. */ { struct hTableInfo *hti; AllocVar(hti); hti->rootName = cloneString(table); hti->isPos= TRUE; strcpy(hti->chromField, "chrom"); strcpy(hti->startField, "pos"); strcpy(hti->nameField, "id"); hti->type = cloneString("vcfTabix"); return hti; } struct slName *vcfGetFields(char *table) @@ -221,41 +211,42 @@ } // If we are outputting a subset of fields, invalidate the VCF header. boolean allFields = (fieldCount == VCFDATALINE_NUM_COLS); if (!allFields) fprintf(f, "# Only selected columns are included below; output is not valid VCF.\n"); struct asObject *as = vcfAsObj(); struct asFilter *filter = NULL; if (anyFilter()) filter = asFilterFromCart(cart, db, table, as); /* Loop through outputting each region */ struct region *region, *regionList = getRegions(); int maxOut = bigFileMaxOutput(); +struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table); // Include the header, absolutely necessary for VCF parsing. boolean printedHeader = FALSE; // Temporary storage for row-ification: struct dyString *dyAlt = newDyString(1024); struct dyString *dyFilter = newDyString(1024); struct dyString *dyInfo = newDyString(1024); struct dyString *dyGt = newDyString(1024); struct vcfRecord *rec; for (region = regionList; region != NULL && (maxOut > 0); region = region->next) { - char *fileName = vcfFileName(table, conn, region->chrom); + char *fileName = bbiNameFromSettingOrTableChrom(tdb, conn, table, region->chrom); struct vcfFile *vcff = vcfTabixFileMayOpen(fileName, region->chrom, region->start, region->end, 100, maxOut); if (vcff == NULL) noWarnAbort(); // If we are outputting all fields, but this VCF has no genotype info, omit the // genotype columns from output: if (allFields && vcff->genotypeCount == 0) fieldCount = VCFDATALINE_NUM_COLS - 2; if (!printedHeader) { fprintf(f, "%s", vcff->headerString); if (filter) fprintf(f, "# Filtering on %d columns\n", slCount(filter->columnList)); if (!allFields) { @@ -344,53 +335,55 @@ vcfFileFree(&vcff); } struct bed *vcfGetFilteredBedsOnRegions(struct sqlConnection *conn, char *db, char *table, struct region *regionList, struct lm *lm, int *retFieldCount) /* Get list of beds from VCF, in all regions, that pass filtering. */ { int maxOut = bigFileMaxOutput(); /* Figure out vcf file name get column info and filter. */ struct asObject *as = vcfAsObj(); struct asFilter *filter = asFilterFromCart(cart, db, table, as); struct hash *idHash = identifierHash(db, table); /* Get beds a region at a time. */ +struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table); struct bed *bedList = NULL; struct region *region; for (region = regionList; region != NULL; region = region->next) { - char *fileName = vcfFileName(table, conn, region->chrom); + char *fileName = bbiNameFromSettingOrTableChrom(tdb, conn, table, region->chrom); addFilteredBedsOnRegion(fileName, region, table, filter, lm, &bedList, idHash, &maxOut); freeMem(fileName); if (maxOut <= 0) { warn("Reached output limit of %d data values, please make region smaller,\n" "\tor set a higher output line limit with the filter settings.", bigFileMaxOutput()); break; } } slReverse(&bedList); return bedList; } struct slName *randomVcfIds(char *table, struct sqlConnection *conn, int count) /* Return some semi-random IDs from a VCF file. */ { /* Read 10000 items from vcf file, or if they ask for a big list, then 4x what they ask for. */ -char *fileName = vcfFileName(table, conn, NULL); +struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table); +char *fileName = bbiNameFromSettingOrTableChrom(tdb, conn, table, hDefaultChrom(database)); struct lineFile *lf = lineFileTabixMayOpen(fileName, TRUE); if (lf == NULL) noWarnAbort(); int orderedCount = count * 4; if (orderedCount < 10000) orderedCount = 10000; struct slName *idList = NULL; char *words[4]; int i; for (i = 0; i < orderedCount && lineFileChop(lf, words); i++) slAddHead(&idList, slNameNew(words[2])); lineFileClose(&lf); /* Shuffle list and trim it to count if necessary. */ shuffleList(&idList, 1); struct slName *sl; @@ -399,32 +392,33 @@ if (i+1 >= count) { slNameFreeList(&(sl->next)); break; } } freez(&fileName); return idList; } #define VCF_MAX_SCHEMA_COLS 20 void showSchemaVcf(char *table) /* Show schema on vcf. */ { +struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table); struct sqlConnection *conn = hAllocConn(database); -char *fileName = vcfFileName(table, conn, NULL); +char *fileName = bbiNameFromSettingOrTableChrom(tdb, conn, table, hDefaultChrom(database)); struct asObject *as = vcfAsObj(); hPrintf("<B>Database:</B> %s", database); hPrintf(" <B>Primary Table:</B> %s<br>", table); hPrintf("<B>VCF File:</B> %s", fileName); hPrintf("<BR>\n"); hPrintf("<B>Format description:</B> %s<BR>", as->comment); hPrintf("See the <A HREF=\"%s\" target=_blank>Variant Call Format specification</A> for more details<BR>\n", "http://www.1000genomes.org/wiki/analysis/vcf4.0"); /* Put up table that describes fields. */ hTableStart(); hPrintf("<TR><TH>field</TH>"); hPrintf("<TH>description</TH> "); puts("</TR>\n");