7a0635be123f35cc0b5063af03911a70ecbc9841
angie
  Fri Jun 1 17:30:41 2012 -0700
Track #7964 (1000 Genomes Phase 1 Variant Calls): adding trackDb entryand code support for database table that has per-chromosome fileNames
because 1000 Genomes provides enormous per-chromosome files.
This is very similar to code support for per-chromosome BAM.
To-do: refactor out bam-specific code and trackDbCustom.c's
tdbBigFileName to use bbiNameFromSettingOrTable(Chrom).

diff --git src/hg/hgTables/vcf.c src/hg/hgTables/vcf.c
index bc7148d..8b23909 100644
--- src/hg/hgTables/vcf.c
+++ src/hg/hgTables/vcf.c
@@ -17,40 +17,30 @@
 #include "udc.h"
 #endif//def USE_TABIX && KNETFILE_HOOKS
 
 #define VCFDATALINE_NUM_COLS 10
 
 boolean isVcfTable(char *table)
 /* Return TRUE if table corresponds to a VCF file. */
 {
 struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table);
 if (tdb)
     return tdbIsVcf(tdb);
 else
     return trackIsType(database, table, curTrack, "vcfTabix", ctLookupName);
 }
 
-char *vcfFileName(char *table, struct sqlConnection *conn, char *seqName)
-/* Return file name associated with VCF.  This handles differences whether it's
- * a custom or built-in track.  Do a freeMem on returned string when done. */
-{
-char *fileName = bigFileNameFromCtOrHub(table, conn);
-if (fileName == NULL)
-    fileName = bamFileNameFromTable(conn, table, seqName);
-return fileName;
-}
-
 struct hTableInfo *vcfToHti(char *table)
 /* Get standard fields of VCF into hti structure. */
 {
 struct hTableInfo *hti;
 AllocVar(hti);
 hti->rootName = cloneString(table);
 hti->isPos= TRUE;
 strcpy(hti->chromField, "chrom");
 strcpy(hti->startField, "pos");
 strcpy(hti->nameField, "id");
 hti->type = cloneString("vcfTabix");
 return hti;
 }
 
 struct slName *vcfGetFields(char *table)
@@ -221,41 +211,42 @@
     }
 
 // If we are outputting a subset of fields, invalidate the VCF header.
 boolean allFields = (fieldCount == VCFDATALINE_NUM_COLS);
 if (!allFields)
     fprintf(f, "# Only selected columns are included below; output is not valid VCF.\n");
 
 struct asObject *as = vcfAsObj();
 struct asFilter *filter = NULL;
 if (anyFilter())
     filter = asFilterFromCart(cart, db, table, as);
 
 /* Loop through outputting each region */
 struct region *region, *regionList = getRegions();
 int maxOut = bigFileMaxOutput();
+struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table);
 // Include the header, absolutely necessary for VCF parsing.
 boolean printedHeader = FALSE;
 // Temporary storage for row-ification:
 struct dyString *dyAlt = newDyString(1024);
 struct dyString *dyFilter = newDyString(1024);
 struct dyString *dyInfo = newDyString(1024);
 struct dyString *dyGt = newDyString(1024);
 struct vcfRecord *rec;
 for (region = regionList; region != NULL && (maxOut > 0); region = region->next)
     {
-    char *fileName = vcfFileName(table, conn, region->chrom);
+    char *fileName = bbiNameFromSettingOrTableChrom(tdb, conn, table, region->chrom);
     struct vcfFile *vcff = vcfTabixFileMayOpen(fileName, region->chrom, region->start, region->end,
 					       100, maxOut);
     if (vcff == NULL)
 	noWarnAbort();
     // If we are outputting all fields, but this VCF has no genotype info, omit the
     // genotype columns from output:
     if (allFields && vcff->genotypeCount == 0)
 	fieldCount = VCFDATALINE_NUM_COLS - 2;
     if (!printedHeader)
 	{
 	fprintf(f, "%s", vcff->headerString);
 	if (filter)
 	    fprintf(f, "# Filtering on %d columns\n", slCount(filter->columnList));
 	if (!allFields)
 	    {
@@ -344,53 +335,55 @@
 vcfFileFree(&vcff);
 }
 
 struct bed *vcfGetFilteredBedsOnRegions(struct sqlConnection *conn,
 	char *db, char *table, struct region *regionList, struct lm *lm,
 	int *retFieldCount)
 /* Get list of beds from VCF, in all regions, that pass filtering. */
 {
 int maxOut = bigFileMaxOutput();
 /* Figure out vcf file name get column info and filter. */
 struct asObject *as = vcfAsObj();
 struct asFilter *filter = asFilterFromCart(cart, db, table, as);
 struct hash *idHash = identifierHash(db, table);
 
 /* Get beds a region at a time. */
+struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table);
 struct bed *bedList = NULL;
 struct region *region;
 for (region = regionList; region != NULL; region = region->next)
     {
-    char *fileName = vcfFileName(table, conn, region->chrom);
+    char *fileName = bbiNameFromSettingOrTableChrom(tdb, conn, table, region->chrom);
     addFilteredBedsOnRegion(fileName, region, table, filter, lm, &bedList, idHash, &maxOut);
     freeMem(fileName);
     if (maxOut <= 0)
 	{
 	warn("Reached output limit of %d data values, please make region smaller,\n"
 	     "\tor set a higher output line limit with the filter settings.", bigFileMaxOutput());
 	break;
 	}
     }
 slReverse(&bedList);
 return bedList;
 }
 
 struct slName *randomVcfIds(char *table, struct sqlConnection *conn, int count)
 /* Return some semi-random IDs from a VCF file. */
 {
 /* Read 10000 items from vcf file,  or if they ask for a big list, then 4x what they ask for. */
-char *fileName = vcfFileName(table, conn, NULL);
+struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table);
+char *fileName = bbiNameFromSettingOrTableChrom(tdb, conn, table, hDefaultChrom(database));
 struct lineFile *lf = lineFileTabixMayOpen(fileName, TRUE);
 if (lf == NULL)
     noWarnAbort();
 int orderedCount = count * 4;
 if (orderedCount < 10000)
     orderedCount = 10000;
 struct slName *idList = NULL;
 char *words[4];
 int i;
 for (i = 0;  i < orderedCount && lineFileChop(lf, words); i++)
     slAddHead(&idList, slNameNew(words[2]));
 lineFileClose(&lf);
 /* Shuffle list and trim it to count if necessary. */
 shuffleList(&idList, 1);
 struct slName *sl;
@@ -399,32 +392,33 @@
     if (i+1 >= count)
 	{
 	slNameFreeList(&(sl->next));
 	break;
 	}
     }
 freez(&fileName);
 return idList;
 }
 
 #define VCF_MAX_SCHEMA_COLS 20
 
 void showSchemaVcf(char *table)
 /* Show schema on vcf. */
 {
+struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table);
 struct sqlConnection *conn = hAllocConn(database);
-char *fileName = vcfFileName(table, conn, NULL);
+char *fileName = bbiNameFromSettingOrTableChrom(tdb, conn, table, hDefaultChrom(database));
 
 struct asObject *as = vcfAsObj();
 hPrintf("<B>Database:</B> %s", database);
 hPrintf("&nbsp;&nbsp;&nbsp;&nbsp;<B>Primary Table:</B> %s<br>", table);
 hPrintf("<B>VCF File:</B> %s", fileName);
 hPrintf("<BR>\n");
 hPrintf("<B>Format description:</B> %s<BR>", as->comment);
 hPrintf("See the <A HREF=\"%s\" target=_blank>Variant Call Format specification</A> for  more details<BR>\n",
 	"http://www.1000genomes.org/wiki/analysis/vcf4.0");
 
 /* Put up table that describes fields. */
 hTableStart();
 hPrintf("<TR><TH>field</TH>");
 hPrintf("<TH>description</TH> ");
 puts("</TR>\n");