8a0946dd6870f10cde056ba243f1fb4ec1fd16b4
angie
  Thu Feb 27 11:58:33 2014 -0800
Adding support for plain VCF custom tracks (as opposed to VCF+tabix),since users seem to want to upload VCF, and as long as the file is
not too big it will work OK.  This means adding a new track type
vcf (as opposed to vcfTabix) and supporting it in hgTracks, hgTrackUi,
hgc, hgTables and hgVai.  (and others I've forgotten?)
refs #12416

diff --git src/hg/hgTables/vcf.c src/hg/hgTables/vcf.c
index 524fe02..d1e4b1e 100644
--- src/hg/hgTables/vcf.c
+++ src/hg/hgTables/vcf.c
@@ -7,55 +7,67 @@
 #include "asParse.h"
 #include "hgBam.h"
 #include "linefile.h"
 #include "localmem.h"
 #include "obscure.h"
 #include "vcf.h"
 #include "web.h"
 
 #if (defined USE_TABIX && defined KNETFILE_HOOKS)
 #include "knetUdc.h"
 #include "udc.h"
 #endif//def USE_TABIX && KNETFILE_HOOKS
 
 #define VCFDATALINE_NUM_COLS 10
 
-boolean isVcfTable(char *table)
-/* Return TRUE if table corresponds to a VCF file. */
+boolean isVcfTable(char *table, boolean *retIsTabix)
+/* Return TRUE if table corresponds to a VCF file. 
+ * If retIsTabix is non-NULL, set *retIsTabix to TRUE if this is vcfTabix (not just vcf). */
 {
+boolean isVcfTabix = FALSE, isVcf = FALSE;
 struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table);
 if (tdb)
-    return tdbIsVcf(tdb);
+    {
+    isVcfTabix = startsWithWord("vcfTabix", tdb->type);
+    isVcf = startsWithWord("vcf", tdb->type);
+    }
 else
-    return trackIsType(database, table, curTrack, "vcfTabix", ctLookupName);
+    {
+    isVcfTabix = trackIsType(database, table, curTrack, "vcfTabix", ctLookupName);
+    if (!isVcfTabix)
+	isVcf = trackIsType(database, table, curTrack, "vcf", ctLookupName);
+    }
+if (retIsTabix)
+    *retIsTabix = isVcfTabix;
+return (isVcfTabix || isVcf);
 }
 
-struct hTableInfo *vcfToHti(char *table)
+struct hTableInfo *vcfToHti(char *table, boolean isTabix)
 /* Get standard fields of VCF into hti structure. */
 {
 struct hTableInfo *hti;
 AllocVar(hti);
 hti->rootName = cloneString(table);
 hti->isPos= TRUE;
 strcpy(hti->chromField, "chrom");
 strcpy(hti->startField, "pos");
 strcpy(hti->nameField, "id");
-hti->type = cloneString("vcfTabix");
+hti->type = cloneString(isTabix ? "vcfTabix" : "vcf");
 return hti;
 }
 
-struct slName *vcfGetFields(char *table)
+struct slName *vcfGetFields()
 /* Get fields of vcf as simple name list. */
 {
 struct asObject *as = vcfAsObj();
 struct slName *names = asColNames(as);
 return names;
 }
 
 struct sqlFieldType *vcfListFieldsAndTypes()
 /* Get fields of bigBed as list of sqlFieldType. */
 {
 struct asObject *as = vcfAsObj();
 struct sqlFieldType *list = sqlFieldTypesFromAs(as);
 return list;
 }
 
@@ -155,56 +167,75 @@
 row[5] = rec->qual;
 dyJoin(dyFilter, ";", rec->filters, rec->filterCount);
 row[6] = dyFilter->string;
 vcfInfoElsToString(dyInfo, vcff, rec);
 row[7] = dyInfo->string;
 if (vcff->genotypeCount > 0)
     {
     row[8] = rec->format;
     dyJoin(dyGt, "\t", rec->genotypeUnparsedStrings, vcff->genotypeCount);
     row[9] = dyGt->string;
     }
 else
     row[8] = row[9] = ""; // compatible with localmem usage
 }
 
-void vcfTabOut(char *db, char *table, struct sqlConnection *conn, char *fields, FILE *f)
+static char *vcfFileName(struct trackDb *tdb, struct sqlConnection *conn, char *table, char *chrom)
+// Look up the vcf or vcfTabix file name, using CUSTOM_TRASH if necessary.
+{
+boolean isCt = isCustomTrack(table);
+char *dbTable = table;
+struct sqlConnection *dbConn = conn;
+if (isCt)
+    {
+    dbConn = hAllocConn(CUSTOM_TRASH);
+    struct customTrack *ct = ctLookupName(table);
+    dbTable = ct->dbTableName;
+    }
+char *fileName = bbiNameFromSettingOrTableChrom(tdb, dbConn, dbTable, chrom);
+if (isCt)
+    hFreeConn(&dbConn);
+return fileName;
+}
+
+void vcfTabOut(char *db, char *table, struct sqlConnection *conn, char *fields, FILE *f,
+	       boolean isTabix)
 /* Print out selected fields from VCF.  If fields is NULL, then print out all fields. */
 {
 struct hTableInfo *hti = NULL;
 hti = getHti(db, table, conn);
 struct hash *idHash = NULL;
 char *idField = getIdField(db, curTrack, table, hti);
 int idFieldNum = 0;
 
 /* if we know what field to use for the identifiers, get the hash of names */
 if (idField != NULL)
     idHash = identifierHash(db, table);
 
 if (f == NULL)
     f = stdout;
 
 /* Convert comma separated list of fields to array. */
 int fieldCount = chopByChar(fields, ',', NULL, 0);
 char **fieldArray;
 AllocArray(fieldArray, fieldCount);
 chopByChar(fields, ',', fieldArray, fieldCount);
 
 /* Get list of all fields in big bed and turn it into a hash of column indexes keyed by
  * column name. */
 struct hash *fieldHash = hashNew(0);
-struct slName *bb, *bbList = vcfGetFields(table);
+struct slName *bb, *bbList = vcfGetFields();
 int i;
 for (bb = bbList, i=0; bb != NULL; bb = bb->next, ++i)
     {
     /* if we know the field for identifiers, save it away */
     if ((idField != NULL) && sameString(idField, bb->name))
 	idFieldNum = i;
     hashAddInt(fieldHash, bb->name, i);
     }
 
 /* Create an array of column indexes corresponding to the selected field list. */
 int *columnArray;
 AllocArray(columnArray, fieldCount);
 for (i=0; i<fieldCount; ++i)
     {
     columnArray[i] = hashIntVal(fieldHash, fieldArray[i]);
@@ -222,33 +253,38 @@
 
 /* Loop through outputting each region */
 struct region *region, *regionList = getRegions();
 int maxOut = bigFileMaxOutput();
 struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table);
 // Include the header, absolutely necessary for VCF parsing.
 boolean printedHeader = FALSE;
 // Temporary storage for row-ification:
 struct dyString *dyAlt = newDyString(1024);
 struct dyString *dyFilter = newDyString(1024);
 struct dyString *dyInfo = newDyString(1024);
 struct dyString *dyGt = newDyString(1024);
 struct vcfRecord *rec;
 for (region = regionList; region != NULL && (maxOut > 0); region = region->next)
     {
-    char *fileName = bbiNameFromSettingOrTableChrom(tdb, conn, table, region->chrom);
-    struct vcfFile *vcff = vcfTabixFileMayOpen(fileName, region->chrom, region->start, region->end,
+    char *fileName = vcfFileName(tdb, conn, table, region->chrom);
+    struct vcfFile *vcff;
+    if (isTabix)
+	vcff = vcfTabixFileMayOpen(fileName, region->chrom, region->start, region->end,
 				   100, maxOut);
+    else
+	vcff = vcfFileMayOpen(fileName, region->chrom, region->start, region->end,
+			      100, maxOut, TRUE);
     if (vcff == NULL)
 	noWarnAbort();
     // If we are outputting all fields, but this VCF has no genotype info, omit the
     // genotype columns from output:
     if (allFields && vcff->genotypeCount == 0)
 	fieldCount = VCFDATALINE_NUM_COLS - 2;
     if (!printedHeader)
 	{
 	fprintf(f, "%s", vcff->headerString);
 	if (filter)
 	    fprintf(f, "# Filtering on %d columns\n", slCount(filter->columnList));
 	if (!allFields)
 	    {
 	    fprintf(f, "#%s", fieldArray[0]);
 	    for (i=1; i<fieldCount; ++i)
@@ -283,35 +319,41 @@
     vcfFileFree(&vcff);
     freeMem(fileName);
     }
 
 if (maxOut == 0)
     warn("Reached output limit of %d data values, please make region smaller,\n\tor set a higher output line limit with the filter settings.", bigFileMaxOutput());
 /* Clean up and exit. */
 dyStringFree(&dyAlt);  dyStringFree(&dyFilter);  dyStringFree(&dyInfo);  dyStringFree(&dyGt);
 hashFree(&fieldHash);
 freeMem(fieldArray);
 freeMem(columnArray);
 }
 
 static void addFilteredBedsOnRegion(char *fileName, struct region *region, char *table,
 				    struct asFilter *filter, struct lm *bedLm,
-				    struct bed **pBedList, struct hash *idHash, int *pMaxOut)
+				    struct bed **pBedList, struct hash *idHash, int *pMaxOut,
+				    boolean isTabix)
 /* Add relevant beds in reverse order to pBedList */
 {
-struct vcfFile *vcff = vcfTabixFileMayOpen(fileName, region->chrom, region->start, region->end,
+struct vcfFile *vcff;
+if (isTabix)
+    vcff = vcfTabixFileMayOpen(fileName, region->chrom, region->start, region->end,
 			       100, *pMaxOut);
+else
+    vcff = vcfFileMayOpen(fileName, region->chrom, region->start, region->end,
+			  100, *pMaxOut, TRUE);
 if (vcff == NULL)
     noWarnAbort();
 struct lm *lm = lmInit(0);
 char *row[VCFDATALINE_NUM_COLS];
 char numBuf[VCF_NUM_BUF_SIZE];
 // Temporary storage for row-ification:
 struct dyString *dyAlt = newDyString(1024);
 struct dyString *dyFilter = newDyString(1024);
 struct dyString *dyInfo = newDyString(1024);
 struct dyString *dyGt = newDyString(1024);
 struct vcfRecord *rec;
 for (rec = vcff->records;  rec != NULL;  rec = rec->next)
     {
     vcfRecordToRow(rec, region->chrom, numBuf, dyAlt, dyFilter, dyInfo, dyGt, row);
     if (asFilterOnRow(filter, row))
@@ -325,68 +367,70 @@
 	bed->chromEnd = rec->chromEnd;
 	bed->name = lmCloneString(bedLm, rec->name);
 	slAddHead(pBedList, bed);
 	}
     (*pMaxOut)--;
     if (*pMaxOut <= 0)
 	break;
     }
 dyStringFree(&dyAlt);  dyStringFree(&dyFilter);  dyStringFree(&dyInfo);  dyStringFree(&dyGt);
 lmCleanup(&lm);
 vcfFileFree(&vcff);
 }
 
 struct bed *vcfGetFilteredBedsOnRegions(struct sqlConnection *conn,
 	char *db, char *table, struct region *regionList, struct lm *lm,
-	int *retFieldCount)
+	int *retFieldCount, boolean isTabix)
 /* Get list of beds from VCF, in all regions, that pass filtering. */
 {
 int maxOut = bigFileMaxOutput();
 /* Figure out vcf file name get column info and filter. */
 struct asObject *as = vcfAsObj();
 struct asFilter *filter = asFilterFromCart(cart, db, table, as);
 struct hash *idHash = identifierHash(db, table);
 
 /* Get beds a region at a time. */
 struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table);
 struct bed *bedList = NULL;
 struct region *region;
 for (region = regionList; region != NULL; region = region->next)
     {
-    char *fileName = bbiNameFromSettingOrTableChrom(tdb, conn, table, region->chrom);
+    char *fileName = vcfFileName(tdb, conn, table, region->chrom);
     if (fileName == NULL)
 	continue;
-    addFilteredBedsOnRegion(fileName, region, table, filter, lm, &bedList, idHash, &maxOut);
+    addFilteredBedsOnRegion(fileName, region, table, filter, lm, &bedList, idHash, &maxOut,
+			    isTabix);
     freeMem(fileName);
     if (maxOut <= 0)
 	{
 	warn("Reached output limit of %d data values, please make region smaller,\n"
 	     "\tor set a higher output line limit with the filter settings.", bigFileMaxOutput());
 	break;
 	}
     }
 slReverse(&bedList);
 return bedList;
 }
 
-struct slName *randomVcfIds(char *table, struct sqlConnection *conn, int count)
+struct slName *randomVcfIds(char *table, struct sqlConnection *conn, int count, boolean isTabix)
 /* Return some semi-random IDs from a VCF file. */
 {
 /* Read 10000 items from vcf file,  or if they ask for a big list, then 4x what they ask for. */
 struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table);
-char *fileName = bbiNameFromSettingOrTableChrom(tdb, conn, table, hDefaultChrom(database));
-struct lineFile *lf = lineFileTabixMayOpen(fileName, TRUE);
+char *fileName = vcfFileName(tdb, conn, table, hDefaultChrom(database));
+struct lineFile *lf = isTabix ? lineFileTabixMayOpen(fileName, TRUE) :
+				lineFileMayOpen(fileName, TRUE);
 if (lf == NULL)
     noWarnAbort();
 int orderedCount = count * 4;
 if (orderedCount < 100)
     orderedCount = 100;
 struct slName *idList = NULL;
 char *words[4];
 int i;
 for (i = 0;  i < orderedCount && lineFileChop(lf, words); i++)
     {
     // compress runs of identical ID, in case most are placeholder
     if (i == 0 || !sameString(words[2], idList->name))
 	slAddHead(&idList, slNameNew(words[2]));
     }
 lineFileClose(&lf);
@@ -395,66 +439,67 @@
 struct slName *sl;
 for (sl = idList, i = 0; sl != NULL; sl = sl->next, i++)
     {
     if (i+1 >= count)
 	{
 	slNameFreeList(&(sl->next));
 	break;
 	}
     }
 freez(&fileName);
 return idList;
 }
 
 #define VCF_MAX_SCHEMA_COLS 20
 
-void showSchemaVcf(char *table, struct trackDb *tdb)
+void showSchemaVcf(char *table, struct trackDb *tdb, boolean isTabix)
 /* Show schema on vcf. */
 {
 struct sqlConnection *conn = hAllocConn(database);
-char *fileName = bbiNameFromSettingOrTableChrom(tdb, conn, table, hDefaultChrom(database));
+char *fileName = vcfFileName(tdb, conn, table, hDefaultChrom(database));
 
 struct asObject *as = vcfAsObj();
 hPrintf("<B>Database:</B> %s", database);
 hPrintf("&nbsp;&nbsp;&nbsp;&nbsp;<B>Primary Table:</B> %s<br>", table);
 hPrintf("<B>VCF File:</B> %s", fileName);
 hPrintf("<BR>\n");
 hPrintf("<B>Format description:</B> %s<BR>", as->comment);
 hPrintf("See the <A HREF=\"%s\" target=_blank>Variant Call Format specification</A> for  more details<BR>\n",
 	"http://www.1000genomes.org/wiki/analysis/vcf4.0");
 
 /* Put up table that describes fields. */
 hTableStart();
 hPrintf("<TR><TH>field</TH>");
 hPrintf("<TH>description</TH> ");
 puts("</TR>\n");
 struct asColumn *col;
 int colCount = 0;
 for (col = as->columnList; col != NULL; col = col->next)
     {
     hPrintf("<TR><TD><TT>%s</TT></TD>", col->name);
     hPrintf("<TD>%s</TD></TR>", col->comment);
     ++colCount;
     }
 hTableEnd();
 
 /* Put up another section with sample rows. */
 webNewSection("Sample Rows");
 hTableStart();
 
 /* Fetch sample rows. */
-struct lineFile *lf = lineFileTabixMayOpen(fileName, TRUE);
+struct lineFile *lf = isTabix ? lineFileTabixMayOpen(fileName, TRUE) :
+				lineFileMayOpen(fileName, TRUE);
 if (lf == NULL)
     noWarnAbort();
 char *row[VCF_MAX_SCHEMA_COLS];
 int i;
 for (i = 0;  i < 10;  i++)
     {
     int colCount = lineFileChop(lf, row);
     int colIx;
     if (i == 0)
 	{
 	// Print field names as column headers, using colCount to compute genotype span
 	hPrintf("<TR>");
 	for (colIx = 0, col = as->columnList; col != NULL && colIx < colCount;
 	     colIx++, col = col->next)
 	    {