70d4208efa11f262ba5591198214c9ccdc6b54ae angie Mon Aug 22 22:24:05 2011 -0700
Feature #3707 (VCF+tabix support in hgTables): Basic hgTables support for VCF, modeled after BAM code.
Tested all fields, selected fields, bed output; filter, intersection, schema on 3 flavors of vcfTabix:
ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20101123/interim_phase1_release/ALL.chr17.phase1.projectConsensus.genotypes.vcf.gz
ftp://ftp-trace.ncbi.nlm.nih.gov/1000genomes/ftp/release/20100804/AFR.dindel.20100804.sites.vcf.gz
ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/v4.0/ByChromosomeNoGeno/00-All.vcf.gz

diff --git src/hg/hgTables/vcf.c src/hg/hgTables/vcf.c
new file mode 100644
index 0000000..a5d4f48
--- /dev/null
+++ src/hg/hgTables/vcf.c
@@ -0,0 +1,492 @@
+/* vcf - stuff to handle VCF in the table browser. */
+
+#include "common.h"
+#include "hgTables.h"
+#include "asFilter.h"
+#include "hubConnect.h"
+#include "asParse.h"
+#include "hgBam.h"
+#include "linefile.h"
+#include "localmem.h"
+#include "obscure.h"
+#include "vcf.h"
+#include "web.h"
+
+#if (defined USE_TABIX && defined KNETFILE_HOOKS)
+#include "knetUdc.h"
+#include "udc.h"
+#endif//def USE_TABIX && KNETFILE_HOOKS
+
+#define VCFDATALINE_NUM_COLS 10
+
+char *vcfDataLineAutoSqlString =
+"table vcfDataLine"
+"\"The fields of a Variant Call Format data line\""
+"    ("
+"    string chrom;      \"An identifier from the reference genome\""
+"    uint pos;          \"The reference position, with the 1st base having position 1\""
+"    string id;         \"Semi-colon separated list of unique identifiers where available\""
+"    string ref;        \"Reference base(s)\""
+"    string alt;        \"Comma separated list of alternate non-reference alleles called on at least one of the samples\""
+"    string qual;       \"Phred-scaled quality score for the assertion made in ALT. i.e. give -10log_10 prob(call in ALT is wrong)\""
+"    string filter;     \"PASS if this position has passed all filters. Otherwise, a semicolon-separated list of codes for filters that fail\""
+"    string info;       \"Additional information encoded as a semicolon-separated series of short keys with optional comma-separated values\""
+"    string format;     \"If genotype columns are specified in header, a colon-separated list of short keys starting with GT\""
+"    string genotypes;  \"If genotype columns are specified in header, a tab-separated set of genotype column values; each value is a colon-separated list of values corresponding to keys in the format column\""
+"    )";
+
+boolean isVcfTable(char *table)
+/* Return TRUE if table corresponds to a VCF file. */
+{
+if (isHubTrack(table))
+    {
+    struct trackDb *tdb = hashFindVal(fullTrackAndSubtrackHash, table);
+    return startsWithWord("vcfTabix", tdb->type);
+    }
+else
+    return trackIsType(database, table, curTrack, "vcfTabix", ctLookupName);
+}
+
+char *vcfFileName(char *table, struct sqlConnection *conn, char *seqName)
+/* Return file name associated with VCF.  This handles the difference between
+ * a custom track and a built-in track.  Do a freeMem on returned string when done. */
+{
+char *fileName = bigFileNameFromCtOrHub(table, conn);
+if (fileName == NULL)
+    fileName = bamFileNameFromTable(conn, table, seqName);
+return fileName;
+}
+
+struct asObject *vcfAsObj()
+/* Return asObject describing fields of VCF */
+{
+return asParseText(vcfDataLineAutoSqlString);
+}
+
+struct hTableInfo *vcfToHti(char *table)
+/* Get standard fields of VCF into hti structure. */
+{
+struct hTableInfo *hti;
+AllocVar(hti);
+hti->rootName = cloneString(table);
+hti->isPos = TRUE;
+strcpy(hti->chromField, "chrom");
+strcpy(hti->startField, "pos");
+strcpy(hti->nameField, "id");
+hti->type = cloneString("vcfTabix");
+return hti;
+}
+
+struct slName *vcfGetFields(char *table)
+/* Get fields of vcf as simple name list. */
+{
+struct asObject *as = vcfAsObj();
+struct slName *names = asColNames(as);
+return names;
+}
+
+struct sqlFieldType *vcfListFieldsAndTypes()
+/* Get fields of VCF as list of sqlFieldType. */
+{
+struct asObject *as = vcfAsObj();
+struct sqlFieldType *list = sqlFieldTypesFromAs(as);
+return list;
+}
+
+void dyJoin(struct dyString *dy, char *delim, char **words, int wordCount)
+/* Clear dy and append wordCount words interspersed with delim. */
+{
+dyStringClear(dy);
+int i;
+for (i = 0; i < wordCount; i++)
+    {
+    if (i > 0)
+        dyStringAppend(dy, delim);
+    dyStringAppend(dy, words[i]);
+    }
+}
+
+void vcfInfoElsToString(struct dyString *dy, struct vcfFile *vcff, struct vcfRecord *rec)
+/* Unpack rec's typed infoElements to semicolon-sep'd string in dy. */
+{
+dyStringClear(dy);
+if (rec->infoCount == 0)
+    dyStringAppendC(dy, '.');
+int i;
+for (i = 0; i < rec->infoCount; i++)
+    {
+    if (i > 0)
+        dyStringAppendC(dy, ';');
+    const struct vcfInfoElement *el = &(rec->infoElements[i]);
+    const struct vcfInfoDef *def = vcfInfoDefForKey(vcff, el->key);
+    // If the key has no definition in the header, fall back to untyped/string handling:
+    enum vcfInfoType type = def ? def->type : vcfInfoNoType;
+    dyStringAppend(dy, el->key);
+    if (el->count > 0)
+        dyStringAppendC(dy, '=');
+    int j;
+    for (j = 0; j < el->count; j++)
+        {
+        if (j > 0)
+            dyStringAppendC(dy, ',');
+        union vcfDatum dat = el->values[j];
+        switch (type)
+            {
+            case vcfInfoInteger:
+                dyStringPrintf(dy, "%d", dat.datInt);
+                break;
+            case vcfInfoFloat:
+                {
+                // use big precision and erase trailing zeros:
+                char fbuf[64];
+                safef(fbuf, sizeof(fbuf), "%.16lf", dat.datFloat);
+                int i;
+                for (i = strlen(fbuf) - 1; i > 0; i--)
+                    if (fbuf[i] == '0')
+                        fbuf[i] = '\0';
+                    else
+                        break;
+                dyStringAppend(dy, fbuf);
+                }
+                break;
+            case vcfInfoCharacter:
+                dyStringAppendC(dy, dat.datChar);
+                break;
+            case vcfInfoFlag: // Flags could have values in older VCF
+            case vcfInfoNoType:
+            case vcfInfoString:
+                dyStringAppend(dy, dat.datString);
+                break;
+            default:
+                errAbort("Invalid vcfInfoType %d (how did this get past the parser?)", type);
+            }
+        }
+    }
+}
+
+#define VCF_NUM_BUF_SIZE 256
+
+void vcfRecordToRow(struct vcfRecord *rec, char *chrom, char *numBuf, struct dyString *dyAlt,
+                    struct dyString *dyFilter, struct dyString *dyInfo, struct dyString *dyGt,
+                    char *row[VCFDATALINE_NUM_COLS])
+/* Convert vcfRecord to array of strings, using numBuf to store ascii
+ * versions of numbers and dyStrings to store stringified arrays. */
+{
+struct vcfFile *vcff = rec->file;
+char *numPt = numBuf;
+row[0] = chrom; // VCF files' chrom field often lacks "chr" -- use region->chrom from caller.
+row[1] = numPt;
+numPt += safef(numPt, VCF_NUM_BUF_SIZE - (numPt-numBuf), "%u", rec->chromStart+1);
+numPt++;
+row[2] = rec->name;
+row[3] = rec->alleles[0];
+dyJoin(dyAlt, ",", &(rec->alleles[1]), rec->alleleCount-1);
+row[4] = dyAlt->string;
+row[5] = rec->qual;
+dyJoin(dyFilter, ";", rec->filters, rec->filterCount);
+row[6] = dyFilter->string;
+vcfInfoElsToString(dyInfo, vcff, rec);
+row[7] = dyInfo->string;
+if (vcff->genotypeCount > 0)
+    {
+    row[8] = rec->format;
+    dyJoin(dyGt, "\t", rec->genotypeUnparsedStrings, vcff->genotypeCount);
+    row[9] = dyGt->string;
+    }
+else
+    row[8] = row[9] = ""; // compatible with localmem usage
+}
+
+void vcfTabOut(char *db, char *table, struct sqlConnection *conn, char *fields, FILE *f)
+/* Print out selected fields from VCF.  If fields is NULL, then print out all fields. */
+{
+struct hTableInfo *hti = NULL;
+hti = getHti(db, table, conn);
+struct hash *idHash = NULL;
+char *idField = getIdField(db, curTrack, table, hti);
+int idFieldNum = 0;
+
+/* if we know what field to use for the identifiers, get the hash of names */
+if (idField != NULL)
+    idHash = identifierHash(db, table);
+
+if (f == NULL)
+    f = stdout;
+
+/* Convert comma separated list of fields to array. */
+int fieldCount = chopByChar(fields, ',', NULL, 0);
+char **fieldArray;
+AllocArray(fieldArray, fieldCount);
+chopByChar(fields, ',', fieldArray, fieldCount);
+
+/* Get list of all fields in VCF and turn it into a hash of column indexes keyed by
+ * column name. */
+struct hash *fieldHash = hashNew(0);
+struct slName *bb, *bbList = vcfGetFields(table);
+int i;
+for (bb = bbList, i=0; bb != NULL; bb = bb->next, ++i)
+    {
+    /* if we know the field for identifiers, save it away */
+    if ((idField != NULL) && sameString(idField, bb->name))
+        idFieldNum = i;
+    hashAddInt(fieldHash, bb->name, i);
+    }
+
+/* Create an array of column indexes corresponding to the selected field list. */
+int *columnArray;
+AllocArray(columnArray, fieldCount);
+for (i=0; i<fieldCount; ++i)
+    {
+    columnArray[i] = hashIntVal(fieldHash, fieldArray[i]);
+    }
+
+/* Output row of labels if we are outputting only selected columns.
+ * We will include the original VCF header below, and adding a comment line
+ * at the top invalidates the VCF. */
+boolean allFields = (fieldCount == VCFDATALINE_NUM_COLS);
+if (!allFields)
+    {
+    fprintf(f, "#%s", fieldArray[0]);
+    for (i=1; i<fieldCount; ++i)
+        fprintf(f, "\t%s", fieldArray[i]);
+    fprintf(f, "\n");
+    }
+
+struct asObject *as = vcfAsObj();
+struct asFilter *filter = NULL;
+if (anyFilter())
+    filter = asFilterFromCart(cart, db, table, as);
+
+/* Loop through outputting each region */
+struct region *region, *regionList = getRegions();
+int maxOut = bigFileMaxOutput();
+// Include the header, absolutely necessary for VCF parsing.
+boolean printedHeader = FALSE;
+// Temporary storage for row-ification:
+struct dyString *dyAlt = newDyString(1024);
+struct dyString *dyFilter = newDyString(1024);
+struct dyString *dyInfo = newDyString(1024);
+struct dyString *dyGt = newDyString(1024);
+struct vcfRecord *rec;
+for (region = regionList; region != NULL && (maxOut > 0); region = region->next)
+    {
+    char *fileName = vcfFileName(table, conn, region->chrom);
+    struct vcfFile *vcff = vcfTabixFileMayOpen(fileName, region->chrom, region->start, region->end,
+                                               100);
+    // If we are outputting all fields, but this VCF has no genotype info, omit the
+    // genotype columns from output:
+    if (allFields && vcff->genotypeCount == 0)
+        fieldCount = VCFDATALINE_NUM_COLS - 2;
+    if (!printedHeader)
+        {
+        fprintf(f, "%s", vcff->headerString);
+        if (filter)
+            fprintf(f, "# Filtering on %d columns\n", slCount(filter->columnList));
+        printedHeader = TRUE;
+        }
+    char *row[VCFDATALINE_NUM_COLS];
+    char numBuf[VCF_NUM_BUF_SIZE];
+    for (rec = vcff->records; rec != NULL && (maxOut > 0); rec = rec->next)
+        {
+        vcfRecordToRow(rec, region->chrom, numBuf, dyAlt, dyFilter, dyInfo, dyGt, row);
+        if (asFilterOnRow(filter, row))
+            {
+            /* if we're looking for identifiers, check if this matches */
+            if ((idHash != NULL) && (hashLookup(idHash, row[idFieldNum]) == NULL))
+                continue;
+            // All fields output: after asFilter'ing, preserve original VCF chrom
+            if (allFields && !sameString(rec->chrom, region->chrom))
+                row[0] = rec->chrom;
+            int i;
+            fprintf(f, "%s", row[columnArray[0]]);
+            for (i=1; i<fieldCount; ++i)
+                {
+                fprintf(f, "\t%s", row[columnArray[i]]);
+                }
+            fprintf(f, "\n");
+            maxOut --;
+            }
+        }
+    vcfFileFree(&vcff);
+    freeMem(fileName);
+    }
+
+if (maxOut == 0)
+    warn("Reached output limit of %d data values, please make region smaller,\n\tor set a higher output line limit with the filter settings.", bigFileMaxOutput());
+/* Clean up and exit. */
+dyStringFree(&dyAlt);  dyStringFree(&dyFilter);  dyStringFree(&dyInfo);  dyStringFree(&dyGt);
+hashFree(&fieldHash);
+freeMem(fieldArray);
+freeMem(columnArray);
+}
+
+static void addFilteredBedsOnRegion(char *fileName, struct region *region, char *table,
+                                    struct asFilter *filter, struct lm *bedLm,
+                                    struct bed **pBedList, struct hash *idHash)
+/* Add relevant beds in reverse order to pBedList */
+{
+struct vcfFile *vcff = vcfTabixFileMayOpen(fileName, region->chrom, region->start, region->end,
+                                           100);
+struct lm *lm = lmInit(0);
+char *row[VCFDATALINE_NUM_COLS];
+char numBuf[VCF_NUM_BUF_SIZE];
+// Temporary storage for row-ification:
+struct dyString *dyAlt = newDyString(1024);
+struct dyString *dyFilter = newDyString(1024);
+struct dyString *dyInfo = newDyString(1024);
+struct dyString *dyGt = newDyString(1024);
+struct vcfRecord *rec;
+for (rec = vcff->records; rec != NULL; rec = rec->next)
+    {
+    vcfRecordToRow(rec, region->chrom, numBuf, dyAlt, dyFilter, dyInfo, dyGt, row);
+    if (asFilterOnRow(filter, row))
+        {
+        if ((idHash != NULL) && (hashLookup(idHash, rec->name) == NULL))
+            continue;
+        struct bed *bed;
+        lmAllocVar(bedLm, bed);
+        bed->chrom = lmCloneString(bedLm, region->chrom);
+        bed->chromStart = rec->chromStart;
+        bed->chromEnd = rec->chromEnd;
+        bed->name = lmCloneString(bedLm, rec->name);
+        slAddHead(pBedList, bed);
+        }
+    }
+dyStringFree(&dyAlt);  dyStringFree(&dyFilter);  dyStringFree(&dyInfo);  dyStringFree(&dyGt);
+lmCleanup(&lm);
+vcfFileFree(&vcff);
+}
+
+struct bed *vcfGetFilteredBedsOnRegions(struct sqlConnection *conn,
+        char *db, char *table, struct region *regionList, struct lm *lm,
+        int *retFieldCount)
+/* Get list of beds from VCF, in all regions, that pass filtering. */
+{
+/* Figure out vcf file name, get column info and filter. */
+struct asObject *as = vcfAsObj();
+struct asFilter *filter = asFilterFromCart(cart, db, table, as);
+struct hash *idHash = identifierHash(db, table);
+
+/* Get beds a region at a time. */
+struct bed *bedList = NULL;
+struct region *region;
+for (region = regionList; region != NULL; region = region->next)
+    {
+    char *fileName = vcfFileName(table, conn, region->chrom);
+    addFilteredBedsOnRegion(fileName, region, table, filter, lm, &bedList, idHash);
+    freeMem(fileName);
+    }
+slReverse(&bedList);
+return bedList;
+}
+
+struct slName *randomVcfIds(char *table, struct sqlConnection *conn, int count)
+/* Return some semi-random IDs from a VCF file. */
+{
+/* Read 10000 items from vcf file, or if they ask for a big list, then 4x what they ask for. */
+char *fileName = vcfFileName(table, conn, NULL);
+struct lineFile *lf = lineFileTabixMayOpen(fileName, TRUE);
+if (lf == NULL)
+    errAbort("%s", "");
+int orderedCount = count * 4;
+if (orderedCount < 10000)
+    orderedCount = 10000;
+struct slName *idList = NULL;
+char *words[4];
+int i;
+for (i = 0; i < orderedCount && lineFileChop(lf, words); i++)
+    slAddHead(&idList, slNameNew(words[2]));
+lineFileClose(&lf);
+/* Shuffle list and trim it to count if necessary. */
+shuffleList(&idList, 1);
+struct slName *sl;
+for (sl = idList, i = 0; sl != NULL; sl = sl->next, i++)
+    {
+    if (i+1 >= count)
+        {
+        slNameFreeList(&(sl->next));
+        break;
+        }
+    }
+freez(&fileName);
+return idList;
+}
+
+#define VCF_MAX_SCHEMA_COLS 20
+
+void showSchemaVcf(char *table)
+/* Show schema on vcf. */
+{
+struct sqlConnection *conn = hAllocConn(database);
+char *fileName = vcfFileName(table, conn, NULL);
+
+struct asObject *as = vcfAsObj();
+hPrintf("<B>Database:</B> %s", database);
+hPrintf(" <B>Primary Table:</B> %s<br>", table);
+hPrintf("<B>VCF File:</B> %s", fileName);
+hPrintf("<BR>\n");
+hPrintf("<B>Format description:</B> %s<BR>", as->comment);
+hPrintf("See the <A HREF=\"%s\" target=_blank>Variant Call Format specification</A> for more details<BR>\n",
+        "http://www.1000genomes.org/wiki/analysis/vcf4.0");
+
+/* Put up table that describes fields. */
+hTableStart();
+hPrintf("<TR><TH>field</TH>");
+hPrintf("<TH>description</TH> ");
+puts("</TR>\n");
+struct asColumn *col;
+int colCount = 0;
+for (col = as->columnList; col != NULL; col = col->next)
+    {
+    hPrintf("<TR><TD><TT>%s</TT></TD>", col->name);
+    hPrintf("<TD>%s</TD></TR>", col->comment);
+    ++colCount;
+    }
+hTableEnd();
+
+/* Put up another section with sample rows. */
+webNewSection("Sample Rows");
+hTableStart();
+
+/* Fetch sample rows. */
+struct lineFile *lf = lineFileTabixMayOpen(fileName, TRUE);
+if (lf == NULL)
+    errAbort("%s", "");
+char *row[VCF_MAX_SCHEMA_COLS];
+int i;
+for (i = 0; i < 10; i++)
+    {
+    int colCount = lineFileChop(lf, row);
+    int colIx;
+    if (i == 0)
+        {
+        // Print field names as column headers, using colCount to compute genotype span
+        hPrintf("<TR>");
+        for (colIx = 0, col = as->columnList; col != NULL && colIx < colCount;
+             colIx++, col = col->next)
+            {
+            if (sameString("genotypes", col->name) && colCount > colIx+1)
+                hPrintf("<TH colspan=%d>%s</TH>", colCount - colIx, col->name);
+            else
+                hPrintf("<TH>%s</TH>", col->name);
+            }
+        hPrintf("</TR>\n");
+        }
+    hPrintf("<TR>");
+    for (colIx=0; colIx < colCount; ++colIx)
+        {
+        if (colCount > VCFDATALINE_NUM_COLS && colIx == colCount - 1)
+            hPrintf("<TD>...</TD>");
+        else
+            writeHtmlCell(row[colIx]);
+        }
+    hPrintf("</TR>\n");
+    }
+hTableEnd();
+
+/* Clean up and go home. */
+lineFileClose(&lf);
+freeMem(fileName);
+hFreeConn(&conn);
+}
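
A note on the Float INFO formatting in vcfInfoElsToString above: values are printed with "%.16lf" and trailing zeros are then erased. The sketch below is a standalone illustration of that trimming idea, not part of the commit; trimmedDouble is a hypothetical name used only for illustration, and it additionally drops the decimal point that a whole number would otherwise leave behind (e.g. "3" rather than "3.").

#include <stdio.h>
#include <string.h>

/* trimmedDouble - illustrative only: format a double with high precision,
 * then erase trailing zeros and any dangling '.', so 0.5 -> "0.5" and 3.0 -> "3". */
static void trimmedDouble(double val, char *buf, size_t bufSize)
{
snprintf(buf, bufSize, "%.16lf", val);
int i = (int)strlen(buf) - 1;
while (i > 0 && buf[i] == '0')
    buf[i--] = '\0';
if (i > 0 && buf[i] == '.')
    buf[i] = '\0';
}

int main(void)
{
char buf[64];
trimmedDouble(0.5, buf, sizeof(buf));
printf("%s\n", buf);    /* prints "0.5" */
trimmedDouble(3.0, buf, sizeof(buf));
printf("%s\n", buf);    /* prints "3" */
return 0;
}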