2b5eb866f050d964d8964ec5a84f7b63889cc6b1
angie
  Mon Jun 3 14:39:36 2013 -0700
New CGI, hgVai (Variant Annotation Integrator): simple checklist-style UI by which user can select variants that they have uploaded; gene
predictions to identify which part of a gene, if any, is hit by each
variant; several additional sources of annotations/predictions e.g.
dbNSFP scores and conserved elements/scores; and several filters to
constrain output to the variants most likely to have a functional effect.
Along with the new CGI, there are various lib bugfixes and improvements,
a new hg/lib/tests/ testcase, and some test file changes to accommodate
data updates to both knownGene and the pg* tables in knownGene.
refs #6152

diff --git src/lib/annoStreamVcf.c src/lib/annoStreamVcf.c
index 4e4b6d5..28856b1 100644
--- src/lib/annoStreamVcf.c
+++ src/lib/annoStreamVcf.c
@@ -1,31 +1,35 @@
 /* annoStreamVcf -- subclass of annoStreamer for VCF files */
 
 #include "annoStreamVcf.h"
+#include "twoBit.h"
 #include "vcf.h"
 
 struct annoStreamVcf
 {
     struct annoStreamer streamer;	// Parent class members & methods
     // Private members
     struct vcfFile *vcff;		// VCF parsed header and file object
     struct vcfRecord *record;		// Current parsed row of VCF (need this for chromEnd)
     char *asWords[VCF_NUM_COLS];	// Current row of VCF with genotypes squashed for autoSql
     struct dyString *dyGt;		// Scratch space for squashing genotype columns
+    struct hash *chromNameHash;		// Translate "chr"-less seq names if necessary.
     int numCols;			// Number of columns in autoSql def of VCF.
     int numFileCols;			// Number of columns in VCF file.
     boolean isTabix;			// True if we are accessing compressed VCF via tabix index
+    int maxRecords;			// Maximum number of annoRows to return.
+    int recordCount;			// Number of annoRows we have returned so far.
 };
 
 
 static void asvSetRegion(struct annoStreamer *vSelf, char *chrom, uint regionStart, uint regionEnd)
 /* Set region -- and free current sqlResult if there is one. */
 {
 annoStreamerSetRegion(vSelf, chrom, regionStart, regionEnd);
 struct annoStreamVcf *self = (struct annoStreamVcf *)vSelf;
 if (self->isTabix)
     lineFileSetTabixRegion(self->vcff->lf, chrom, regionStart, regionEnd);
 }
 
 static char *asvGetHeader(struct annoStreamer *vSelf)
 /* Return VCF header (e.g. for use by formatter) */
 {
@@ -78,106 +82,134 @@
 {
 struct annoStreamer *sSelf = (struct annoStreamer *)self;
 char *regionChrom = sSelf->chrom;
 uint regionStart = sSelf->regionStart;
 uint regionEnd = sSelf->regionEnd;
 if (minChrom != NULL)
     {
     if (regionChrom == NULL)
 	{
 	regionChrom = minChrom;
 	regionStart = minEnd;
 	regionEnd = annoAssemblySeqSize(sSelf->assembly, minChrom);
 	}
     else
 	{
-	if (differentString(minChrom, regionChrom))
-	    errAbort("annoStreamVcf %s: nextRow minChrom='%s' but region chrom='%s'",
-		     sSelf->name, minChrom, regionChrom);
 	regionStart = max(regionStart, minEnd);
 	}
     }
 char **words = nextRowRaw(self);
 if (minChrom != NULL && words != NULL)
     {
     if (self->isTabix && strcmp(words[0], minChrom) < 0)
 	{
 	uint regionEnd = sSelf->regionEnd;
 	if (sSelf->chrom == NULL)
 	    regionEnd = annoAssemblySeqSize(sSelf->assembly, minChrom);
 	lineFileSetTabixRegion(self->vcff->lf, minChrom, minEnd, regionEnd);
 	}
     while (words != NULL &&
 	   (strcmp(words[0], minChrom) < 0 ||
 	    (sameString(words[0], minChrom) && self->record->chromEnd < minEnd)))
 	words = nextRowRaw(self);
     }
 return words;
 }
 
-static struct annoRow *asvNextRow(struct annoStreamer *vSelf, char *minChrom, uint minEnd,
+static char *getProperChromName(struct annoStreamVcf *self, char *vcfChrom)
+/* We tolerate chr-less chrom names in VCF and BAM ("1" for "chr1" etc); to avoid
+ * confusing the rest of the system, return the chr-ful version if it exists. */
+{
+char *name = hashFindVal(self->chromNameHash, vcfChrom);
+if (name == NULL)
+    {
+    name = vcfChrom;
+    struct twoBitFile *tbf = self->streamer.assembly->tbf;
+    char buf[256];
+    if (! twoBitIsSequence(tbf, vcfChrom))
+	{
+	safef(buf, sizeof(buf), "chr%s", vcfChrom);
+	if (twoBitIsSequence(tbf, buf))
+	    name = buf;
+	}
+    name = lmCloneString(self->chromNameHash->lm, name);
+    hashAdd(self->chromNameHash, vcfChrom, name);
+    }
+return name;
+}
+
+static struct annoRow *asvNextRow(struct annoStreamer *sSelf, char *minChrom, uint minEnd,
 				  struct lm *callerLm)
 /* Return an annoRow encoding the next VCF record, or NULL if there are no more items. */
 {
-struct annoStreamVcf *self = (struct annoStreamVcf *)vSelf;
+struct annoStreamVcf *self = (struct annoStreamVcf *)sSelf;
+if (minChrom != NULL && sSelf->chrom != NULL && differentString(minChrom, sSelf->chrom))
+    errAbort("annoStreamVcf %s: nextRow minChrom='%s' but region chrom='%s'",
+	     sSelf->name, minChrom, sSelf->chrom);
+if (self->maxRecords > 0 && self->recordCount >= self->maxRecords)
+    return NULL;
 char **words = nextRowUnfiltered(self, minChrom, minEnd);
 if (words == NULL)
     return NULL;
 // Skip past any left-join failures until we get a right-join failure, a passing row, or EOF.
 boolean rightFail = FALSE;
-while (annoFilterRowFails(vSelf->filters, words, self->numCols, &rightFail))
+while (annoFilterRowFails(sSelf->filters, words, self->numCols, &rightFail))
     {
     if (rightFail)
 	break;
     words = nextRowUnfiltered(self, minChrom, minEnd);
     if (words == NULL)
 	return NULL;
     }
 struct vcfRecord *rec = self->record;
-return annoRowFromStringArray(rec->chrom, rec->chromStart, rec->chromEnd,
+char *chrom = getProperChromName(self, rec->chrom);
+self->recordCount++;
+return annoRowFromStringArray(chrom, rec->chromStart, rec->chromEnd,
 			      rightFail, words, self->numCols, callerLm);
 }
 
 static void asvClose(struct annoStreamer **pVSelf)
 /* Close VCF file and free self. */
 {
 if (pVSelf == NULL)
     return;
 struct annoStreamVcf *self = *(struct annoStreamVcf **)pVSelf;
 vcfFileFree(&(self->vcff));
 // Don't free self->record -- currently it belongs to vcff's localMem
 dyStringFree(&(self->dyGt));
 annoStreamerFree(pVSelf);
 }
 
 struct annoStreamer *annoStreamVcfNew(char *fileOrUrl, boolean isTabix, struct annoAssembly *aa,
 				      int maxRecords)
 /* Create an annoStreamer (subclass) object from a VCF file, which may
  * or may not have been compressed and indexed by tabix. */
 {
 int maxErr = -1; // don't errAbort on VCF format warnings/errs
 struct vcfFile *vcff;
 if (isTabix)
-    vcff = vcfTabixFileMayOpen(fileOrUrl, NULL, 0, 0, maxErr, maxRecords);
+    vcff = vcfTabixFileMayOpen(fileOrUrl, NULL, 0, 0, maxErr, 0);
 else
-    vcff = vcfFileMayOpen(fileOrUrl, maxErr, maxRecords, FALSE);
+    vcff = vcfFileMayOpen(fileOrUrl, maxErr, 0, FALSE);
 if (vcff == NULL)
     errAbort("annoStreamVcfNew: unable to open VCF: '%s'", fileOrUrl);
 struct annoStreamVcf *self;
 AllocVar(self);
 struct annoStreamer *streamer = &(self->streamer);
 struct asObject *asObj = vcfAsObj();
 annoStreamerInit(streamer, aa, asObj, fileOrUrl);
 streamer->rowType = arWords;
 streamer->setRegion = asvSetRegion;
 streamer->getHeader = asvGetHeader;
 streamer->nextRow = asvNextRow;
 streamer->close = asvClose;
 self->vcff = vcff;
 self->dyGt = dyStringNew(1024);
+self->chromNameHash = hashNew(0);
 self->isTabix = isTabix;
 self->numCols = slCount(asObj->columnList);
 self->numFileCols = 8;
 if (vcff->genotypeCount > 0)
     self->numFileCols = 9 + vcff->genotypeCount;
+self->maxRecords = maxRecords;
 return (struct annoStreamer *)self;
 }