8a0946dd6870f10cde056ba243f1fb4ec1fd16b4
angie
  Thu Feb 27 11:58:33 2014 -0800
Adding support for plain VCF custom tracks (as opposed to VCF+tabix), since users seem to want to upload VCF, and as long as the file is
not too big it will work OK.  This means adding a new track type
vcf (as opposed to vcfTabix) and supporting it in hgTracks, hgTrackUi,
hgc, hgTables and hgVai.  (and others I've forgotten?)
refs #12416
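
A rough usage sketch (not part of the commit): how a caller might load a small
plain-VCF custom track for one region with the new vcfFileMayOpen() signature,
compared with the existing tabix path.  The file names, region coordinates and
maxErr value below are hypothetical.

    #include "common.h"
    #include "vcf.h"

    // Plain VCF: the file is scanned linearly from the top, so this is only
    // reasonable for smallish custom-track uploads.
    struct vcfFile *vcff = vcfFileMayOpen("ct_userUpload.vcf", "chr21", 33031597, 33041570,
                                          100, -1, TRUE);
    // VCF+tabix: the index lets the library seek straight to the region instead:
    //   vcfTabixFileMayOpen("variants.vcf.gz", "chr21", 33031597, 33041570, 100, -1);
    if (vcff != NULL)
        {
        struct vcfRecord *rec;
        for (rec = vcff->records;  rec != NULL;  rec = rec->next)
            printf("%s:%d-%d\n", rec->chrom, (int)rec->chromStart, (int)rec->chromEnd);
        }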

diff --git src/lib/vcf.c src/lib/vcf.c
index b34146f..0d7bb0b 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -755,92 +755,147 @@
 	int i;
 	for (i = 0;  i < rec->alleleCount;  i++)
 	    {
 	    if (rec->alleles[i][1] == '\0')
 		rec->alleles[i] = vcfFilePooledStr(vcff, "-");
 	    else if (isAllNt(rec->alleles[i], strlen(rec->alleles[i])))
 		rec->alleles[i] = vcfFilePooledStr(vcff, rec->alleles[i]+1);
 	    else // don't trim first character of symbolic allele
 		rec->alleles[i] = vcfFilePooledStr(vcff, rec->alleles[i]);
 	    }
 	}
     }
 return chromStartOrig;
 }
 
-static struct vcfRecord *vcfParseData(struct vcfFile *vcff, int maxRecords)
+static boolean chromsMatch(char *chromA, char *chromB)
+// Return TRUE if chromA and chromB are non-NULL and identical, possibly ignoring
+// "chr" at the beginning of one but not the other.
+{
+if (chromA == NULL || chromB == NULL)
+    return FALSE;
+char *chromAPlus = startsWith("chr", chromA) ? chromA+3 : chromA;
+char *chromBPlus = startsWith("chr", chromB) ? chromB+3 : chromB;
+return sameString(chromAPlus, chromBPlus);
+}
+
+static struct vcfRecord *vcfParseData(struct vcfFile *vcff, char *chrom, int start, int end,
+				      int maxRecords)
 // Given a vcfFile into which the header has been parsed, and whose
 // lineFile is positioned at the beginning of a data row, parse and
-// return all data rows from lineFile.
+// return all data rows (in region, if chrom is non-NULL) from lineFile,
+// up to maxRecords.
 {
 if (vcff == NULL)
     return NULL;
 int recCount = 0;
 struct vcfRecord *records = NULL;
 struct vcfRecord *record;
 while ((record = vcfNextRecord(vcff)) != NULL)
     {
     if (maxRecords >= 0 && recCount >= maxRecords)
         break;
+    if (chrom == NULL)
+	{
+	slAddHead(&records, record);
+	recCount++;
+	}
+    else if (chromsMatch(chrom, record->chrom))
+	{
+	if (end > 0 && record->chromStart >= end)
+	    break;
+	else if (record->chromEnd > start)
+	    {
 	    slAddHead(&records, record);
 	    recCount++;
 	    }
+	}
+    }
 slReverse(&records);
-
 return records;
 }
 
-struct vcfFile *vcfFileMayOpen(char *fileOrUrl, int maxErr, int maxRecords, boolean parseAll)
+struct vcfFile *vcfFileMayOpen(char *fileOrUrl, char *chrom, int start, int end,
+			       int maxErr, int maxRecords, boolean parseAll)
 /* Open fileOrUrl and parse VCF header; return NULL if unable.
- * If parseAll, then read in all lines, parse and store in
+ * If chrom is non-NULL, scan past any variants that precede {chrom, chromStart}.
+ * Note: this is very inefficient -- it's better to use vcfTabix if possible!
+ * If parseAll, then read in all lines in region, parse and store in
  * vcff->records; if maxErr >= zero, then continue to parse until
  * there are maxErr+1 errors.  A maxErr less than zero does not stop
  * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence */
 {
 struct lineFile *lf = NULL;
 if (startsWith("http://", fileOrUrl) || startsWith("ftp://", fileOrUrl) ||
     startsWith("https://", fileOrUrl))
     lf = netLineFileOpen(fileOrUrl);
 else
     lf = lineFileMayOpen(fileOrUrl, TRUE);
 struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr);
-if (parseAll)
+if (vcff && chrom != NULL)
+    {
+    char *line = NULL;
+    int size = 0;
+    while (lineFileNext(vcff->lf, &line, &size))
+	{
+	char lineCopy[size+1];
+	safecpy(lineCopy, sizeof(lineCopy), line);
+	char *words[VCF_MAX_COLUMNS];
+	int wordCount = chopTabs(lineCopy, words);
+	int expected = 8;
+	if (vcff->genotypeCount > 0)
+	    expected = 9 + vcff->genotypeCount;
+	lineFileExpectWords(vcff->lf, expected, wordCount);
+	struct vcfRecord *record = vcfRecordFromRow(vcff, words);
+	if (chromsMatch(chrom, record->chrom))
+	    {
+	    if (record->chromEnd < start)
+		continue;
+	    else
+		{
+		lineFileReuse(vcff->lf);
+		break;
+		}
+	    }
+	}
+    }
+if (vcff && parseAll)
     {
-    vcff->records = vcfParseData(vcff, maxRecords);
+    vcff->records = vcfParseData(vcff, chrom, start, end, maxRecords);
     lineFileClose(&(vcff->lf)); // Not sure why it is closed.  Angie?
     }
 return vcff;
 }
 
 struct vcfFile *vcfTabixFileMayOpen(char *fileOrUrl, char *chrom, int start, int end,
 				    int maxErr, int maxRecords)
 /* Open a VCF file that has been compressed and indexed by tabix and
  * parse VCF header, or return NULL if unable.  If chrom is non-NULL,
  * seek to the position range and parse all lines in range into
  * vcff->records.  If maxErr >= zero, then continue to parse until
  * there are maxErr+1 errors.  A maxErr less than zero does not stop
  * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence */
 {
 struct lineFile *lf = lineFileTabixMayOpen(fileOrUrl, TRUE);
 struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr);
 if (vcff == NULL)
     return NULL;
 if (isNotEmpty(chrom) && start != end)
     {
     if (lineFileSetTabixRegion(lf, chrom, start, end))
         {
-        vcff->records = vcfParseData(vcff, maxRecords);
+        vcff->records = vcfParseData(vcff, NULL, 0, 0, maxRecords);
         lineFileClose(&(vcff->lf)); // Not sure why it is closed.  Angie?
         }
     }
 return vcff;
 }
 
 int vcfRecordCmp(const void *va, const void *vb)
 /* Compare to sort based on position. */
 {
 const struct vcfRecord *a = *((struct vcfRecord **)va);
 const struct vcfRecord *b = *((struct vcfRecord **)vb);
 int dif;
 dif = strcmp(a->chrom, b->chrom);
 if (dif == 0)
     dif = a->chromStart - b->chromStart;
@@ -854,31 +909,31 @@
 int vcfTabixBatchRead(struct vcfFile *vcff, char *chrom, int start, int end,
                       int maxErr, int maxRecords)
 // Reads a batch of records from an opened and indexed VCF file, adding them to
 // vcff->records and returning the count of new records added in this batch.
 // Note: vcff->records will continue to be sorted, even if batches are loaded
 // out of order.  Additionally, resulting vcff->records will contain no duplicates
 // so returned count reflects only the new records added, as opposed to all records
 // in range.  If maxErr >= zero, then continue to parse until there are maxErr+1
 // errors.  A maxErr less than zero does not stop and reports all errors.  Set
 // maxErr to VCF_IGNORE_ERRS for silence.
 {
 int oldCount = slCount(vcff->records);
 
 if (lineFileSetTabixRegion(vcff->lf, chrom, start, end))
     {
-    struct vcfRecord *records = vcfParseData(vcff, maxRecords);
+    struct vcfRecord *records = vcfParseData(vcff, NULL, 0, 0, maxRecords);
     if (records)
         {
         struct vcfRecord *lastRec = vcff->records;
         if (lastRec == NULL)
             vcff->records = records;
         else
             {
             // Considered just asserting the batches were in order, but a problem may
             // result when non-overlapping location windows pick up the same long variant.
             slSortMergeUniq(&(vcff->records), records, vcfRecordCmp, NULL);
             }
         }
     }
 
 return slCount(vcff->records) - oldCount;