7a2b8c93090dbf4365464458023ef9875488185b
kent
  Wed Feb 4 08:53:15 2015 -0800
Making it so that there can be warning messages as well as errors for VCF files.  Changing message 'missing ##fileformat= header line?  Assuming 4.1.' to a warning and indeed making it assume 4.1.  Also making a missing '#CHROM POS ...' column header line just a warning.

diff --git src/lib/vcf.c src/lib/vcf.c
index cde6810..b4e308b 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -156,45 +156,61 @@
 	   "Expected alternate allele count (typically used in association analyses)");
 }
 
 static bool vcfFileStopDueToErrors(struct vcfFile *vcff)
 /* determine if we should stop due to the number of errors */
 {
 return vcff->errCnt > vcff->maxErr;
 }
 
 static void vcfFileErr(struct vcfFile *vcff, char *format, ...)
 #if defined(__GNUC__)
 __attribute__((format(printf, 2, 3)))
 #endif
 ;
 
+static void vaVcfWarn(struct vcfFile *vcff, char *format, va_list args)
+/* Warn via vaWarn, prefixing format with "fileName:lineIx: " when vcff->lf is set. */
+{
+char withPos[1024];
+if (vcff->lf != NULL)
+    {
+    safef(withPos, sizeof(withPos), "%s:%d: %s", vcff->lf->fileName, vcff->lf->lineIx, format);
+    format = withPos;
+    }
+vaWarn(format, args);
+}
+
+static void vcfFileWarn(struct vcfFile *vcff, char *format, ...)
+/* Send warning to errabort stack's warn handler; unlike vcfFileErr, never bumps errCnt or errAborts. */
+{
+va_list args;
+va_start(args, format);
+vaVcfWarn(vcff, format, args);
+va_end(args);
+}
+
 static void vcfFileErr(struct vcfFile *vcff, char *format, ...)
 /* Send error message to errabort stack's warn handler and abort */
 {
 vcff->errCnt++;
 if (vcff->maxErr == VCF_IGNORE_ERRS)
     return;
 va_list args;
 va_start(args, format);
-char formatPlus[1024];
-if (vcff->lf != NULL)
-    sprintf(formatPlus, "%s:%d: %s", vcff->lf->fileName, vcff->lf->lineIx, format);
-else
-    strcpy(formatPlus, format);
-vaWarn(formatPlus, args);
+vaVcfWarn(vcff, format, args);
 va_end(args);
 if (vcfFileStopDueToErrors(vcff))
     errAbort("VCF: %d parser errors, quitting", vcff->errCnt);
 }
 
 static void *vcfFileAlloc(struct vcfFile *vcff, size_t size)
 /* Use vcff's local mem to allocate memory. */
 {
 return lmAlloc( vcfFileLm(vcff), size);
 }
 
 INLINE char *vcfFileCloneStrZ(struct vcfFile *vcff, char *str, size_t size)
 /* Use vcff's local mem to allocate memory for a string and copy it. */
 {
 return lmCloneStringZ( vcfFileLm(vcff), str, size);
 }
@@ -370,40 +386,36 @@
     {
     if (exp2 == NULL)
 	vcfFileErr(vcff, "Expected column %d's name in header to be \"%s\" but got \"%s\"",
 		   ix+1, exp1, words[ix]);
     else if (! sameString(exp2, words[ix]))
 	vcfFileErr(vcff, "Expected column %d's name in header to be \"%s\"  or \"%s\" "
 		   "but got \"%s\"", ix+1, exp1, exp2, words[ix]);
     }
 }
 
 #define expectColumnName(vcff, exp, words, ix) expectColumnName2(vcff, exp, NULL, words, ix)
 
 // There might be a whole lot of genotype columns...
 #define VCF_MAX_COLUMNS 16 * 1024
 
+char *vcfDefaultHeader = "#CHROM POS ID REF ALT QUAL FILTER INFO";
+/* Column header line substituted when the file has no "#CHROM POS ..." row of its own. */
+
 static void parseColumnHeaderRow(struct vcfFile *vcff, char *line)
 /* Make sure column names are as we expect, and store genotype sample IDs if any are given. */
 {
-if (line[0] != '#')
-    {
-    vcfFileErr(vcff, "Expected to find # followed by column names (\"#CHROM POS ...\"), "
-	       "not \"%s\"", line);
-    lineFileReuse(vcff->lf);
-    return;
-    }
 char *words[VCF_MAX_COLUMNS];
 int wordCount = chopLine(line+1, words);
 if (wordCount >= VCF_MAX_COLUMNS)
     vcfFileErr(vcff, "header contains at least %d columns; "
 	       "VCF_MAX_COLUMNS may need to be increased in vcf.c!", VCF_MAX_COLUMNS);
 expectColumnName(vcff, "CHROM", words, 0);
 expectColumnName(vcff, "POS", words, 1);
 expectColumnName(vcff, "ID", words, 2);
 expectColumnName(vcff, "REF", words, 3);
 expectColumnName(vcff, "ALT", words, 4);
 expectColumnName2(vcff, "QUAL", "PROB", words, 5);
 expectColumnName(vcff, "FILTER", words, 6);
 expectColumnName(vcff, "INFO", words, 7);
 if (wordCount > 8)
     {
@@ -469,39 +481,52 @@
 
 struct dyString *dyHeader = dyStringNew(1024);
 char *line = NULL;
 // First, metadata lines beginning with "##":
 while (lineFileNext(lf, &line, NULL) && startsWith("##", line))
     {
     dyStringAppend(dyHeader, line);
     dyStringAppendC(dyHeader, '\n');
     parseMetadataLine(vcff, line);
     }
 slReverse(&(vcff->infoDefs));
 slReverse(&(vcff->filterDefs));
 slReverse(&(vcff->gtFormatDefs));
 // Did we get the bare minimum VCF header with supported version?
 if (vcff->majorVersion == 0)
-    vcfFileErr(vcff, "missing ##fileformat= header line?  Assuming 4.1.");
+    {
+    vcfFileWarn(vcff, "missing ##fileformat= header line?  Assuming 4.1.");
+    vcff->majorVersion = 4;
+    vcff->minorVersion = 1;
+    }
 if ((vcff->majorVersion != 4 || (vcff->minorVersion != 0 && vcff->minorVersion != 1)) &&
     (vcff->majorVersion != 3))
     vcfFileErr(vcff, "VCFv%d.%d not supported -- only v3.*, v4.0 or v4.1",
 	       vcff->majorVersion, vcff->minorVersion);
 // Next, one header line beginning with single "#" that names the columns:
 if (line == NULL)
     // EOF after metadata
     return vcff;
+char headerLineBuf[256];
+if (line[0] != '#')
+    {
+    lineFileReuse(lf);
+    vcfFileWarn(vcff, "Expected to find # followed by column names (\"#CHROM POS ...\"), "
+	       "assuming default VCF 4.1 columns");
+    safef(headerLineBuf, sizeof(headerLineBuf), "%s", vcfDefaultHeader);
+    line = headerLineBuf;
+    }
 dyStringAppend(dyHeader, line);
 dyStringAppendC(dyHeader, '\n');
 parseColumnHeaderRow(vcff, line);
 vcff->headerString = dyStringCannibalize(&dyHeader);
 return vcff;
 }
 
 
 #define VCF_MAX_INFO 512
 
 static void parseRefAndAlt(struct vcfFile *vcff, struct vcfRecord *record, char *ref, char *alt)
 /* Make an array of alleles, ref first, from the REF and comma-sep'd ALT columns.
  * Use the length of the reference sequence to set record->chromEnd.
  * Note: this trashes the alt argument, since this is expected to be its last use. */
 {