4344e168885850e053dc245826ab42f2b445c98f
angie
  Wed Aug 24 13:11:01 2011 -0700
Feature #2821 (VCF parser): Fun with flawed input.  This file's rows have 6 keywords in the format column, but most genotype columns have only 5
pieces of data:
ftp://ftp-trace.ncbi.nlm.nih.gov/1000genomes/ftp/release/20100804/supporting/AFR.BI_withr2.20100804.genotypes.vcf.gz
This causes many vcfFileErr calls in vcfParseGenotypes.  Turned out that
vcfFileErr was not stopping after the specified # of errors; and that the
warnings are so verbose that I don't think we really need to display them
in hgTracks and hgc. So they are now ignored, and we don't segv on missing
genotype info.

diff --git src/lib/vcf.c src/lib/vcf.c
index 35e834a..0b8bc81 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -159,33 +159,37 @@
 return vcff->errCnt > vcff->maxErr;
 }
 
 static void vcfFileErr(struct vcfFile *vcff, char *format, ...)
 #if defined(__GNUC__)
 __attribute__((format(printf, 2, 3)))
 #endif
 ;
 
 static void vcfFileErr(struct vcfFile *vcff, char *format, ...)
 /* Send error message to errabort stack's warn handler and abort */
 {
 va_list args;
 va_start(args, format);
 char formatPlus[1024];
+if (vcff->lf != NULL)
 sprintf(formatPlus, "%s:%d: %s", vcff->lf->fileName, vcff->lf->lineIx, format);
+else
+    strcpy(formatPlus, format);
 vaWarn(formatPlus, args);
 va_end(args);
+vcff->errCnt++;
 if (vcfFileStopDueToErrors(vcff))
     errAbort("VCF: %d parser errors, quitting", vcff->errCnt);
 }
 
 static void *vcfFileAlloc(struct vcfFile *vcff, size_t size)
 /* allocate memory from the memory pool */
 {
 return lmAlloc(vcff->pool->lm, size);
 }
 
 static char *vcfFileCloneStrZ(struct vcfFile *vcff, char *str, size_t size)
 /* allocate memory for a string and copy it */
 {
 return lmCloneStringZ(vcff->pool->lm, str, size);
 }
@@ -233,31 +237,31 @@
     return vcfInfoString;
 vcfFileErr(vcff, "Unrecognized type word \"%s\" in metadata line \"%s\"", typeWord, line);
 return vcfInfoNoType;
 }
 
 // Regular expressions to check format and extract information from header lines:
 static const char *fileformatRegex = "^##(file)?format=VCFv([0-9]+)(\\.([0-9]+))?$";
 static const char *infoOrFormatRegex =
     "^##(INFO|FORMAT)="
     "<ID=([A-Za-z0-9_:-]+),"
     "Number=(\\.|[0-9-]+),"
     "Type=([A-Za-z]+),"
     "Description=\"?([^\"]+)\"?>$";
 static const char *filterOrAltRegex =
     "^##(FILTER|ALT)="
-    "<ID=([A-Za-z0-9_:-]+),"
+    "<ID=([^,]+),"
     "(Description|Type)=\"([^\"]+)\">$";
 
 INLINE void nonAsciiWorkaround(char *line)
 // Workaround for annoying 3-byte quote marks included in some 1000 Genomes files:
 {
 (void)strSwapStrs(line, strlen(line)+1, "\342\200\234", "\"");
 (void)strSwapStrs(line, strlen(line)+1, "\342\200\235", "\"");
 }
 
 static void parseMetadataLine(struct vcfFile *vcff, char *line)
 /* Parse a VCF header line beginning with "##" that defines a metadata. */
 {
 char *ptr = line;
 if (ptr == NULL && !startsWith(ptr, "##"))
     errAbort("Bad line passed to parseMetadataLine");
@@ -829,31 +833,30 @@
 	    if (sep == NULL)
 		gt->isHaploid = TRUE;
 	    else
 		gt->hapIxB = atoi(sep+1);
 	    }
 	struct vcfInfoElement *el = &(gt->infoElements[j]);
 	el->key = formatWords[j];
 	el->count = parseInfoValue(record, formatWords[j], formatTypes[j], gtWords[j],
 				   &(el->values));
 	if (el->count >= VCF_MAX_INFO)
 	    vcfFileErr(vcff, "A single element of the genotype column for \"%s\" "
 		       "has at least %d values; "
 		       "VCF_MAX_INFO may need to be increased in vcf.c!",
 		       gt->id, VCF_MAX_INFO);
 	}
-
     }
 record->genotypeUnparsedStrings = NULL;
 }
 
 const struct vcfGenotype *vcfRecordFindGenotype(struct vcfRecord *record, char *sampleId)
 /* Find the genotype and associated info for the individual, or return NULL.
  * This calls vcfParseGenotypes if it has not already been called. */
 {
 struct vcfFile *vcff = record->file;
 if (sampleId == NULL || vcff->genotypeCount == 0)
     return NULL;
 vcfParseGenotypes(record);
 int ix = stringArrayIx(sampleId, vcff->genotypeIds, vcff->genotypeCount);
 if (ix >= 0)
     return &(record->genotypes[ix]);