src/lib/vcf.c 9c9a47ea66cec45d4c7af1c82dfcb9d8f8693e3a

9c9a47ea66cec45d4c7af1c82dfcb9d8f8693e3a
angie
  Thu Apr 17 10:29:31 2014 -0700
vcfNextRecord was using lineFileChop instead of lineFileChopTab, which masked a data bug in 1000 Genomes Phase 1 VCF: some lines have
an extra tab character at the end.  Now we chop by tabs, but ignore
a single empty extra column because an extra tab at the end is a
pretty common error.
fixes #13091

diff --git src/lib/vcf.c src/lib/vcf.c
index e229f03..ab6dc6e 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -675,42 +675,54 @@
 parseFilterColumn(vcff, record, words[6]);
 parseInfoColumn(vcff, record, words[7]);
 if (vcff->genotypeCount > 0)
     {
     record->format = vcfFilePooledStr(vcff, words[8]);
     record->genotypeUnparsedStrings = vcfFileAlloc(vcff,
 						   vcff->genotypeCount * sizeof(char *));
     int i;
     // Don't bother actually parsing all these until & unless we need the info:
     for (i = 0;  i < vcff->genotypeCount;  i++)
 	record->genotypeUnparsedStrings[i] = vcfFileCloneStr(vcff, words[9+i]);
     }
 return record;
 }
 
+static int checkWordCount(struct vcfFile *vcff, char **words, int wordCount)
+// Return wordCount, reduced by 1 when the only surplus column is an empty trailing one.
+// Compensates for an error in 1000 Genomes Phase 1 file
+// ALL.chr21.integrated_phase1_v3.20101123.snps_indels_svs.genotypes.vcf.gz
+// in which some lines end with an extra "\t", making wordCount too high by 1.
+{
+int expected = 8;  // columns words[0..7] when there are no genotype columns
+if (vcff->genotypeCount > 0)
+    expected = 9 + vcff->genotypeCount;  // plus format column (words[8]) and one per genotype
+if (wordCount == expected+1 && words[expected][0] == '\0')  // exactly one extra word, and it is empty
+    wordCount--;
+lineFileExpectWords(vcff->lf, expected, wordCount);  // NOTE(review): presumably reports a count mismatch -- confirm
+return wordCount;
+}
+
 struct vcfRecord *vcfNextRecord(struct vcfFile *vcff)
 /* Parse the words in the next line from vcff into a vcfRecord. Return NULL at end of file.
  * Note: this does not store record in vcff->records! */
 {
 char *words[VCF_MAX_COLUMNS];
 int wordCount;
-if ((wordCount = lineFileChop(vcff->lf, words)) <= 0)
+if ((wordCount = lineFileChopTab(vcff->lf, words)) <= 0)  // tab-delimited chop; a trailing tab yields an extra empty word
     return NULL;
-int expected = 8;
-if (vcff->genotypeCount > 0)
-    expected = 9 + vcff->genotypeCount;
-lineFileExpectWords(vcff->lf, expected, wordCount);
+wordCount = checkWordCount(vcff, words, wordCount);  // tolerates a single empty extra trailing column
 return vcfRecordFromRow(vcff, words);
 }
 
 static boolean allelesHavePaddingBase(char **alleles, int alleleCount)
 /* Examine alleles to see if they either a) all start with the same base or
  * b) include a symbolic or 0-length allele.  In either of those cases, there
  * must be an initial padding base that we'll need to trim from non-symbolic
  * alleles. */
 {
 boolean hasPaddingBase = TRUE;
 char firstBase = '\0';
 if (isAllNt(alleles[0], strlen(alleles[0])))
     firstBase = alleles[0][0];
 int i;
 for (i = 1;  i < alleleCount;  i++)
@@ -899,34 +911,31 @@
 if (startsWith("http://", fileOrUrl) || startsWith("ftp://", fileOrUrl) ||
     startsWith("https://", fileOrUrl))
     lf = netLineFileOpen(fileOrUrl);
 else
     lf = lineFileMayOpen(fileOrUrl, TRUE);
 struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr);
 if (vcff && chrom != NULL)
     {
     char *line = NULL;
     while (lineFileNextReal(vcff->lf, &line))
 	{
 	char lineCopy[strlen(line)+1];
 	safecpy(lineCopy, sizeof(lineCopy), line);
 	char *words[VCF_MAX_COLUMNS];
 	int wordCount = chopTabs(lineCopy, words);
-	int expected = 8;
-	if (vcff->genotypeCount > 0)
-	    expected = 9 + vcff->genotypeCount;
-	lineFileExpectWords(vcff->lf, expected, wordCount);
+	wordCount = checkWordCount(vcff, words, wordCount);
 	struct vcfRecord *record = vcfRecordFromRow(vcff, words);
 	if (chromsMatch(chrom, record->chrom))
 	    {
 	    if (record->chromEnd < start)
 		continue;
 	    else
 		{
 		lineFileReuse(vcff->lf);
 		break;
 		}
 	    }
 	}
     }
 if (vcff && parseAll)
     {