9c9a47ea66cec45d4c7af1c82dfcb9d8f8693e3a angie Thu Apr 17 10:29:31 2014 -0700 vcfNextRecord was using lineFileChop instead of lineFileChopTab, which masked a data bug in 1000 Genomes Phase 1 VCF: some lines have an extra tab character at the end. Now we chop by tabs, but ignore a single empty extra column because an extra tab at the end is a pretty common error. fixes #13091 diff --git src/lib/vcf.c src/lib/vcf.c index e229f03..ab6dc6e 100644 --- src/lib/vcf.c +++ src/lib/vcf.c @@ -675,42 +675,54 @@ parseFilterColumn(vcff, record, words[6]); parseInfoColumn(vcff, record, words[7]); if (vcff->genotypeCount > 0) { record->format = vcfFilePooledStr(vcff, words[8]); record->genotypeUnparsedStrings = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(char *)); int i; // Don't bother actually parsing all these until & unless we need the info: for (i = 0; i < vcff->genotypeCount; i++) record->genotypeUnparsedStrings[i] = vcfFileCloneStr(vcff, words[9+i]); } return record; } +static int checkWordCount(struct vcfFile *vcff, char **words, int wordCount) +// Compensate for error in 1000 Genomes Phase 1 file +// ALL.chr21.integrated_phase1_v3.20101123.snps_indels_svs.genotypes.vcf.gz +// which has some lines that have an extra "\t" at the end of line, +// causing the wordCount to be too high by 1: +{ +int expected = 8; +if (vcff->genotypeCount > 0) + expected = 9 + vcff->genotypeCount; +if (wordCount == expected+1 && words[expected][0] == '\0') + wordCount--; +lineFileExpectWords(vcff->lf, expected, wordCount); +return wordCount; +} + struct vcfRecord *vcfNextRecord(struct vcfFile *vcff) /* Parse the words in the next line from vcff into a vcfRecord. Return NULL at end of file. * Note: this does not store record in vcff->records! 
*/ { char *words[VCF_MAX_COLUMNS]; int wordCount; -if ((wordCount = lineFileChop(vcff->lf, words)) <= 0) +if ((wordCount = lineFileChopTab(vcff->lf, words)) <= 0) return NULL; -int expected = 8; -if (vcff->genotypeCount > 0) - expected = 9 + vcff->genotypeCount; -lineFileExpectWords(vcff->lf, expected, wordCount); +wordCount = checkWordCount(vcff, words, wordCount); return vcfRecordFromRow(vcff, words); } static boolean allelesHavePaddingBase(char **alleles, int alleleCount) /* Examine alleles to see if they either a) all start with the same base or * b) include a symbolic or 0-length allele. In either of those cases, there * must be an initial padding base that we'll need to trim from non-symbolic * alleles. */ { boolean hasPaddingBase = TRUE; char firstBase = '\0'; if (isAllNt(alleles[0], strlen(alleles[0]))) firstBase = alleles[0][0]; int i; for (i = 1; i < alleleCount; i++) @@ -899,34 +911,31 @@ if (startsWith("http://", fileOrUrl) || startsWith("ftp://", fileOrUrl) || startsWith("https://", fileOrUrl)) lf = netLineFileOpen(fileOrUrl); else lf = lineFileMayOpen(fileOrUrl, TRUE); struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr); if (vcff && chrom != NULL) { char *line = NULL; while (lineFileNextReal(vcff->lf, &line)) { char lineCopy[strlen(line)+1]; safecpy(lineCopy, sizeof(lineCopy), line); char *words[VCF_MAX_COLUMNS]; int wordCount = chopTabs(lineCopy, words); - int expected = 8; - if (vcff->genotypeCount > 0) - expected = 9 + vcff->genotypeCount; - lineFileExpectWords(vcff->lf, expected, wordCount); + wordCount = checkWordCount(vcff, words, wordCount); struct vcfRecord *record = vcfRecordFromRow(vcff, words); if (chromsMatch(chrom, record->chrom)) { if (record->chromEnd < start) continue; else { lineFileReuse(vcff->lf); break; } } } } if (vcff && parseAll) {