3a1b00734b0c19737b48b171decfeb6b9edd27aa markd Wed May 6 18:16:34 2026 -0700 Fix issue with new htslib and access to tabix VCF header. Permanently enable tests that would have found this problem. They were disable by a non-extent USE_TABIX make vaiable. diff --git src/lib/vcf.c src/lib/vcf.c index 3db0767f942..eadb50c2372 100644 --- src/lib/vcf.c +++ src/lib/vcf.c @@ -2,30 +2,32 @@ * http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-40 * http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41 */ /* Copyright (C) 2014 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "dnautil.h" #include "errAbort.h" #include #include "localmem.h" #include "net.h" #include "regexHelper.h" #include "htslib/tbx.h" +#include "htslib/hts.h" +#include "htslib/kstring.h" #include "vcf.h" /* Reserved but optional INFO keys: */ const char *vcfInfoAncestralAllele = "AA"; const char *vcfInfoPerAlleleGtCount = "AC"; // allele count in genotypes, for each ALT allele, // in the same order as listed const char *vcfInfoAlleleFrequency = "AF"; // allele frequency for each ALT allele in the same // order as listed: use this when estimated from // primary data, not called genotypes const char *vcfInfoNumAlleles = "AN"; // total number of alleles in called genotypes const char *vcfInfoBaseQuality = "BQ"; // RMS base quality at this position const char *vcfInfoCigar = "CIGAR"; // cigar string describing how to align an // alternate allele to the reference allele const char *vcfInfoIsDbSnp = "DB"; // dbSNP membership const char *vcfInfoDepth = "DP"; // combined depth across samples, e.g. DP=154 @@ -479,68 +481,99 @@ { initVcfSpecInfoDefs(); initVcfSpecGtFormatDefs(); if (lf == NULL) return NULL; struct vcfFile *vcff = vcfFileNew(); vcff->lf = lf; vcff->fileOrUrl = vcfFileCloneStr(vcff, lf->fileName); vcff->maxErr = (maxErr < 0) ? INT_MAX : maxErr; // keep a hash of the INFO keys vcff->infoDefHash = hashNew(0); struct dyString *dyHeader = dyStringNew(1024); char *line = NULL; -// First, metadata lines beginning with "##": +char *tabixHeaderLine = NULL; +// First, metadata lines beginning with "##". +// htslib >= 1.21 tbx_readrec strips meta_char lines, so the tabix iterator +// never returns the VCF header. Read it directly off the htsFile; the +// HTS_IDX_REST iterator created in lineFileTabixAndIndexMayOpen has +// curr_off == 0 and will resume from the bgzf position past the header +// (or lineFileSetTabixRegion will reseek for region queries). +if (lf->tabix != NULL) + { + kstring_t str = KS_INITIALIZE; + while (hts_getline((htsFile *)lf->htsFile, '\n', &str) >= 0) + { + if (!startsWith("##", str.s)) + { + tabixHeaderLine = cloneString(str.s); + line = tabixHeaderLine; + break; + } + dyStringAppend(dyHeader, str.s); + dyStringAppendC(dyHeader, '\n'); + parseMetadataLine(vcff, str.s); + } + free(str.s); + } +else + { while (lineFileNext(lf, &line, NULL) && startsWith("##", line)) { dyStringAppend(dyHeader, line); dyStringAppendC(dyHeader, '\n'); parseMetadataLine(vcff, line); } + } slReverse(&(vcff->infoDefs)); slReverse(&(vcff->filterDefs)); slReverse(&(vcff->gtFormatDefs)); // Did we get the bare minimum VCF header with supported version? if (vcff->majorVersion == 0) { vcfFileWarn(vcff, "missing ##fileformat= header line? Assuming 4.1."); vcff->majorVersion = 4; vcff->minorVersion = 1; } if ((vcff->majorVersion != 4 || vcff->minorVersion < 0 || vcff->minorVersion > 3) && (vcff->majorVersion != 3)) vcfFileErr(vcff, "VCFv%d.%d not supported -- only v3.*, v4.0 - v4.3", vcff->majorVersion, vcff->minorVersion); // Next, one header line beginning with single "#" that names the columns: if (line == NULL) // EOF after metadata return vcff; char headerLineBuf[256]; if (line[0] != '#') { + // For tabix lf the line came from a private hts_getline read, not the lineFile + // buffer, so lineFileReuse would push back stale state. The iterator will reseek + // before any data iteration, so the consumed line is not lost in practice. + if (lf->tabix == NULL) lineFileReuse(lf); vcfFileWarn(vcff, "Expected to find # followed by column names (\"#CHROM POS ...\"), " "assuming default VCF 4.1 columns"); safef(headerLineBuf, sizeof(headerLineBuf), "%s", vcfDefaultHeader); line = headerLineBuf; } dyStringAppend(dyHeader, line); dyStringAppendC(dyHeader, '\n'); parseColumnHeaderRow(vcff, line); vcff->headerString = dyStringCannibalize(&dyHeader); +freez(&tabixHeaderLine); return vcff; } struct vcfFile *vcfFileFromHeader(char *name, char *headerString, int maxErr) /* Parse the VCF header string into a vcfFile object with no rows. * name is for error reporting. * If maxErr is non-negative then continue to parse until maxErr+1 errors have been found. * A maxErr less than zero does not stop and reports all errors. * Set maxErr to VCF_IGNORE_ERRS for silence. */ { struct lineFile *lf = lineFileOnString(name, TRUE, cloneString(headerString)); return vcfFileHeaderFromLineFile(lf, maxErr); } #define VCF_MAX_INFO (4*4096)