70d4208efa11f262ba5591198214c9ccdc6b54ae angie Mon Aug 22 22:24:05 2011 -0700 Feature #3707 (VCF+tabix support in hgTables):Basic hgTables support for VCF, modeled after BAM code. Tested all fields, selected fields, bed output; filter, intersection, schema on 3 flavors of vcfTabix: ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20101123/interim_phase1_release/ALL.chr17.phase1.projectConsensus.genotypes.vcf.gz ftp://ftp-trace.ncbi.nlm.nih.gov/1000genomes/ftp/release/20100804/AFR.dindel.20100804.sites.vcf.gz ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/v4.0/ByChromosomeNoGeno/00-All.vcf.gz diff --git src/lib/vcf.c src/lib/vcf.c index 8b610ba..35e834a 100644 --- src/lib/vcf.c +++ src/lib/vcf.c @@ -256,32 +256,30 @@ } static void parseMetadataLine(struct vcfFile *vcff, char *line) /* Parse a VCF header line beginning with "##" that defines a metadata. */ { char *ptr = line; if (ptr == NULL && !startsWith(ptr, "##")) errAbort("Bad line passed to parseMetadataLine"); ptr += 2; char *firstEq = strchr(ptr, '='); if (firstEq == NULL) { vcfFileErr(vcff, "Metadata line lacks '=': \"%s\"", line); return; } -// Every metadata line is saved here: -hashAddN(vcff->metaDataHash, ptr, (firstEq - ptr), vcfFileCloneStr(vcff, firstEq+1)); regmatch_t substrs[8]; // Some of the metadata lines are crucial for parsing the rest of the file: if (startsWith("##fileformat=", line) || startsWith("##format", line)) { if (regexMatchSubstr(line, fileformatRegex, substrs, ArraySize(substrs))) { // substrs[2] is major version #, substrs[3] is set only if there is a minor version, // and substrs[4] is the minor version #. vcff->majorVersion = atoi(line + substrs[2].rm_so); if (substrs[3].rm_so != -1) vcff->minorVersion = atoi(line + substrs[4].rm_so); } else vcfFileErr(vcff, "##fileformat line does not match expected pattern /%s/: \"%s\"", fileformatRegex, line); @@ -373,68 +371,75 @@ if (wordCount < 10) vcfFileErr(vcff, "FORMAT column is given, but no sample IDs for genotype columns...?"); vcff->genotypeCount = (wordCount - 9); vcff->genotypeIds = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(char *)); int i; for (i = 9; i < wordCount; i++) vcff->genotypeIds[i-9] = vcfFileCloneStr(vcff, words[i]); } } static struct vcfFile *vcfFileNew() /* Return a new, empty vcfFile object. */ { struct vcfFile *vcff = NULL; AllocVar(vcff); -vcff->metaDataHash = hashNew(0); vcff->pool = hashNew(0); return vcff; } static struct vcfFile *vcfFileHeaderFromLineFile(struct lineFile *lf, int maxErr) /* Parse a VCF file into a vcfFile object. If maxErr not zero, then * continue to parse until this number of error have been reached. A maxErr * less than zero does not stop and reports all errors. */ { initVcfSpecInfoDefs(); initVcfSpecGtFormatDefs(); if (lf == NULL) return NULL; struct vcfFile *vcff = vcfFileNew(); vcff->lf = lf; vcff->fileOrUrl = vcfFileCloneStr(vcff, lf->fileName); vcff->maxErr = (maxErr < 0) ? INT_MAX : maxErr; +struct dyString *dyHeader = dyStringNew(1024); char *line = NULL; // First, metadata lines beginning with "##": while (lineFileNext(lf, &line, NULL) && startsWith("##", line)) + { + dyStringAppend(dyHeader, line); + dyStringAppendC(dyHeader, '\n'); parseMetadataLine(vcff, line); + } slReverse(&(vcff->infoDefs)); slReverse(&(vcff->filterDefs)); slReverse(&(vcff->gtFormatDefs)); // Did we get the bare minimum VCF header with supported version? if (vcff->majorVersion == 0) vcfFileErr(vcff, "missing ##fileformat= header line? Assuming 4.1."); if ((vcff->majorVersion != 4 || (vcff->minorVersion != 0 && vcff->minorVersion != 1)) && (vcff->majorVersion != 3)) vcfFileErr(vcff, "VCFv%d.%d not supported -- only v3.*, v4.0 or v4.1", vcff->majorVersion, vcff->minorVersion); // Next, one header line beginning with single "#" that names the columns: if (line == NULL) // EOF after metadata return vcff; +dyStringAppend(dyHeader, line); +dyStringAppendC(dyHeader, '\n'); parseColumnHeaderRow(vcff, line); +vcff->headerString = dyStringCannibalize(&dyHeader); return vcff; } #define VCF_MAX_INFO 512 static void parseRefAndAlt(struct vcfFile *vcff, struct vcfRecord *record, char *ref, char *alt) /* Make an array of alleles, ref first, from the REF and comma-sep'd ALT columns. * Use the length of the reference sequence to set record->chromEnd. * Note: this trashes the alt argument, since this is expected to be its last use. */ { char *altAlleles[VCF_MAX_INFO]; int altCount = chopCommas(alt, altAlleles); record->alleleCount = 1 + altCount; record->alleles = vcfFileAlloc(vcff, record->alleleCount * sizeof(record->alleles[0])); @@ -658,33 +663,33 @@ return NULL; if (isNotEmpty(chrom) && start != end) { if (lineFileSetTabixRegion(lf, chrom, start, end)) vcfParseData(vcff); } return vcff; } void vcfFileFree(struct vcfFile **pVcff) /* Free a vcfFile object. */ { if (pVcff == NULL || *pVcff == NULL) return; struct vcfFile *vcff = *pVcff; +freez(&(vcff->headerString)); hashFree(&(vcff->pool)); hashFree(&(vcff->byName)); -hashFree(&(vcff->metaDataHash)); lineFileClose(&(vcff->lf)); freez(pVcff); } const struct vcfRecord *vcfFileFindVariant(struct vcfFile *vcff, char *variantId) /* Return all records with name=variantId, or NULL if not found. */ { struct vcfRecord *varList = NULL; if (vcff->byName == NULL) { vcff->byName = hashNew(0); struct vcfRecord *rec; for (rec = vcff->records; rec != NULL; rec = rec->next) { if (sameString (rec->name, variantId))