19eaeb4d65c6962e516fb23474643ba754c32ea1
angie
  Fri Feb 11 10:52:15 2011 -0800
Feature #2821 (VCF parser): works on a 1000 Genomes pilot release VCF+tabix file with genotypes.[Note: this is a squash of 6 commits, developed off in my vcf branch.]

diff --git src/inc/vcf.h src/inc/vcf.h
new file mode 100644
index 0000000..cb1306c
--- /dev/null
+++ src/inc/vcf.h
@@ -0,0 +1,181 @@
+/* VCF: Variant Call Format, version 4.0 / 4.1
+ * http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-40
+ * http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41
+ * The vcfFile object borrows many memory handling and error reporting tricks from MarkD's
+ * gff3File; any local deficiencies are not to reflect poorly on Mark's fine work! :) */
+
+#ifndef vcf_h
+#define vcf_h
+
+#include "hash.h"
+#include "linefile.h"
+
+enum vcfInfoType
+/* VCF header defines INFO column components; each component has one of these types: */
+    {
+    vcfInfoNoType,	// uninitialized value (0) or unrecognized type name
+    vcfInfoInteger,
+    vcfInfoFloat,
+    vcfInfoFlag,
+    vcfInfoCharacter,
+    vcfInfoString,
+    };
+
+union vcfDatum
+/* Container for a value whose type is specified by an enum vcfInfoType. */
+    {
+    int datInt;
+    double datFloat;
+    boolean datFlag;
+    char datChar;
+    char *datString;
+    };
+
+struct vcfInfoDef
+/* Definition of INFO column component from VCF header: */
+    {
+    struct vcfInfoDef *next;
+    char *key;			// A short identifier, e.g. MQ for mapping quality
+    int fieldCount;		// The number of values to follow the id, or -1 if it varies
+    enum vcfInfoType type;	// The type of values that follow the id
+    char *description;		// Brief description of info
+    };
+
+struct vcfInfoElement
+/* A single INFO column component; each row's INFO column may contain multiple components. */
+    {
+    char *key;			// An identifier described by a struct vcfInfoDef
+    int count;			// Number of data values following id
+    union vcfDatum *values;	// Array of data values following id
+    };
+
+struct vcfGenotype
+/* A single component of the optional GENOTYPE column. */
+    {
+    char *id;			// Name of individual/sample (pointer to vcfFile genotypeIds) or .
+    unsigned char hapIxA;	// Index of one haplotype's allele: 0=reference, 1=alt, 2=other alt
+    unsigned char hapIxB;	// Index of other haplotype's allele
+    bool isPhased;		// True if haplotypes are phased
+    bool isHaploid;		// True if there is only one haplotype (e.g. chrY)
+    int infoCount;		// Number of components named in FORMAT column
+    struct vcfInfoElement *infoElements;	// Array of info components
+    };
+
+struct vcfRecord
+/* A VCF data row (or list of rows). */
+{
+    struct vcfRecord *next;
+    char *chrom;		// Reference assembly sequence name
+    unsigned int chromStart;	// Start offset in chrom
+    unsigned int chromEnd;	// End offset in chrom
+    char *name;			// Variant name from ID column
+    char *ref;			// Allele found in reference assembly
+    char *alt;			// Alternate allele(s)
+    float qual;			// Phred-scaled score, i.e. -10log_10 P(call in ALT is wrong)
+    char *filter;		// Either PASS or code(s) described in header for failed filters
+    int infoCount;		// Number of components of INFO column
+    struct vcfInfoElement *infoElements;	// Array of INFO column components
+    char *format;		// Optional column containing ordered list of genotype components
+    char **genotypeUnparsedStrings;	// Array of unparsed optional genotype columns
+    struct vcfGenotype *genotypes;	// If built, array of parsed genotype components
+    struct vcfFile *file;	// Pointer back to parent vcfFile
+};
+
+struct vcfFile
+/* Info extracted from a VCF file.  Manages all memory for contents.
+ * Clearly borrowing structure from MarkD's gff3File. :) */
+{
+    char *fileOrUrl;		// VCF local file path or URL
+    struct hash *metaDataHash;	// Store all header metadata lines here
+    int majorVersion;		// 4 etc.
+    int minorVersion;		// 0, 1 etc.
+    struct vcfInfoDef *infoDefs;	// Header's definitions of INFO column components
+    struct vcfInfoDef *filterDefs;	// Header's definitions of FILTER column failure codes
+    struct vcfInfoDef *altDefs;	// Header's defs of symbolic alternate alleles (e.g. DEL, INS)
+    struct vcfInfoDef *gtFormatDefs;	// Header's defs of GENOTYPE compnts. listed in FORMAT col.
+    int genotypeCount;		// Number of optional genotype columns described in header
+    char **genotypeIds;		// Array of optional genotype column names described in header
+    struct vcfRecord *records;	// VCF data rows, sorted by position
+    struct hash *byName;		// Hash records by name -- not populated until needed.
+    struct hash *pool;		// Used to allocate string values that tend to
+				// be repeated in the files.  hash's localMem is also 
+				// use to allocated memory for all other objects.
+    struct lineFile *lf;	// Used only during parsing
+    FILE *errFh;		// Write errors to this file
+    int maxErr;			// Maximum number of errors before aborting
+    int errCnt;			// Error count
+};
+
+/* Reserved but optional INFO keys: */
+extern const char *vcfInfoAncestralAllele;
+extern const char *vcfInfoPerAlleleGtCount;	// allele count in genotypes, for each ALT allele,
+						// in the same order as listed
+extern const char *vcfInfoAlleleFrequency;  	// allele frequency for each ALT allele in the same
+						// order as listed: use this when estimated from
+						// primary data, not called genotypes
+extern const char *vcfInfoNumAlleles;		// total number of alleles in called genotypes
+extern const char *vcfInfoBaseQuality;		// RMS base quality at this position
+extern const char *vcfInfoCigar;		// cigar string describing how to align an
+						// alternate allele to the reference allele
+extern const char *vcfInfoIsDbSnp;		// dbSNP membership
+extern const char *vcfInfoDepth;		// combined depth across samples, e.g. DP=154
+extern const char *vcfInfoEnd;			// end position of the variant described in this
+						// record (esp. for CNVs)
+extern const char *vcfInfoIsHapMap2;		// membership in hapmap2
+extern const char *vcfInfoIsHapMap3;		// membership in hapmap3
+extern const char *vcfInfoIs1000Genomes;	// membership in 1000 Genomes
+extern const char *vcfInfoMappingQuality;	// RMS mapping quality, e.g. MQ=52
+extern const char *vcfInfoMapQual0Count;	// number of MAPQ == 0 reads covering this record
+extern const char *vcfInfoNumSamples;		// Number of samples with data
+extern const char *vcfInfoStrandBias;		// strand bias at this position
+extern const char *vcfInfoIsSomatic;		// indicates that the record is a somatic mutation,
+						// for cancer genomics
+extern const char *vcfInfoIsValidated;		// validated by follow-up experiment
+
+/* Reserved but optional per-genotype keys: */
+extern const char *vcfGtGenotypes;	// numeric allele values separated by "/" (unphased)
+					// or "|" (phased). Allele values are 0 for
+					// reference allele, 1 for the first allele in ALT,
+					// 2 for the second allele in ALT and so on.
+extern const char *vcfGtDepth;		// read depth at this position for this sample
+extern const char *vcfGtFilter;		// analogous to variant's FILTER field
+extern const char *vcfGtLikelihoods;	// three floating point log10-scaled likelihoods for
+					// AA,AB,BB genotypes where A=ref and B=alt;
+					// not applicable if site is not biallelic.
+extern const char *vcfGtPhred;		// Phred-scaled genotype likelihoods rounded to closest int
+extern const char *vcfGtConditionalQual;	// Conditional genotype quality
+					// i.e. phred quality -10log_10 P(genotype call is wrong,
+					// conditioned on the site's being variant)
+extern const char *vcfGtHaplotypeQualities;	// Two phred qualities comma separated
+extern const char *vcfGtPhaseSet;	// Set of phased genotypes to which this genotype belongs
+extern const char *vcfGtPhasingQuality;	// Phred-scaled P(alleles ordered wrongly in heterozygote)
+extern const char *vcfGtExpectedAltAlleleCount;	// Typically used in association analyses
+
+struct vcfFile *vcfFileMayOpen(char *fileOrUrl, int maxErr, FILE *errFh);
+/* Parse a VCF file into a vcfFile object; return NULL if unable.
+ * If maxErr not zero, then continue to parse until this number of error have been reached.
+ * A maxErr less than zero does not stop and reports all errors. Write errors to errFh,
+ * if NULL, use stderr. */
+
+struct vcfFile *vcfTabixFileMayOpen(char *fileOrUrl, char *chrom, int start, int end,
+				    int maxErr, FILE *errFh);
+/* Parse header and rows within the given position range from a VCF file that has been
+ * compressed and indexed by tabix into a vcfFile object; return NULL if or if file has
+ * no items in range.
+ * If maxErr not zero, then continue to parse until this number of error have been reached.
+ * A maxErr less than zero does not stop and reports all errors. Write errors to errFh,
+ * if NULL, use stderr. */
+
+void vcfFileFree(struct vcfFile **g3fPtr);
+/* Free a vcfFile object. */
+
+const struct vcfRecord *vcfFileFindVariant(struct vcfFile *vcff, char *variantId);
+/* Return all records with name=variantId, or NULL if not found. */
+
+const struct vcfInfoElement *vcfRecordFindInfo(const struct vcfRecord *record, char *key);
+/* Find an INFO element, or NULL. */
+
+const struct vcfGenotype *vcfRecordFindGenotype(struct vcfRecord *record, char *sampleId);
+/* Find the genotype and associated info for the individual, or return NULL. */
+
+#endif // vcf_h