491395482b817cfe82cbdef4dd7f8ee0723c1a4a
angie
  Wed Mar 23 21:01:49 2011 -0700
Feature #2822, #2823 (VCF customFactory + track handler):Added a new track type, vcfTabix, with handlers in hgTracks and hgc
and a customFactory.  It is a new bigDataUrl type of track; the
remote VCF file must be compressed and indexed by tabix, so like
BAM a separate index file is required.  If the VCF file has
genotypes, then each sample's two haplotypes are graphed in a
line, with one line per sample.  Otherwise, alleles and counts
(if available) are drawn using Belinda's pgSnp methods.
The source code has to be compiled with USE_TABIX=1 (which is
automatically set for us by common.mk when it finds the local
installation) in order for the CGIs to recognize the track type.

diff --git src/inc/vcf.h src/inc/vcf.h
index 76b9380..036a600 100644
--- src/inc/vcf.h
+++ src/inc/vcf.h
@@ -64,56 +64,56 @@
 struct vcfRecord
 /* A VCF data row (or list of rows). */
 {
     struct vcfRecord *next;
     char *chrom;		// Reference assembly sequence name
     unsigned int chromStart;	// Start offset in chrom
     unsigned int chromEnd;	// End offset in chrom
     char *name;			// Variant name from ID column
     char *ref;			// Allele found in reference assembly
     char *alt;			// Alternate allele(s)
     float qual;			// Phred-scaled score, i.e. -10log_10 P(call in ALT is wrong)
     char *filter;		// Either PASS or code(s) described in header for failed filters
     int infoCount;		// Number of components of INFO column
     struct vcfInfoElement *infoElements;	// Array of INFO column components
     char *format;		// Optional column containing ordered list of genotype components
-    char **genotypeUnparsedStrings;	// Array of unparsed optional genotype columns
-    struct vcfGenotype *genotypes;	// If built, array of parsed genotype components
+    char **genotypeUnparsedStrings;	// Temporary array of unparsed optional genotype columns
+    struct vcfGenotype *genotypes;	// If built, array of parsed genotype components;
+					// call vcfParseGenotypes(record) to build.
     struct vcfFile *file;	// Pointer back to parent vcfFile
 };
 
 struct vcfFile
 /* Info extracted from a VCF file.  Manages all memory for contents.
  * Clearly borrowing structure from MarkD's gff3File. :) */
 {
     char *fileOrUrl;		// VCF local file path or URL
     struct hash *metaDataHash;	// Store all header metadata lines here
     int majorVersion;		// 4 etc.
     int minorVersion;		// 0, 1 etc.
     struct vcfInfoDef *infoDefs;	// Header's definitions of INFO column components
     struct vcfInfoDef *filterDefs;	// Header's definitions of FILTER column failure codes
     struct vcfInfoDef *altDefs;	// Header's defs of symbolic alternate alleles (e.g. DEL, INS)
     struct vcfInfoDef *gtFormatDefs;	// Header's defs of GENOTYPE compnts. listed in FORMAT col.
     int genotypeCount;		// Number of optional genotype columns described in header
     char **genotypeIds;		// Array of optional genotype column names described in header
     struct vcfRecord *records;	// VCF data rows, sorted by position
     struct hash *byName;		// Hash records by name -- not populated until needed.
     struct hash *pool;		// Used to allocate string values that tend to
 				// be repeated in the files.  hash's localMem is also 
 				// use to allocated memory for all other objects.
     struct lineFile *lf;	// Used only during parsing
-    FILE *errFh;		// Write errors to this file
     int maxErr;			// Maximum number of errors before aborting
     int errCnt;			// Error count
 };
 
 /* Reserved but optional INFO keys: */
 extern const char *vcfInfoAncestralAllele;
 extern const char *vcfInfoPerAlleleGtCount;	// allele count in genotypes, for each ALT allele,
 						// in the same order as listed
 extern const char *vcfInfoAlleleFrequency;  	// allele frequency for each ALT allele in the same
 						// order as listed: use this when estimated from
 						// primary data, not called genotypes
 extern const char *vcfInfoNumAlleles;		// total number of alleles in called genotypes
 extern const char *vcfInfoBaseQuality;		// RMS base quality at this position
 extern const char *vcfInfoCigar;		// cigar string describing how to align an
 						// alternate allele to the reference allele
@@ -121,61 +121,97 @@
 extern const char *vcfInfoDepth;		// combined depth across samples, e.g. DP=154
 extern const char *vcfInfoEnd;			// end position of the variant described in this
 						// record (esp. for CNVs)
 extern const char *vcfInfoIsHapMap2;		// membership in hapmap2
 extern const char *vcfInfoIsHapMap3;		// membership in hapmap3
 extern const char *vcfInfoIs1000Genomes;	// membership in 1000 Genomes
 extern const char *vcfInfoMappingQuality;	// RMS mapping quality, e.g. MQ=52
 extern const char *vcfInfoMapQual0Count;	// number of MAPQ == 0 reads covering this record
 extern const char *vcfInfoNumSamples;		// Number of samples with data
 extern const char *vcfInfoStrandBias;		// strand bias at this position
 extern const char *vcfInfoIsSomatic;		// indicates that the record is a somatic mutation,
 						// for cancer genomics
 extern const char *vcfInfoIsValidated;		// validated by follow-up experiment
 
 /* Reserved but optional per-genotype keys: */
-extern const char *vcfGtGenotypes;	// numeric allele values separated by "/" (unphased)
+extern const char *vcfGtGenotype;	// numeric allele values separated by "/" (unphased)
 					// or "|" (phased). Allele values are 0 for
 					// reference allele, 1 for the first allele in ALT,
 					// 2 for the second allele in ALT and so on.
 extern const char *vcfGtDepth;		// read depth at this position for this sample
 extern const char *vcfGtFilter;		// analogous to variant's FILTER field
 extern const char *vcfGtLikelihoods;	// three floating point log10-scaled likelihoods for
 					// AA,AB,BB genotypes where A=ref and B=alt;
 					// not applicable if site is not biallelic.
 extern const char *vcfGtPhred;		// Phred-scaled genotype likelihoods rounded to closest int
 extern const char *vcfGtConditionalQual;	// Conditional genotype quality
 					// i.e. phred quality -10log_10 P(genotype call is wrong,
 					// conditioned on the site's being variant)
 extern const char *vcfGtHaplotypeQualities;	// Two phred qualities comma separated
 extern const char *vcfGtPhaseSet;	// Set of phased genotypes to which this genotype belongs
 extern const char *vcfGtPhasingQuality;	// Phred-scaled P(alleles ordered wrongly in heterozygote)
 extern const char *vcfGtExpectedAltAlleleCount;	// Typically used in association analyses
 
-struct vcfFile *vcfFileMayOpen(char *fileOrUrl, int maxErr, FILE *errFh);
+INLINE void vcfPrintDatum(FILE *f, const union vcfDatum datum, const enum vcfInfoType type)
+/* Print datum to f in the format determined by type. */
+{
+switch (type)
+    {
+    case vcfInfoInteger:
+	fprintf(f, "%d", datum.datInt);
+	break;
+    case vcfInfoFloat:
+	fprintf(f, "%f", datum.datFloat);
+	break;
+    case vcfInfoFlag:
+	fprintf(f, "%s", datum.datString); // Flags could have values in older VCF
+	break;
+    case vcfInfoCharacter:
+	fprintf(f, "%c", datum.datChar);
+	break;
+    case vcfInfoString:
+	fprintf(f, "%s", datum.datString);
+	break;
+    default:
+	errAbort("vcfPrintDatum: Unrecognized type %d", type);
+	break;
+    }
+}
+
+struct vcfFile *vcfFileMayOpen(char *fileOrUrl, int maxErr);
 /* Parse a VCF file into a vcfFile object; return NULL if unable.
  * If maxErr not zero, then continue to parse until this number of error have been reached.
- * A maxErr less than zero does not stop and reports all errors. Write errors to errFh,
- * if NULL, use stderr. */
+ * A maxErr less than zero does not stop and reports all errors. */
 
 struct vcfFile *vcfTabixFileMayOpen(char *fileOrUrl, char *chrom, int start, int end,
-				    int maxErr, FILE *errFh);
+				    int maxErr);
 /* Parse header and rows within the given position range from a VCF file that has been
  * compressed and indexed by tabix into a vcfFile object; return NULL if or if file has
  * no items in range.
  * If maxErr not zero, then continue to parse until this number of error have been reached.
- * A maxErr less than zero does not stop and reports all errors. Write errors to errFh,
- * if NULL, use stderr. */
+ * A maxErr less than zero does not stop and reports all errors. */
 
 void vcfFileFree(struct vcfFile **vcffPtr);
 /* Free a vcfFile object. */
 
 const struct vcfRecord *vcfFileFindVariant(struct vcfFile *vcff, char *variantId);
 /* Return all records with name=variantId, or NULL if not found. */
 
 const struct vcfInfoElement *vcfRecordFindInfo(const struct vcfRecord *record, char *key);
 /* Find an INFO element, or NULL. */
 
+struct vcfInfoDef *vcfInfoDefForKey(struct vcfFile *vcff, const char *key);
+/* Return infoDef for key, or NULL if it wasn't specified in the header or VCF spec. */
+
+void vcfParseGenotypes(struct vcfRecord *record);
+/* Translate record->genotypesUnparsedStrings[] into proper struct vcfGenotype[].
+ * This destroys genotypesUnparsedStrings. */
+
 const struct vcfGenotype *vcfRecordFindGenotype(struct vcfRecord *record, char *sampleId);
-/* Find the genotype and associated info for the individual, or return NULL. */
+/* Find the genotype and associated info for the individual, or return NULL.
+ * This calls vcfParseGenotypes if it has not already been called. */
+
+struct vcfInfoDef *vcfInfoDefForGtKey(struct vcfFile *vcff, const char *key);
+/* Look up the type of genotype FORMAT component key, in the definitions from the header,
+ * and failing that, from the keys reserved in the spec. */
 
 #endif // vcf_h