752bae9723eeeabb02d950157ab0f9512fd689b3
tdreszer
  Mon Jan 28 15:05:59 2013 -0800
Checking in 'elmTree' which uses neighbor joining to build a tree in local memory.  Also checking in 2 sets of changes to vcf.  First, added support for a 'reuse' local memory pool which aid traversing giant files.  Second, added routines for bitmap based analysis of variants.
diff --git src/inc/vcf.h src/inc/vcf.h
index 16e7f3c..2d7a826 100644
--- src/inc/vcf.h
+++ src/inc/vcf.h
@@ -1,27 +1,30 @@
 /* VCF: Variant Call Format, version 4.0 / 4.1
  * http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-40
  * http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41
  * The vcfFile object borrows many memory handling and error reporting tricks from MarkD's
  * gff3File; any local deficiencies are not to reflect poorly on Mark's fine work! :) */
 
 #ifndef vcf_h
 #define vcf_h
 
+#include "limits.h"
 #include "hash.h"
 #include "linefile.h"
 #include "asParse.h"
+#include "bits.h"
+#include "elmTree.h"
 
 enum vcfInfoType
 /* VCF header defines INFO column components; each component has one of these types: */
     {
     vcfInfoNoType,	// uninitialized value (0) or unrecognized type name
     vcfInfoInteger,
     vcfInfoFloat,
     vcfInfoFlag,
     vcfInfoCharacter,
     vcfInfoString,
     };
 
 union vcfDatum
 /* Container for a value whose type is specified by an enum vcfInfoType. */
     {
@@ -91,32 +94,34 @@
  * Clearly borrowing structure from MarkD's gff3File. :) */
 {
     char *fileOrUrl;		// VCF local file path or URL
     char *headerString;		// Complete original header including newlines.
     int majorVersion;		// 4 etc.
     int minorVersion;		// 0, 1 etc.
     struct vcfInfoDef *infoDefs;	// Header's definitions of INFO column components
     struct vcfInfoDef *filterDefs;	// Header's definitions of FILTER column failure codes
     struct vcfInfoDef *altDefs;	// Header's defs of symbolic alternate alleles (e.g. DEL, INS)
     struct vcfInfoDef *gtFormatDefs;	// Header's defs of GENOTYPE compnts. listed in FORMAT col.
     int genotypeCount;		// Number of optional genotype columns described in header
     char **genotypeIds;		// Array of optional genotype column names described in header
     struct vcfRecord *records;	// VCF data rows, sorted by position
     struct hash *byName;		// Hash records by name -- not populated until needed.
     struct hash *pool;		// Used to allocate string values that tend to
-				// be repeated in the files.  hash's localMem is also
-				// use to allocated memory for all other objects.
+				// be repeated in the files.  hash's localMem is also used to
+				// allocated memory for all other objects (if recordPool null)
+    struct lm *reusePool;       // If created with vcfFileMakeReusePool, non-shared record data is
+                                // allocated from this pool. Useful when walking through huge files.
     struct lineFile *lf;	// Used only during parsing
     int maxErr;			// Maximum number of errors before aborting
     int errCnt;			// Error count
 };
 
 /* Reserved but optional INFO keys: */
 extern const char *vcfInfoAncestralAllele;
 extern const char *vcfInfoPerAlleleGtCount;	// allele count in genotypes, for each ALT allele,
 						// in the same order as listed
 extern const char *vcfInfoAlleleFrequency;  	// allele frequency for each ALT allele in the same
 						// order as listed: use this when estimated from
 						// primary data, not called genotypes
 extern const char *vcfInfoNumAlleles;		// total number of alleles in called genotypes
 extern const char *vcfInfoBaseQuality;		// RMS base quality at this position
 extern const char *vcfInfoCigar;		// cigar string describing how to align an
@@ -169,45 +174,75 @@
     case vcfInfoFlag:
 	fprintf(f, "%s", datum.datString); // Flags could have values in older VCF
 	break;
     case vcfInfoCharacter:
 	fprintf(f, "%c", datum.datChar);
 	break;
     case vcfInfoString:
 	fprintf(f, "%s", datum.datString);
 	break;
     default:
 	errAbort("vcfPrintDatum: Unrecognized type %d", type);
 	break;
     }
 }
 
+#define VCF_IGNORE_ERRS (INT_MAX - 1)
+
 struct vcfFile *vcfFileMayOpen(char *fileOrUrl, int maxErr, int maxRecords, boolean parseAll);
 /* Open fileOrUrl and parse VCF header; return NULL if unable.
  * If parseAll, then read in all lines, parse and store in
  * vcff->records; if maxErr >= zero, then continue to parse until
  * there are maxErr+1 errors.  A maxErr less than zero does not stop
- * and reports all errors. */
+ * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence. */
 
 struct vcfFile *vcfTabixFileMayOpen(char *fileOrUrl, char *chrom, int start, int end,
 				    int maxErr, int maxRecords);
 /* Open a VCF file that has been compressed and indexed by tabix and
  * parse VCF header, or return NULL if unable.  If chrom is non-NULL,
  * seek to the position range and parse all lines in range into
  * vcff->records.  If maxErr >= zero, then continue to parse until
  * there are maxErr+1 errors.  A maxErr less than zero does not stop
- * and reports all errors. */
+ * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence. */
+
+int vcfTabixBatchRead(struct vcfFile *vcff, char *chrom, int start, int end,
+                      int maxErr, int maxRecords);
+// Reads a batch of records from an opened and indexed VCF file, returning number
+// of records in batch.  Seeks to the start position and parses all lines in range,
+// adding them to vcff->records.  Note: vcff->records will continue to be sorted,
+// even if batches are loaded out of order.  If maxErr >= zero, then continue to
+// parse until there are maxErr+1 errors.  A maxErr less than zero does not stop
+// and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence.
+
+void vcfFileMakeReusePool(struct vcfFile *vcff, int initialSize);
+// Creates a separate memory pool for records.  Establishing this pool allows
+// using vcfFileFlushRecords to abandon previously read records and free
+// the associated memory. Very useful when reading an entire file in batches.
+#define vcfFileLm(vcff) ((vcff)->reusePool ? (vcff)->reusePool : (vcff)->pool->lm)
+
+void vcfFileAbandonReusePool(struct vcfFile *vcff);
+// Abandons all previously allocated data from the reuse pool and reverts to
+// common pool. The vcf->records set will also be abandoned as pointers are invalid.
+// USE WITH CAUTION.  All previously allocated pointers from this pool are now invalid.
+
+void vcfFileFlushRecords(struct vcfFile *vcff);
+// Abandons all previously read vcff->records and flushes the reuse pool (if it exists).
+// USE WITH CAUTION.  All previously allocated record pointers are now invalid.
+
+struct vcfRecord *vcfNextRecord(struct vcfFile *vcff);
+/* Parse the words in the next line from vcff into a vcfRecord. Return NULL at end of file.
+ * Note: this does not store record in vcff->records! */
 
 struct vcfRecord *vcfRecordFromRow(struct vcfFile *vcff, char **words);
 /* Parse words from a VCF data line into a VCF record structure. */
 
 unsigned int vcfRecordTrimIndelLeftBase(struct vcfRecord *rec);
 /* For indels, VCF includes the left neighboring base; for example, if the alleles are
  * AA/- following a G base, then the VCF record will start one base to the left and have
  * "GAA" and "G" as the alleles.  That is not nice for display for two reasons:
  * 1. Indels appear one base wider than their dbSNP entries.
  * 2. In pgSnp display mode, the two alleles are always the same color.
  * However, for hgTracks' mapBox we need the correct chromStart for identifying the
  * record in hgc -- so return the original chromStart. */
 
 void vcfFileFree(struct vcfFile **vcffPtr);
 /* Free a vcfFile object. */
@@ -229,16 +264,124 @@
 /* Find the genotype and associated info for the individual, or return NULL.
  * This calls vcfParseGenotypes if it has not already been called. */
 
 struct vcfInfoDef *vcfInfoDefForGtKey(struct vcfFile *vcff, const char *key);
 /* Look up the type of genotype FORMAT component key, in the definitions from the header,
  * and failing that, from the keys reserved in the spec. */
 
 char *vcfFilePooledStr(struct vcfFile *vcff, char *str);
 /* Allocate memory for a string from vcff's shared string pool. */
 
 #define VCF_NUM_COLS 10
 
 struct asObject *vcfAsObj();
 // Return asObject describing fields of VCF
 
+
+// - - - - - - Support for bit map based analysis of variants - - - - - -
+struct variantBits
+// all genotypes/haplotypes/alleles for one record are converted to a bit map
+// One struct per variant record in vcff->records.  One slot per genotype containing
+// 2 slots for haplotypes and then 1 or 2 bits per allele.
+    {
+    struct variantBits *next;
+    struct vcfRecord *record;     // keep track of record for later interpretation
+    int genotypeSlots;            // subjects covered in vcf file
+    unsigned char haplotypeSlots; // 2 unless haploid or homozygous only
+    unsigned char alleleSlots;    // 1 for 1 alt allele, 2 for 2 or 3 alt alleles >3 unsupported
+    int bitsOn;                   // count of bits on.
+    Bits *bits;                   // allele bits genotype x haplotype x allele
+    Bits *unphased;               // unphased bits (1 bit per genotype) if requested, else NULL
+    void **variants;              // special purposes array of variants filled and used by caller
+    };
+
+#define genoIxFromGenoHapIx(vBits,genoHaploIx)  (genoHaploIx / vBits->haplotypeSlots)
+#define hapIxFromGenoHapIx(vBits,genoHaploIx)   (genoHaploIx % vBits->haplotypeSlots)
+#define genoHapIx(vBits,genoIx,hapIx)          ((genoIx * vBits->haplotypeSlots) + hapIx)
+#define vBitsSlot(vBits,genoIx,hapIx,variantIx) \
+        ( (genoHapIx(vBits,genoIx,hapIx) * vBits->alleleSlots) + variantIx)
+#define vBitsSlotCount(vBits) \
+        ((vBits)->genotypeSlots * (vBits)->haplotypeSlots * (vBits)->alleleSlots)
+
+struct variantBits *vcfRecordsToVariantBits(struct vcfFile *vcff, struct vcfRecord *records,
+                                            boolean phasedOnly, boolean homozygousOnly,
+                                            boolean unphasedBits);
+// Returns list of bit arrays covering all genotypes/haplotype/alleles per record for each record
+// provided.  If records is NULL will use vcff->records. Bit map has one slot per genotype
+// containing 2 slots for haplotypes and 1 or 2 bits per allele. The normal (simple) case of
+// 1 reference and 1 alternate allele results in 1 allele bit with 0:ref. Two or three alt alleles
+// is represented by two bits per allele (>3 non-reference alleles unsupported).
+// If phasedOnly, unphased haplotype bits will be set only if both agree (00 is uninterpretable)
+// Haploid genotypes (e.g. chrY) and homozygousOnly bitmaps contain 1 haplotype slot.
+// If unphasedBits, then vBits->unphased will contain a bitmap with 1s for all unphased genotypes.
+// NOTE: allocated from vcff pool, so closing file or flushing reusePool will invalidate this.
+
+int vcfVariantBitsDropSparse(struct variantBits **vBitsList, int haploGenomeMin);
+// Drops vBits found in less than a minimum number of haplotype genomes.
+// Returns count of vBits structure that were dropped.
+
+int vcfVariantMostPopularCmp(const void *va, const void *vb);
+// Compare to sort variantBits based upon how many genomes/chrom has the variant
+// This can be used to build haploBits in most populous order for tree building
+
+struct haploBits
+// all variants/haplotypes/genotypes for a set of records are converted to a bit map
+// One struct per haplotype genome covering vcff->records.  One slot per variant
+// and 1 or 2 bits per allele.  NOTE: variant slots will all be normalized to max.
+    {
+    struct haploBits *next;
+    char *ids;                 // comma separated lists of genotype names and haplotypes
+    int haploGenomes;          // count of haploid genomes this structure covers
+    int genomeIx;              // genome sample index (allows later lookups)
+    unsigned char haploidIx;   // haploid index [0,1] (allows later  lookups)
+    int variantSlots;          // count of variants covered in set of vcf records
+    unsigned char alleleSlots; // 1 for 1 alt allele, 2 for 2 or 3 alt alleles >3 unsupported
+    int bitsOn;                // count of bits on.
+    Bits *bits;                // allele bits variant x allele
+    };
+
+#define vcfRecordIxFromBitIx(hBits,bitIx)  (bitIx / hBits->alleleSlots)
+#define variantSlotFromBitIx(hBits,bitIx)  (vcfRecordIxFromBitIx(hBits,bitIx) * hBits->alleleSlots)
+#define variantNextFromBitIx(hBits,bitIx)  (variantSlotFromBitIx(hBits,bitIx) + hBits->alleleSlots)
+#define hBitsSlot(hBits,variantIx,alleleIx) ((hBits->alleleSlots * variantIx) + alleleIx)
+#define hBitsSlotCount(hBits) ((hBits)->variantSlots * (hBits)->alleleSlots)
+
+// An hBits struct is "Real" if it is generated from variants.  It may also be a subset.
+#define hBitsIsSubset(hBits)    ((hBits)->haploGenomes == 0)
+#define hBitsIsReal(hBits)      ((hBits)->haploGenomes >  0)
+
+struct haploBits *vcfVariantBitsToHaploBits(struct vcfFile *vcff, struct variantBits *vBitsList,
+                                            boolean ignoreReference);
+// Converts a set of variant bits to haplotype bits, resulting in one bit struct
+// per haplotype genome that has non-reference variations.  If ignoreReference, only
+// haplotype genomes with at lone non-reference variant are returned.
+// A haploBit array has one variant slot per vBit struct and one or more bits per allele.
+// NOTE: allocated from vcff pool, so closing file or flushing reusePool will invalidate this.
+
+int vcfHaploBitsListCollapseIdentical(struct vcfFile *vcff, struct haploBits **phBitsList,
+                                      int haploGenomeMin);
+// Collapses a list of haploBits based upon identical bit arrays.
+// If haploGenomeMin > 1, will drop all hBits structs covering less than N haploGenomes.
+// Returns count of hBit structs removed.
+
+INLINE struct variantBits *vcfHaploBitIxToVariantBits(struct haploBits *hBits, int bitIx,
+                                                      struct variantBits *vBitsList)
+// Returns appropriate vBits from vBits list associated with a given bit in an hBits struct.
+// Assumes vBitsList is in same order as hBits bit array.  Note vBits->record has full vcf details.
+{
+return slElementFromIx(vBitsList,vcfRecordIxFromBitIx(hBits,bitIx));
+}
+
+unsigned char vcfHaploBitsToVariantIx(struct haploBits *hBits,int bitIx);
+// Given a hBits struct and bitIx, what is the actual variant allele ix
+// to use when accessing the vcfRecord?
+
+enum elmNodeOverlap vcfHaploBitsCmp(const struct slList *elA, const struct slList *elB,
+                                    int *matchWeight, void *extra);
+// HaploBits compare routine for building tree of relations using elmTreeGrow().
+
+struct slList *vcfHaploBitsMatching(const struct slList *elA, const struct slList *elB,
+                                    void *extra);
+// Returns a HaploBits structure representing the common parts of elements A and B.
+// Used with elmTreeGrow() to create nodes that are the common parts between leaves/branches.
+
 #endif // vcf_h