716be62e7d41dc55f37642a85c7868bc6023c470 tdreszer Tue Feb 5 18:24:43 2013 -0800 Split vcfBits.c/.h off from vcf.c/.h. No code changes made. diff --git src/inc/vcf.h src/inc/vcf.h index 28c14a6..8d7f14d 100644 --- src/inc/vcf.h +++ src/inc/vcf.h @@ -1,30 +1,28 @@ /* VCF: Variant Call Format, version 4.0 / 4.1 * http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-40 * http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41 * The vcfFile object borrows many memory handling and error reporting tricks from MarkD's * gff3File; any local deficiencies are not to reflect poorly on Mark's fine work! :) */ #ifndef vcf_h #define vcf_h #include "limits.h" #include "hash.h" #include "linefile.h" #include "asParse.h" -#include "bits.h" -#include "elmTree.h" enum vcfInfoType /* VCF header defines INFO column components; each component has one of these types: */ { vcfInfoNoType, // uninitialized value (0) or unrecognized type name vcfInfoInteger, vcfInfoFloat, vcfInfoFlag, vcfInfoCharacter, vcfInfoString, }; union vcfDatum /* Container for a value whose type is specified by an enum vcfInfoType. */ { @@ -261,124 +259,16 @@ /* Find the genotype and associated info for the individual, or return NULL. * This calls vcfParseGenotypes if it has not already been called. */ struct vcfInfoDef *vcfInfoDefForGtKey(struct vcfFile *vcff, const char *key); /* Look up the type of genotype FORMAT component key, in the definitions from the header, * and failing that, from the keys reserved in the spec. */ char *vcfFilePooledStr(struct vcfFile *vcff, char *str); /* Allocate memory for a string from vcff's shared string pool. */ #define VCF_NUM_COLS 10 struct asObject *vcfAsObj(); // Return asObject describing fields of VCF - -// - - - - - - Support for bit map based analysis of variants - - - - - - -struct variantBits -// all genotypes/haplotypes/alleles for one record are converted to a bit map -// One struct per variant record in vcff->records. One slot per genotype containing -// 2 slots for haplotypes and then 1 or 2 bits per allele. - { - struct variantBits *next; - struct vcfRecord *record; // keep track of record for later interpretation - int genotypeSlots; // subjects covered in vcf file - unsigned char haplotypeSlots; // 2 unless haploid or homozygous only - unsigned char alleleSlots; // 1 for 1 alt allele, 2 for 2 or 3 alt alleles >3 unsupported - int bitsOn; // count of bits on. - Bits *bits; // allele bits genotype x haplotype x allele - Bits *unphased; // unphased bits (1 bit per genotype) if requested, else NULL - void **variants; // special purposes array of variants filled and used by caller - }; - -#define genoIxFromGenoHapIx(vBits,genoHaploIx) (genoHaploIx / vBits->haplotypeSlots) -#define hapIxFromGenoHapIx(vBits,genoHaploIx) (genoHaploIx % vBits->haplotypeSlots) -#define genoHapIx(vBits,genoIx,hapIx) ((genoIx * vBits->haplotypeSlots) + hapIx) -#define vBitsSlot(vBits,genoIx,hapIx,variantIx) \ - ( (genoHapIx(vBits,genoIx,hapIx) * vBits->alleleSlots) + variantIx) -#define vBitsSlotCount(vBits) \ - ((vBits)->genotypeSlots * (vBits)->haplotypeSlots * (vBits)->alleleSlots) - -struct variantBits *vcfRecordsToVariantBits(struct vcfFile *vcff, struct vcfRecord *records, - boolean phasedOnly, boolean homozygousOnly, - boolean unphasedBits); -// Returns list of bit arrays covering all genotypes/haplotype/alleles per record for each record -// provided. If records is NULL will use vcff->records. Bit map has one slot per genotype -// containing 2 slots for haplotypes and 1 or 2 bits per allele. The normal (simple) case of -// 1 reference and 1 alternate allele results in 1 allele bit with 0:ref. Two or three alt alleles -// is represented by two bits per allele (>3 non-reference alleles unsupported). -// If phasedOnly, unphased haplotype bits will be set only if both agree (00 is uninterpretable) -// Haploid genotypes (e.g. chrY) and homozygousOnly bitmaps contain 1 haplotype slot. -// If unphasedBits, then vBits->unphased will contain a bitmap with 1s for all unphased genotypes. -// NOTE: allocated from vcff pool, so closing file or flushing reusePool will invalidate this. - -int vcfVariantBitsDropSparse(struct variantBits **vBitsList, int haploGenomeMin); -// Drops vBits found in less than a minimum number of haplotype genomes. -// Returns count of vBits structure that were dropped. - -int vcfVariantMostPopularCmp(const void *va, const void *vb); -// Compare to sort variantBits based upon how many genomes/chrom has the variant -// This can be used to build haploBits in most populous order for tree building - -struct haploBits -// all variants/haplotypes/genotypes for a set of records are converted to a bit map -// One struct per haplotype genome covering vcff->records. One slot per variant -// and 1 or 2 bits per allele. NOTE: variant slots will all be normalized to max. - { - struct haploBits *next; - char *ids; // comma separated lists of genotype names and haplotypes - int haploGenomes; // count of haploid genomes this structure covers - int genomeIx; // genome sample index (allows later lookups) - unsigned char haploidIx; // haploid index [0,1] (allows later lookups) - int variantSlots; // count of variants covered in set of vcf records - unsigned char alleleSlots; // 1 for 1 alt allele, 2 for 2 or 3 alt alleles >3 unsupported - int bitsOn; // count of bits on. - Bits *bits; // allele bits variant x allele - }; - -#define vcfRecordIxFromBitIx(hBits,bitIx) (bitIx / hBits->alleleSlots) -#define variantSlotFromBitIx(hBits,bitIx) (vcfRecordIxFromBitIx(hBits,bitIx) * hBits->alleleSlots) -#define variantNextFromBitIx(hBits,bitIx) (variantSlotFromBitIx(hBits,bitIx) + hBits->alleleSlots) -#define hBitsSlot(hBits,variantIx,alleleIx) ((hBits->alleleSlots * variantIx) + alleleIx) -#define hBitsSlotCount(hBits) ((hBits)->variantSlots * (hBits)->alleleSlots) - -// An hBits struct is "Real" if it is generated from variants. It may also be a subset. -#define hBitsIsSubset(hBits) ((hBits)->haploGenomes == 0) -#define hBitsIsReal(hBits) ((hBits)->haploGenomes > 0) - -struct haploBits *vcfVariantBitsToHaploBits(struct vcfFile *vcff, struct variantBits *vBitsList, - boolean ignoreReference); -// Converts a set of variant bits to haplotype bits, resulting in one bit struct -// per haplotype genome that has non-reference variations. If ignoreReference, only -// haplotype genomes with at lone non-reference variant are returned. -// A haploBit array has one variant slot per vBit struct and one or more bits per allele. -// NOTE: allocated from vcff pool, so closing file or flushing reusePool will invalidate this. - -int vcfHaploBitsListCollapseIdentical(struct vcfFile *vcff, struct haploBits **phBitsList, - int haploGenomeMin); -// Collapses a list of haploBits based upon identical bit arrays. -// If haploGenomeMin > 1, will drop all hBits structs covering less than N haploGenomes. -// Returns count of hBit structs removed. - -INLINE struct variantBits *vcfHaploBitIxToVariantBits(struct haploBits *hBits, int bitIx, - struct variantBits *vBitsList) -// Returns appropriate vBits from vBits list associated with a given bit in an hBits struct. -// Assumes vBitsList is in same order as hBits bit array. Note vBits->record has full vcf details. -{ -return slElementFromIx(vBitsList,vcfRecordIxFromBitIx(hBits,bitIx)); -} - -unsigned char vcfHaploBitsToVariantAlleleIx(struct haploBits *hBits,int bitIx); -// Given a hBits struct and bitIx, what is the actual variant allele ix -// to use when accessing the vcfRecord? - -enum elmNodeOverlap vcfHaploBitsCmp(const struct slList *elA, const struct slList *elB, - int *matchWeight, void *extra); -// HaploBits compare routine for building tree of relations using elmTreeGrow(). - -struct slList *vcfHaploBitsMatching(const struct slList *elA, const struct slList *elB, - void *extra); -// Returns a HaploBits structure representing the common parts of elements A and B. -// Used with elmTreeGrow() to create nodes that are the common parts between leaves/branches. - #endif // vcf_h