15ab88298277f4c822a77f667aa4d2f7def37b3c tdreszer Tue Apr 23 13:35:05 2013 -0700 Changes in preparation for checking in haplotypes code. diff --git src/inc/vcfBits.h src/inc/vcfBits.h index 774cec7..1a91fd3 100644 --- src/inc/vcfBits.h +++ src/inc/vcfBits.h @@ -1,71 +1,107 @@ /* vcfBits.c/.h: Variant Call Format, analysis by bit maps. * The routines found here are dependent upon vcf.c/.h for accessing vcf records. * They allow analysis of a set of vcf records by bit maps with one bit map per variant * location and where each haplotype covered by the vcf record is represented by a single * bit (or pair of bits). Additional analysis can be performed by creating haplotype based * bit maps from variant bit maps. There is one haplotype bit map for each haplotype * (subject chromosome) with one (or two) bits for each variant location in the set of records. */ #ifndef vcfBits_h #define vcfBits_h #include "vcf.h" #include "bits.h" #include "elmTree.h" +enum variantType +/* VCF header defines INFO column components; each component has one of these types: */ + { + vtNoType = 0, // uninitialized value (0) or unrecognized type name + vtSNP = 1, // Single Nucleotide polymorphism (most) + vtInsertion = 2, // Insertion relative to reference genome + vtDeletion = 3, // Insertion relative to reference genome + }; + struct variantBits // all genotypes/haplotypes/alleles for one record are converted to a bit map // One struct per variant record in vcff->records. One slot per genotype containing // 2 slots for haplotypes and then 1 or 2 bits per allele. { struct variantBits *next; struct vcfRecord *record; // keep track of record for later interpretation + enum variantType vType; // Might as well know this up front int genotypeSlots; // subjects covered in vcf file unsigned char haplotypeSlots; // 2 unless haploid or homozygous only unsigned char alleleSlots; // 1 for 1 alt allele, 2 for 2 or 3 alt alleles >3 unsupported int bitsOn; // count of bits on. Bits *bits; // allele bits genotype x haplotype x allele Bits *unphased; // unphased bits (1 bit per genotype) if requested, else NULL void **variants; // special purposes array of variants filled and used by caller }; #define genoIxFromGenoHapIx(vBits,genoHaploIx) (genoHaploIx / vBits->haplotypeSlots) #define hapIxFromGenoHapIx(vBits,genoHaploIx) (genoHaploIx % vBits->haplotypeSlots) #define genoHapIx(vBits,genoIx,hapIx) ((genoIx * vBits->haplotypeSlots) + hapIx) #define vBitsSlot(vBits,genoIx,hapIx,variantIx) \ ( (genoHapIx(vBits,genoIx,hapIx) * vBits->alleleSlots) + variantIx) #define vBitsSlotCount(vBits) \ ((vBits)->genotypeSlots * (vBits)->haplotypeSlots * (vBits)->alleleSlots) +#define vcfBitsSubjectCount(vBits) ((vBits)->genotypeSlots) struct variantBits *vcfRecordsToVariantBits(struct vcfFile *vcff, struct vcfRecord *records, boolean phasedOnly, boolean homozygousOnly, boolean unphasedBits); // Returns list of bit arrays covering all genotypes/haplotype/alleles per record for each record // provided. If records is NULL will use vcff->records. Bit map has one slot per genotype // containing 2 slots for haplotypes and 1 or 2 bits per allele. The normal (simple) case of // 1 reference and 1 alternate allele results in 1 allele bit with 0:ref. Two or three alt alleles // is represented by two bits per allele (>3 non-reference alleles unsupported). // If phasedOnly, unphased haplotype bits will be set only if both agree (00 is uninterpretable) // Haploid genotypes (e.g. chrY) and homozygousOnly bitmaps contain 1 haplotype slot. // If unphasedBits, then vBits->unphased will contain a bitmap with 1s for all unphased genotypes. // NOTE: allocated from vcff pool, so closing file or flushing reusePool will invalidate this. -int vcfVariantBitsDropSparse(struct variantBits **vBitsList, int haploGenomeMin); -// Drops vBits found in less than a minimum number of haplotype genomes. +Bits *vcfRecordHaploidBits(struct vcfFile *vcff, struct vcfRecord *record); +// Returns array (1 bit per genotype) for each haploid genotype. +// This is useful for interpreting chrX. + +int vBitsHaploidCount(struct variantBits *vBits); +// Returns the number of subjects in the VCF dataset that are haploid at this location + +//#define vBitsSubjectChromCount(vBits) ((vBits)->genotypeSlots * (vBits)->haplotypeSlots) +int vBitsSubjectChromCount(struct variantBits *vBits); +// Returns the chromosomes in the VCF dataset that are covered at this location + +int vcfVariantBitsDropSparse(struct variantBits **vBitsList, int haploGenomeMin, + boolean dropRefErrors); +// Drops vBits found in less than a minimum number of haplotype genomes. Supplying 1 will +// drop variants found in no haplotype genomes. Declaring dropRefErrors will drop variants +// in all haplotype genomes (variants where reference is wrong). // Returns count of vBits structure that were dropped. +int vcfVariantBitsAlleleOccurs(struct variantBits *vBits, unsigned char alleleIx, + boolean homozygousOrHaploid); +// Counts the number of times a particular allele occurs in the subjects*chroms covered. +// If homozygousOrHaploid then the allele must occur in both chroms to be counted + +int vcfVariantBitsReferenceOccurs(struct vcfFile *vcff, struct variantBits *vBitsList, + boolean homozygousOrHaploid); +// For an entire list of vBits structs, counts the times the reference allele occurs. +// If homozygousOrHaploid then the reference allele must occur in both chroms to be counted + + int vcfVariantMostPopularCmp(const void *va, const void *vb); // Compare to sort variantBits based upon how many genomes/chrom has the variant // This can be used to build haploBits in most populous order for tree building struct haploBits // all variants/haplotypes/genotypes for a set of records are converted to a bit map // One struct per haplotype genome covering vcff->records. One slot per variant // and 1 or 2 bits per allele. NOTE: variant slots will all be normalized to max. { struct haploBits *next; char *ids; // comma separated lists of genotype names and haplotypes int haploGenomes; // count of haploid genomes this structure covers int genomeIx; // genome sample index (allows later lookups) unsigned char haploidIx; // haploid index [0,1] (allows later lookups) int variantSlots; // count of variants covered in set of vcf records