716be62e7d41dc55f37642a85c7868bc6023c470
tdreszer
  Tue Feb 5 18:24:43 2013 -0800
Split vcfBits.c/.h off from vcf.c/.h.  No code changes made.
diff --git src/lib/vcf.c src/lib/vcf.c
index 3687fbb..ced7d64 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -1069,465 +1069,15 @@
                                   "semicolon-separated list of codes for filters that fail\""
         "    string info;       \"Additional information encoded as a semicolon-separated series "
                                  "of short keys with optional comma-separated values\""
         "    string format;     \"If genotype columns are specified in header, a "
                                  "semicolon-separated list of of short keys starting with GT\""
         "    string genotypes;  \"If genotype columns are specified in header, a tab-separated "
                                  "set of genotype column values; each value is a colon-separated "
                                  "list of values corresponding to keys in the format column\""
         "    )";
 
 struct asObject *vcfAsObj()
 // Return asObject describing fields of VCF
 {
 return asParseText(vcfDataLineAutoSqlString);
 }
-
-// - - - - - - Support for bit map based analysis of variants - - - - - -
-
-static struct variantBits *vcfOneRecordToVariantBits(struct vcfFile *vcff,struct vcfRecord *record,
-                                                     boolean phasedOnly, boolean homozygousOnly)
-// Returns bit array covering all genotypes/haplotype/alleles per record.  One slot per genotype
-// containing 2 slots for haplotypes and 1 or 2 bits per allele. The normal (simple) case of
-// 1 reference and 1 alternate allele results in 1 allele bit with 0:ref. Two or three alt alleles
-// is represented by two bits per allele (>3 alternate alleles is unsupported).
-// If phasedOnly, unphased haplotype bits will all be set only if all agree (00 is uninterpretable)
-// Haploid genotypes (e.g. chrY) and homozygousOnly bitmaps contain 1 haplotype slot.
-{
-// parse genotypes if needed
-if (record->genotypes == NULL)
-    vcfParseGenotypes(record);
-assert(vcff->genotypeCount > 0);
-int ix = 0;
-
-// allocate vBits struct
-struct variantBits *vBits;
-struct lm *lm = vcfFileLm(vcff);
-lmAllocVar(lm,vBits);
-vBits->genotypeSlots  = vcff->genotypeCount;
-vBits->record         = record;
-vBits->haplotypeSlots = 2;  // assume diploid, but then check for complete haploid
-if (homozygousOnly)
-    vBits->haplotypeSlots = 1;
-else
-    { // spin through all the subjects to see if any are non-haploid
-    for (ix = 0; ix < vcff->genotypeCount && record->genotypes[ix].isHaploid;ix++)
-        ;
-    if (ix == vcff->genotypeCount)
-        vBits->haplotypeSlots = 1;
-    }
-
-// allocate bits
-assert(record->alleleCount < 4);
-vBits->alleleSlots = (record->alleleCount <= 2 ? 1 : 2);
-int bitCount = vBitsSlotCount(vBits);
-Bits *bits = lmBitAlloc(lm,bitCount);
-vBits->bits = bits;
-
-
-// walk through genotypes and set the bit
-for (ix = 0; ix < vcff->genotypeCount;ix++)
-    {
-    struct vcfGenotype *gt = gt = &(record->genotypes[ix]);
-
-    boolean homozygous = (gt->isHaploid || gt->hapIxA == gt->hapIxB);
-
-    if ((!phasedOnly || gt->isPhased || homozygous))
-        {
-        if ((!homozygousOnly || homozygous) && gt->hapIxA > 0)
-            {
-            switch (gt->hapIxA)
-                {
-                case 1: bitSetOne(bits,  vBitsSlot(vBits,ix,0,0));   vBits->bitsOn++;    break;
-                case 2: bitSetOne(bits,  vBitsSlot(vBits,ix,0,1));   vBits->bitsOn++;    break;
-                case 3: bitSetRange(bits,vBitsSlot(vBits,ix,0,0),2); vBits->bitsOn += 2; break;
-                default:                                             break;
-                }
-            }
-        if (!gt->isHaploid && !homozygousOnly && gt->hapIxB > 0)
-            {
-            switch (gt->hapIxB)
-                {
-                case 1: bitSetOne(bits,  vBitsSlot(vBits,ix,1,0));   vBits->bitsOn++;    break;
-                case 2: bitSetOne(bits,  vBitsSlot(vBits,ix,1,1));   vBits->bitsOn++;    break;
-                case 3: bitSetRange(bits,vBitsSlot(vBits,ix,1,0),2); vBits->bitsOn += 2; break;
-                default:                                             break;
-                }
-            }
-        }
-    }
-
-return vBits;
-}
-
-static Bits *vcfRecordUnphasedBits(struct vcfFile *vcff, struct vcfRecord *record)
-// Returns array (1 bit per genotype) for each unphased genotype.
-{
-// allocate bits
-Bits *bits = lmBitAlloc(vcfFileLm(vcff),vcff->genotypeCount);
-
-// parse genotypes if needed
-if (record->genotypes == NULL)
-    vcfParseGenotypes(record);
-
-// walk through genotypes and set the bit
-int ix = 0;
-for ( ; ix < vcff->genotypeCount;ix++)
-    {
-    if (record->genotypes[ix].isPhased == FALSE
-    && record->genotypes[ix].isHaploid == FALSE)
-        bitSetOne(bits,ix);
-    }
-return bits;
-}
-
-struct variantBits *vcfRecordsToVariantBits(struct vcfFile *vcff, struct vcfRecord *records,
-                                            boolean phasedOnly, boolean homozygousOnly,
-                                            boolean unphasedBits)
-// Returns list of bit arrays covering all genotypes/haplotype/alleles per record for each record
-// provided.  If records is NULL will use vcff->records. Bit map has one slot per genotype
-// containing 2 slots for haplotypes and 1 or 2 bits per allele. The normal (simple) case of
-// 1 reference and 1 alternate allele results in 1 allele bit with 0:ref. Two or three alt alleles
-// is represented by two bits per allele (>3 non-reference alleles unsupported).
-// If phasedOnly, unphased haplotype bits will be set only if both agree (00 is uninterpretable)
-// Haploid genotypes (e.g. chrY) and homozygousOnly bitmaps contain 1 haplotype slot.
-// If unphasedBits, then vBits->unphased will contain a bitmap with 1s for all unphased genotypes.
-// NOTE: allocated from vcff pool, so closing file or flushing reusePool will invalidate this.
-{
-struct variantBits *vBitsList = NULL;
-
-struct vcfRecord *record = records ? records : vcff->records;
-for (;record != NULL; record = record->next)
-    {
-    struct variantBits *vBits = vcfOneRecordToVariantBits(vcff, record, phasedOnly, homozygousOnly);
-    if (unphasedBits)
-        vBits->unphased = vcfRecordUnphasedBits(vcff, record);
-    slAddHead(&vBitsList,vBits);
-    }
-slReverse(&vBitsList);
-
-return vBitsList;
-}
-
-int vcfVariantBitsDropSparse(struct variantBits **vBitsList, int haploGenomeMin)
-// Drops vBits found in less than a minimum number of haplotype genomes.
-// Returns count of vBits structure that were dropped.
-{
-struct variantBits *vBitsKeep = NULL;
-struct variantBits *vBits = NULL;
-int dropped = 0;
-while ((vBits = slPopHead(vBitsList)) != NULL)
-    {
-    if (vBits->bitsOn >= haploGenomeMin)
-        slAddHead(&vBitsKeep,vBits);
-    else  // drop memory, since this is on lm
-        dropped++;
-    }
-slReverse(&vBitsKeep);
-*vBitsList = vBitsKeep;
-return dropped;
-}
-
-int vcfVariantMostPopularCmp(const void *va, const void *vb)
-// Compare to sort variantBits based upon how many genomes/chrom has the variant
-// This can be used to build haploBits in most populous order for tree building
-{
-const struct variantBits *a = *((struct variantBits **)va);
-const struct variantBits *b = *((struct variantBits **)vb);
-//int ret = memcmp(a->bits, b->bits, bitToByteSize(vBitsSlotCount(a)));
-int ret = (b->bitsOn - a->bitsOn); // higher bits first (descending order)
-if (ret == 0)
-    { // sort deterministically
-    ret = (a->record->chromStart - b->record->chromStart);
-    if (ret == 0)
-        ret = (a->record->chromEnd - b->record->chromEnd);
-    }
-return ret;
-}
-
-struct haploBits *vcfVariantBitsToHaploBits(struct vcfFile *vcff, struct variantBits *vBitsList,
-                                            boolean ignoreReference)
-// Converts a set of variant bits to haplotype bits, resulting in one bit struct
-// per haplotype genome that has non-reference variations.  If ignoreReference, only
-// haplotype genomes with at least one non-reference variant are returned.
-// A haploBit array has one variant slot per vBit struct and one or more bits per allele.
-// NOTE: allocated from vcff pool, so closing file or flushing reusePool will invalidate this.
-{
-// First determine dimensions
-int variantSlots = 0;
-unsigned char alleleSlots  = 1;
-struct variantBits *vBits = vBitsList;
-for ( ; vBits != NULL; vBits = vBits->next)
-    {
-    variantSlots++;
-    if (vBits->alleleSlots == 2)
-        alleleSlots = 2; // Will normalize allele slots across all variants.
-    // This should catch any problems where chrX is haploid/diploid
-    assert(vBitsList->haplotypeSlots == vBits->haplotypeSlots);
-    }
-
-
-// Now create a struct per haplotype
-struct lm *lm = vcfFileLm(vcff);
-struct haploBits *hBitsList = NULL;
-int genoIx = 0;
-for (; genoIx < vBitsList->genotypeSlots; genoIx++)
-    {
-    int haploIx = 0;
-    for (; haploIx < vBitsList->haplotypeSlots; haploIx++)
-        {
-        // allocate struct
-        struct haploBits *hBits;
-        lmAllocVar(lm,hBits);
-        hBits->variantSlots = variantSlots;
-        hBits->alleleSlots  = alleleSlots;
-        hBits->haploGenomes = 1;
-        int bitCount = hBitsSlotCount(hBits);
-
-        // allocate bits
-        Bits *bits = lmBitAlloc(lm,bitCount);
-        hBits->bits = bits;
-
-        int variantIx = 0;
-        for (vBits =  vBitsList; vBits != NULL; vBits = vBits->next, variantIx++)
-            {
-            if (bitReadOne(vBits->bits,vBitsSlot(vBits,genoIx,haploIx,0)))
-                {
-                bitSetOne(        bits,hBitsSlot(hBits,variantIx,     0));
-                hBits->bitsOn++;
-                }
-
-            if (vBits->alleleSlots == 2) // (hBit->alleleSlot will also == 2)
-                {
-                if (bitReadOne(vBits->bits,vBitsSlot(vBits,genoIx,haploIx,1)))
-                    {
-                    bitSetOne(        bits,hBitsSlot(hBits,variantIx,     1));
-                    hBits->bitsOn++;
-                    }
-                }
-            }
-        if (!ignoreReference || hBits->bitsOn > 0) // gonna keep it so flesh it out
-            {
-            hBits->genomeIx  = genoIx;
-            hBits->haploidIx = haploIx;
-            struct vcfRecord *record = vBitsList->record; // any will do
-            struct vcfGenotype *gt = &(record->genotypes[genoIx]);
-            if (gt->isHaploid || vBitsList->haplotypeSlots == 1)
-                { // chrX will have haplotypeSlots==2 but be haploid for this subject.
-                  // Meanwhile if vBits were for homozygous only,  haplotypeSlots==1
-                //assert(haploIx == 0);
-                hBits->ids = lmCloneString(lm,gt->id);
-                }
-            else
-                {
-                int sz = strlen(gt->id) + 3;
-                hBits->ids = lmAlloc(lm,sz);
-                safef(hBits->ids,sz,"%s-%c",gt->id,'a' + haploIx);
-                }
-            slAddHead(&hBitsList,hBits);
-            }
-        //else
-        //    haploBitsFree(&hBits); // lm so just abandon
-        }
-    }
-slReverse(&hBitsList);
-
-return hBitsList;
-}
-
-int vcfHaploBitsOnCmp(const void *va, const void *vb)
-// Compare to sort haploBits based upon how many bits are on
-{
-const struct haploBits *a = *((struct haploBits **)va);
-const struct haploBits *b = *((struct haploBits **)vb);
-int ret = (a->bitsOn - b->bitsOn);
-if (ret == 0)
-    { // sort deterministically
-    ret = (a->genomeIx - b->genomeIx);
-    if (ret == 0)
-        ret = (a->haploidIx - b->haploidIx);
-    }
-return ret;
-}
-
-int vcfHaploMemCmp(const void *va, const void *vb)
-// Compare to sort haploBits based upon how many bits are on
-{
-const struct haploBits *a = *((struct haploBits **)va);
-const struct haploBits *b = *((struct haploBits **)vb);
-int ret = memcmp(a->bits, b->bits, bitToByteSize(hBitsSlotCount(a)));
-if (ret == 0)
-    { // sort deterministically
-    ret = (a->genomeIx - b->genomeIx);
-    if (ret == 0)
-        ret = (a->haploidIx - b->haploidIx);
-    }
-return ret;
-}
-
-static struct haploBits *hBitsCollapsePair(struct lm *lm, struct haploBits *hBits,
-                                           struct haploBits *dupMap)
-// Adds Ids of duplicate bitmap to hBits and abandons the duplicate.
-{
-char *ids = lmAlloc(lm,strlen(dupMap->ids) + strlen(hBits->ids) + 2);
-// NOTE: Putting Dup Ids first could leave these in lowest genomeIx/haploidIx order
-sprintf(ids,"%s,%s",dupMap->ids,hBits->ids);
-hBits->ids = ids;
-hBits->haploGenomes++;
-hBits->genomeIx  = dupMap->genomeIx;  // Arbitrary but could leave lowest first
-hBits->haploidIx = dupMap->haploidIx;
-return hBits;
-}
-
-int vcfHaploBitsListCollapseIdentical(struct vcfFile *vcff, struct haploBits **phBitsList,
-                                      int haploGenomeMin)
-// Collapses a list of haploBits based upon identical bit arrays.
-// If haploGenomeMin > 1, will drop all hBits structs covering less than N haploGenomes.
-// Returns count of hBit structs removed.
-{
-struct lm *lm = vcfFileLm(vcff);
-int collapsed = 0;
-struct haploBits *hBitsPassed = NULL;
-struct haploBits *hBitsToTest = *phBitsList;
-if (hBitsToTest->next == NULL)
-    return 0;
-
-// Sort early bits first
-slSort(&hBitsToTest, vcfHaploMemCmp);
-
-// Walk through those of matching size and compare
-int bitCount = hBitsSlotCount(hBitsToTest);
-int memSize = bitToByteSize(bitCount);
-struct haploBits *hBitsQ = NULL;
-while ((hBitsQ = slPopHead(&hBitsToTest)) != NULL)
-    {
-    struct haploBits *hBitsT = hBitsToTest; //hBitsQ->next; is already null.
-    for (;hBitsT != NULL;hBitsT = hBitsT->next)
-        {
-        // If the 2 bitmaps do not have the same number of bits on, they are not identical
-        if (hBitsT->bitsOn != hBitsQ->bitsOn)
-            break;
-
-        // If 2 bitmaps are identical, then collapse into one
-        if (memcmp(hBitsT->bits,hBitsQ->bits,memSize) == 0)
-            {     // collapse
-            hBitsCollapsePair(lm,hBitsT,hBitsQ);
-            hBitsQ = NULL; // abandon lm memory
-            collapsed++;
-            break;  // hBitsQ used up so make hBitsT the new Q.
-            }
-        }
-
-    // If hBitsQ survived the collapsing, then don't loose it.
-    if (hBitsQ != NULL)
-        slAddHead(&hBitsPassed,hBitsQ);
-    }
-
-// Now that we have collapsed hBits, we can drop any that are not found in enough subjects
-if (haploGenomeMin > 1)
-    {
-    hBitsToTest = hBitsPassed;
-    hBitsPassed = NULL;
-    while ((hBitsQ = slPopHead(hBitsToTest)) != NULL)
-        {
-        if (hBitsQ->haploGenomes >= haploGenomeMin)
-            slAddHead(&hBitsPassed,hBitsQ);
-        else  // drop memory, since this is on lm
-            collapsed++;
-        }
-    // double pop passes means no reverse required.
-    }
-else
-    slReverse(&hBitsPassed);
-
-*phBitsList = hBitsPassed;
-
-return collapsed;
-}
-
-unsigned char vcfHaploBitsToVariantAlleleIx(struct haploBits *hBits,int bitIx)
-// Given a hBits struct and bitIx, what is the actual variant allele ix
-// to use when accessing the vcfRecord?  NOTE: here 0 is reference allele
-{
-if (hBits->alleleSlots == 1)
-    return bitReadOne(hBits->bits,bitIx); // 0 or 1
-else
-    {
-    assert(hBits->alleleSlots == 2); // Only supports 1-3 variants encoded as 2 bits
-    unsigned char varIx = 0;
-    int bit = variantSlotFromBitIx(hBits,bitIx);
-    if (bitReadOne(hBits->bits,bit++))
-        varIx = 1;  // Note that bitmaps represent 1,2,3 as 10,01,11
-    if (bitReadOne(hBits->bits,bit))
-        varIx += 2;
-    return varIx;
-    }
-}
-
-enum elmNodeOverlap vcfHaploBitsCmp(const struct slList *elA, const struct slList *elB,
-                                    int *matchWeight, void *extra)
-// HaploBits compare routine for building tree of relations using elmTreeGrow().
-{
-const struct haploBits *a = (struct haploBits *)elA;
-const struct haploBits *b = (struct haploBits *)elB;
-
-int bitCount = hBitsSlotCount(a);
-int and = bitAndCount(a->bits,b->bits,bitCount);
-if (matchWeight)
-    *matchWeight = and;
-if (and == 0)// && a->bitsOn > 0)
-    return enoExcluding; // nothing in common
-if (a->bitsOn == and)
-    {
-    if (b->bitsOn == and)
-        return enoEqual; // perfect match
-    else
-        return enoSubset;
-    }
-// super and mixed?
-if (b->bitsOn == and)
-    return enoSuperset;
-return enoMixed;
-}
-
-struct slList *vcfHaploBitsMatching(const struct slList *elA, const struct slList *elB,
-                                    void *extra)
-// Returns a HaploBits structure representing the common parts of elements A and B.
-// Used with elmTreeGrow() to create nodes that are the common parts between leaves/branches.
-{
-struct haploBits *a = (struct haploBits *)elA;
-struct haploBits *b = (struct haploBits *)elB;
-struct lm *lm = extra;
-
-// If the 2 bitmaps are equal, then collapse into a single struct
-if (enoEqual == vcfHaploBitsCmp(elA,elB,NULL,extra))
-    {
-    if (hBitsIsReal(a)) // bitmap represents variants on at least one subject chromosome
-        {
-        assert(!hBitsIsReal(b));  // Won't always work but hBits have been precollapsed
-        if (hBitsIsReal(b)) // Should not be if these have already been collapsed.
-            {
-            hBitsCollapsePair(lm,a,b);    // a is real, b is real so collapse into one
-            //haploBitsFreeList(&b);   // lm so just abandon
-            return (struct slList *)a;
-            }
-        else
-            return (struct slList *)a; // a is real, b is not so just use a
-        }
-    else if (hBitsIsReal(b))
-        return (struct slList *)b;    // b is real, a is not so just use b
-    }
-
-// Must make non "real" bitmap which is the common bits of a and b
-struct haploBits *match;
-lmAllocVar(lm,match);
-match->alleleSlots  = a->alleleSlots;
-match->variantSlots = a->variantSlots;
-match->haploGenomes = 0;  // Marking this as a subset
-
-int bitCount  = hBitsSlotCount(match);
-match->bits = lmBitClone(lm,a->bits, bitCount);
-bitAnd(match->bits, b->bits, bitCount);
-match->bitsOn = bitCountRange(match->bits, 0, bitCount);
-
-return (struct slList *)match;
-}