752bae9723eeeabb02d950157ab0f9512fd689b3 tdreszer Mon Jan 28 15:05:59 2013 -0800 Checking in 'elmTree' which uses neighbor joining to build a tree in local memory. Also checking in 2 sets of changes to vcf. First, added support for a 'reuse' local memory pool which aid traversing giant files. Second, added routines for bitmap based analysis of variants. diff --git src/lib/vcf.c src/lib/vcf.c index 9acc900..4eb8f5b 100644 --- src/lib/vcf.c +++ src/lib/vcf.c @@ -156,74 +156,76 @@ static bool vcfFileStopDueToErrors(struct vcfFile *vcff) /* determine if we should stop due to the number of errors */ { return vcff->errCnt > vcff->maxErr; } static void vcfFileErr(struct vcfFile *vcff, char *format, ...) #if defined(__GNUC__) __attribute__((format(printf, 2, 3))) #endif ; static void vcfFileErr(struct vcfFile *vcff, char *format, ...) /* Send error message to errabort stack's warn handler and abort */ { +vcff->errCnt++; +if (vcff->maxErr == VCF_IGNORE_ERRS) + return; va_list args; va_start(args, format); char formatPlus[1024]; if (vcff->lf != NULL) sprintf(formatPlus, "%s:%d: %s", vcff->lf->fileName, vcff->lf->lineIx, format); else strcpy(formatPlus, format); vaWarn(formatPlus, args); va_end(args); -vcff->errCnt++; if (vcfFileStopDueToErrors(vcff)) errAbort("VCF: %d parser errors, quitting", vcff->errCnt); } static void *vcfFileAlloc(struct vcfFile *vcff, size_t size) /* Use vcff's local mem to allocate memory. */ { -return lmAlloc(vcff->pool->lm, size); +return lmAlloc( vcfFileLm(vcff), size); } -static char *vcfFileCloneStrZ(struct vcfFile *vcff, char *str, size_t size) +inline char *vcfFileCloneStrZ(struct vcfFile *vcff, char *str, size_t size) /* Use vcff's local mem to allocate memory for a string and copy it. */ { -return lmCloneStringZ(vcff->pool->lm, str, size); +return lmCloneStringZ( vcfFileLm(vcff), str, size); } -static char *vcfFileCloneStr(struct vcfFile *vcff, char *str) +inline char *vcfFileCloneStr(struct vcfFile *vcff, char *str) /* Use vcff's local mem to allocate memory for a string and copy it. */ { return vcfFileCloneStrZ(vcff, str, strlen(str)); } -static char *vcfFileCloneSubstr(struct vcfFile *vcff, char *line, regmatch_t substr) +inline char *vcfFileCloneSubstr(struct vcfFile *vcff, char *line, regmatch_t substr) /* Allocate memory for and copy a substring of line. */ { return vcfFileCloneStrZ(vcff, line+substr.rm_so, (substr.rm_eo - substr.rm_so)); } -#define vcfFileCloneVar(var) lmCloneMem(vcff->pool->lm, var, sizeof(var)); +#define vcfFileCloneVar(vcff, var) lmCloneMem( vcfFileLm(vcff), var, sizeof(var)) char *vcfFilePooledStr(struct vcfFile *vcff, char *str) /* Allocate memory for a string from vcff's shared string pool. */ { -return hashStoreName(vcff->pool, str); +return hashStoreName(vcff->pool, str); // Always stored in main pool, not reuse pool } static enum vcfInfoType vcfInfoTypeFromSubstr(struct vcfFile *vcff, char *line, regmatch_t substr) /* Translate substring of line into vcfInfoType or complain. */ { char typeWord[16]; int substrLen = substr.rm_eo - substr.rm_so; if (substrLen > sizeof(typeWord) - 1) { vcfFileErr(vcff, "substring passed to vcfInfoTypeFromSubstr is too long."); return vcfInfoNoType; } safencpy(typeWord, sizeof(typeWord), line + substr.rm_so, substrLen); if (sameString("Integer", typeWord)) return vcfInfoInteger; @@ -402,37 +404,74 @@ vcfFileErr(vcff, "FORMAT column is given, but no sample IDs for genotype columns...?"); vcff->genotypeCount = (wordCount - 9); vcff->genotypeIds = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(char *)); int i; for (i = 9; i < wordCount; i++) vcff->genotypeIds[i-9] = vcfFileCloneStr(vcff, words[i]); } } static struct vcfFile *vcfFileNew() /* Return a new, empty vcfFile object. */ { struct vcfFile *vcff = NULL; AllocVar(vcff); vcff->pool = hashNew(0); +vcff->reusePool = NULL; // Must explicitly request a separate record pool return vcff; } +void vcfFileMakeReusePool(struct vcfFile *vcff, int initialSize) +// Creates a separate memory pool for records. Establishing this pool allows +// using vcfFileFlushRecords to abandon previously read records and free +// the associated memory. Very useful when reading an entire file in batches. +{ +assert(vcff->reusePool == NULL); // don't duplicate this +vcff->reusePool = lmInit(initialSize); +} + +void vcfFileAbandonReusePool(struct vcfFile *vcff) +// Abandons all previously allocated data from the reuse pool and reverts to +// common pool. The vcf->records set will also be abandoned as pointers are invalid. +// USE WITH CAUTION. All previously allocated pointers from this pool are now invalid. +{ +assert(vcff->reusePool != NULL); // don't duplicate this +//printf("Reuse pool %ld of %ld unused\n",lmAvailable(vcff->reusePool),lmSize(vcff->reusePool)); +lmCleanup(&vcff->reusePool); +vcff->records = NULL; +} + +void vcfFileFlushRecords(struct vcfFile *vcff) +// Abandons all previously read vcff->records and flushes the reuse pool (if it exists). +// USE WITH CAUTION. All previously allocated record pointers are now invalid. +{ +if (vcff->reusePool != NULL) + { + size_t poolSize = lmSize(vcff->reusePool); + //if (poolSize > (48 * 1024 * 1024)) + // printf("\nReuse pool %ld of %ld unused\n",lmAvailable(vcff->reusePool),poolSize); + lmCleanup(&vcff->reusePool); + vcff->reusePool = lmInit(poolSize); + } +vcff->records = NULL; +} + static struct vcfFile *vcfFileHeaderFromLineFile(struct lineFile *lf, int maxErr) /* Parse a VCF file into a vcfFile object. If maxErr not zero, then * continue to parse until this number of error have been reached. A maxErr - * less than zero does not stop and reports all errors. */ + * less than zero does not stop and reports all errors. + * Set maxErr to VCF_IGNORE_ERRS for silence */ { initVcfSpecInfoDefs(); initVcfSpecGtFormatDefs(); if (lf == NULL) return NULL; struct vcfFile *vcff = vcfFileNew(); vcff->lf = lf; vcff->fileOrUrl = vcfFileCloneStr(vcff, lf->fileName); vcff->maxErr = (maxErr < 0) ? INT_MAX : maxErr; struct dyString *dyHeader = dyStringNew(1024); char *line = NULL; // First, metadata lines beginning with "##": while (lineFileNext(lf, &line, NULL) && startsWith("##", line)) { @@ -694,131 +733,198 @@ if (allSameFirstBase) { rec->chromStart++; for (i = 0; i < rec->alleleCount; i++) { if (rec->alleles[i][1] == '\0') rec->alleles[i] = vcfFilePooledStr(vcff, "-"); else rec->alleles[i] = vcfFilePooledStr(vcff, rec->alleles[i]+1); } } } return chromStartOrig; } -static void vcfParseData(struct vcfFile *vcff, int maxRecords) -/* Given a vcfFile into which the header has been parsed, and whose lineFile is positioned - * at the beginning of a data row, parse and store all data rows from lineFile. */ +static struct vcfRecord *vcfParseData(struct vcfFile *vcff, int maxRecords) +// Given a vcfFile into which the header has been parsed, and whose +// lineFile is positioned at the beginning of a data row, parse and +// return all data rows from lineFile. { if (vcff == NULL) - return; + return NULL; int recCount = 0; +struct vcfRecord *records = NULL; struct vcfRecord *record; while ((record = vcfNextRecord(vcff)) != NULL) { if (maxRecords >= 0 && recCount >= maxRecords) break; - slAddHead(&(vcff->records), record); + slAddHead(&records, record); recCount++; } -slReverse(&(vcff->records)); -lineFileClose(&(vcff->lf)); +slReverse(&records); + +return records; } struct vcfFile *vcfFileMayOpen(char *fileOrUrl, int maxErr, int maxRecords, boolean parseAll) /* Open fileOrUrl and parse VCF header; return NULL if unable. * If parseAll, then read in all lines, parse and store in * vcff->records; if maxErr >= zero, then continue to parse until * there are maxErr+1 errors. A maxErr less than zero does not stop - * and reports all errors. */ + * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence */ { struct lineFile *lf = NULL; if (startsWith("http://", fileOrUrl) || startsWith("ftp://", fileOrUrl) || startsWith("https://", fileOrUrl)) lf = netLineFileOpen(fileOrUrl); else lf = lineFileMayOpen(fileOrUrl, TRUE); struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr); if (parseAll) - vcfParseData(vcff, maxRecords); + { + vcff->records = vcfParseData(vcff, maxRecords); + lineFileClose(&(vcff->lf)); // Not sure why it is closed. Angie? + } return vcff; } struct vcfFile *vcfTabixFileMayOpen(char *fileOrUrl, char *chrom, int start, int end, int maxErr, int maxRecords) /* Open a VCF file that has been compressed and indexed by tabix and * parse VCF header, or return NULL if unable. If chrom is non-NULL, * seek to the position range and parse all lines in range into * vcff->records. If maxErr >= zero, then continue to parse until * there are maxErr+1 errors. A maxErr less than zero does not stop - * and reports all errors. */ + * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence */ { struct lineFile *lf = lineFileTabixMayOpen(fileOrUrl, TRUE); struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr); if (vcff == NULL) return NULL; if (isNotEmpty(chrom) && start != end) { if (lineFileSetTabixRegion(lf, chrom, start, end)) - vcfParseData(vcff, maxRecords); + { + vcff->records = vcfParseData(vcff, maxRecords); + lineFileClose(&(vcff->lf)); // Not sure why it is closed. Angie? + } } return vcff; } +int vcfRecordCmp(const void *va, const void *vb) +/* Compare to sort based on position. */ +{ +const struct vcfRecord *a = *((struct vcfRecord **)va); +const struct vcfRecord *b = *((struct vcfRecord **)vb); +int dif; +dif = strcmp(a->chrom, b->chrom); +if (dif == 0) + dif = a->chromStart - b->chromStart; +if (dif == 0) + dif = a->chromEnd - b->chromEnd; // shortest first +if (dif == 0) + dif = strcmp(a->name, b->name); // finally by name +return dif; +} + +int vcfTabixBatchRead(struct vcfFile *vcff, char *chrom, int start, int end, + int maxErr, int maxRecords) +// Reads a batch of records from an opened and indexed VCF file, returning number +// of records in batch. Seeks to the start position and parses all lines in range, +// adding them to vcff->records. Note: vcff->records will continue to be sorted, +// even if batches are loaded out of order. If maxErr >= zero, then continue to +// parse until there are maxErr+1 errors. A maxErr less than zero does not stop +// and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence +{ +int count = 0; + +if (lineFileSetTabixRegion(vcff->lf, chrom, start, end)) + { + struct vcfRecord *records = vcfParseData(vcff, maxRecords); + if (records) + { + struct vcfRecord *lastRec = vcff->records; + count = slCount(records); + if (lastRec == NULL) + vcff->records = records; + else + { + for (; lastRec->next != NULL; lastRec = lastRec->next) + ; + lastRec->next = records; + // Do we need to sort? + if (vcfRecordCmp(&lastRec,&records) > 0) + slSort(&(vcff->records), vcfRecordCmp); + } + } + } + +return count; +} + void vcfFileFree(struct vcfFile **pVcff) /* Free a vcfFile object. */ { if (pVcff == NULL || *pVcff == NULL) return; struct vcfFile *vcff = *pVcff; +if (vcff->maxErr == VCF_IGNORE_ERRS && vcff->errCnt > 0) + { + vcff->maxErr++; // Never completely silent! Go ahead and report how many errs were detected + vcfFileErr(vcff,"Closing with %d errors.",vcff->errCnt); + } freez(&(vcff->headerString)); hashFree(&(vcff->pool)); +if (vcff->reusePool) + vcfFileAbandonReusePool(vcff); hashFree(&(vcff->byName)); lineFileClose(&(vcff->lf)); freez(pVcff); } const struct vcfRecord *vcfFileFindVariant(struct vcfFile *vcff, char *variantId) /* Return all records with name=variantId, or NULL if not found. */ { struct vcfRecord *varList = NULL; if (vcff->byName == NULL) { vcff->byName = hashNew(0); struct vcfRecord *rec; for (rec = vcff->records; rec != NULL; rec = rec->next) { if (sameString (rec->name, variantId)) { // Make shallow copy of rec so we can alter ->next: - struct vcfRecord *newRec = vcfFileCloneVar(rec); + struct vcfRecord *newRec = vcfFileCloneVar(vcff,rec); slAddHead(&varList, newRec); } hashAdd(vcff->byName, rec->name, rec); } slReverse(&varList); } else { struct hashEl *hel = hashLookup(vcff->byName, variantId); while (hel != NULL) { if (sameString(hel->name, variantId)) { struct vcfRecord *rec = hel->val; - struct vcfRecord *newRec = vcfFileCloneVar(rec); + struct vcfRecord *newRec = vcfFileCloneVar(vcff,rec); slAddHead(&varList, newRec); } hel = hel->next; } // Don't reverse varList -- hash element list was already reversed } return varList; } const struct vcfInfoElement *vcfRecordFindInfo(const struct vcfRecord *record, char *key) /* Find an INFO element, or NULL. */ { int i; for (i = 0; i < record->infoCount; i++) { @@ -977,15 +1083,457 @@ " string info; \"Additional information encoded as a semicolon-separated series " "of short keys with optional comma-separated values\"" " string format; \"If genotype columns are specified in header, a " "semicolon-separated list of of short keys starting with GT\"" " string genotypes; \"If genotype columns are specified in header, a tab-separated " "set of genotype column values; each value is a colon-separated " "list of values corresponding to keys in the format column\"" " )"; struct asObject *vcfAsObj() // Return asObject describing fields of VCF { return asParseText(vcfDataLineAutoSqlString); } +// - - - - - - Support for bit map based analysis of variants - - - - - - + +static struct variantBits *vcfOneRecordToVariantBits(struct vcfFile *vcff,struct vcfRecord *record, + boolean phasedOnly, boolean homozygousOnly) +// Returns bit array covering all genotypes/haplotype/alleles per record. One slot per genotype +// containing 2 slots for haplotypes and 1 or 2 bits per allele. The normal (simple) case of +// 1 reference and 1 alternate allele results in 1 allele bit with 0:ref. Two or three alt alleles +// is represented by two bits per allele (>3 alternate alleles is unsupported). +// If phasedOnly, unphased haplotype bits will all be set only if all agree (00 is uninterpretable) +// Haploid genotypes (e.g. chrY) and homozygousOnly bitmaps contain 1 haplotype slot. +{ +// parse genotypes if needed +if (record->genotypes == NULL) + vcfParseGenotypes(record); +struct vcfGenotype *gt = &(record->genotypes[0]); +assert(vcff->genotypeCount > 0 && gt != NULL); + +// allocate vBits struct +struct variantBits *vBits; +struct lm *lm = vcfFileLm(vcff); +lmAllocVar(lm,vBits); +vBits->genotypeSlots = vcff->genotypeCount; +vBits->haplotypeSlots = (gt->isHaploid || homozygousOnly ? 1 : 2); +vBits->record = record; + +// allocate bits +assert(record->alleleCount < 4); +vBits->alleleSlots = (record->alleleCount <= 2 ? 1 : 2); +int bitCount = vBitsSlotCount(vBits); +Bits *bits = lmBitAlloc(lm,bitCount); +vBits->bits = bits; + + +// walk through genotypes and set the bit +int ix = 0; +for ( ; ix < vcff->genotypeCount;ix++) + { + gt = &(record->genotypes[ix]); + + boolean homozygous = (gt->isHaploid || gt->hapIxA == gt->hapIxB); + + if ((!phasedOnly || gt->isPhased || homozygous)) + { + if ((!homozygousOnly || homozygous) && gt->hapIxA > 0) + { + switch (gt->hapIxA) + { + case 1: bitSetOne(bits, vBitsSlot(vBits,ix,0,0)); vBits->bitsOn++; break; + case 2: bitSetOne(bits, vBitsSlot(vBits,ix,0,1)); vBits->bitsOn++; break; + case 3: bitSetRange(bits,vBitsSlot(vBits,ix,0,0),2); vBits->bitsOn += 2; break; + default: break; + } + } + if (!gt->isHaploid && !homozygousOnly && gt->hapIxB > 0) + { + switch (gt->hapIxB) + { + case 1: bitSetOne(bits, vBitsSlot(vBits,ix,1,0)); vBits->bitsOn++; break; + case 2: bitSetOne(bits, vBitsSlot(vBits,ix,1,1)); vBits->bitsOn++; break; + case 3: bitSetRange(bits,vBitsSlot(vBits,ix,1,0),2); vBits->bitsOn += 2; break; + default: break; + } + } + } + } + +return vBits; +} + +static Bits *vcfRecordUnphasedBits(struct vcfFile *vcff, struct vcfRecord *record) +// Returns array (1 bit per genotype) for each unphased genotype. +{ +// allocate bits +Bits *bits = lmBitAlloc(vcfFileLm(vcff),vcff->genotypeCount); + +// parse genotypes if needed +if (record->genotypes == NULL) + vcfParseGenotypes(record); + +// walk through genotypes and set the bit +int ix = 0; +for ( ; ix < vcff->genotypeCount;ix++) + { + if (record->genotypes[ix].isPhased == FALSE) + bitSetOne(bits,ix); + } +return bits; +} + +struct variantBits *vcfRecordsToVariantBits(struct vcfFile *vcff, struct vcfRecord *records, + boolean phasedOnly, boolean homozygousOnly, + boolean unphasedBits) +// Returns list of bit arrays covering all genotypes/haplotype/alleles per record for each record +// provided. If records is NULL will use vcff->records. Bit map has one slot per genotype +// containing 2 slots for haplotypes and 1 or 2 bits per allele. The normal (simple) case of +// 1 reference and 1 alternate allele results in 1 allele bit with 0:ref. Two or three alt alleles +// is represented by two bits per allele (>3 non-reference alleles unsupported). +// If phasedOnly, unphased haplotype bits will be set only if both agree (00 is uninterpretable) +// Haploid genotypes (e.g. chrY) and homozygousOnly bitmaps contain 1 haplotype slot. +// If unphasedBits, then vBits->unphased will contain a bitmap with 1s for all unphased genotypes. +// NOTE: allocated from vcff pool, so closing file or flushing reusePool will invalidate this. +{ +struct variantBits *vBitsList = NULL; + +struct vcfRecord *record = records ? records : vcff->records; +for (;record != NULL; record = record->next) + { + struct variantBits *vBits = vcfOneRecordToVariantBits(vcff, record, phasedOnly, homozygousOnly); + if (unphasedBits) + vBits->unphased = vcfRecordUnphasedBits(vcff, record); + slAddHead(&vBitsList,vBits); + } +slReverse(&vBitsList); + +return vBitsList; +} + +int vcfVariantBitsDropSparse(struct variantBits **vBitsList, int haploGenomeMin) +// Drops vBits found in less than a minimum number of haplotype genomes. +// Returns count of vBits structure that were dropped. +{ +struct variantBits *vBitsKeep = NULL; +struct variantBits *vBits = NULL; +int dropped = 0; +while ((vBits = slPopHead(vBitsList)) != NULL) + { + if (vBits->bitsOn >= haploGenomeMin) + slAddHead(&vBitsKeep,vBits); + else // drop memory, since this is on lm + dropped++; + } +slReverse(&vBitsKeep); +*vBitsList = vBitsKeep; +return dropped; +} + +int vcfVariantMostPopularCmp(const void *va, const void *vb) +// Compare to sort variantBits based upon how many genomes/chrom has the variant +// This can be used to build haploBits in most populous order for tree building +{ +const struct variantBits *a = *((struct variantBits **)va); +const struct variantBits *b = *((struct variantBits **)vb); +//int ret = memcmp(a->bits, b->bits, bitToByteSize(vBitsSlotCount(a))); +int ret = (b->bitsOn - a->bitsOn); // higher bits first (descending order) +if (ret == 0) + { // sort deterministically + ret = (a->record->chromStart - b->record->chromStart); + if (ret == 0) + ret = (a->record->chromEnd - b->record->chromEnd); + } +return ret; +} + +struct haploBits *vcfVariantBitsToHaploBits(struct vcfFile *vcff, struct variantBits *vBitsList, + boolean ignoreReference) +// Converts a set of variant bits to haplotype bits, resulting in one bit struct +// per haplotype genome that has non-reference variations. If ignoreReference, only +// haplotype genomes with at least one non-reference variant are returned. +// A haploBit array has one variant slot per vBit struct and one or more bits per allele. +// NOTE: allocated from vcff pool, so closing file or flushing reusePool will invalidate this. +{ +// First determine dimensions +int variantSlots = 0; +unsigned char alleleSlots = 1; +struct variantBits *vBits = vBitsList; +for ( ; vBits != NULL; vBits = vBits->next) + { + variantSlots++; + if (vBits->alleleSlots == 2) + alleleSlots = 2; // Will normalize allele slots across all variants. + // This should catch any problems where chrX is haploid/diploid + assert(vBitsList->haplotypeSlots == vBits->haplotypeSlots); + } + + +// Now create a struct per haplotype +struct lm *lm = vcfFileLm(vcff); +struct haploBits *hBitsList = NULL; +int genoIx = 0; +for (; genoIx < vBitsList->genotypeSlots; genoIx++) + { + int haploIx = 0; + for (; haploIx < vBitsList->haplotypeSlots; haploIx++) + { + // allocate struct + struct haploBits *hBits; + lmAllocVar(lm,hBits); + hBits->variantSlots = variantSlots; + hBits->alleleSlots = alleleSlots; + hBits->haploGenomes = 1; + int bitCount = hBitsSlotCount(hBits); + + // allocate bits + Bits *bits = lmBitAlloc(lm,bitCount); + hBits->bits = bits; + + int variantIx = 0; + for (vBits = vBitsList; vBits != NULL; vBits = vBits->next, variantIx++) + { + if (bitReadOne(vBits->bits,vBitsSlot(vBits,genoIx,haploIx,0))) + { + bitSetOne( bits,hBitsSlot(hBits,variantIx, 0)); + hBits->bitsOn++; + } + + if (vBits->alleleSlots == 2) // (hBit->alleleSlot will also == 2) + { + if (bitReadOne(vBits->bits,vBitsSlot(vBits,genoIx,haploIx,1))) + { + bitSetOne( bits,hBitsSlot(hBits,variantIx, 1)); + hBits->bitsOn++; + } + } + } + if (!ignoreReference || hBits->bitsOn > 0) // gonna keep it so flesh it out + { + hBits->genomeIx = genoIx; + hBits->haploidIx = haploIx; + struct vcfRecord *record = vBitsList->record; // any will do + struct vcfGenotype *gt = &(record->genotypes[genoIx]); + if (vBitsList->haplotypeSlots == 1) + hBits->ids = lmCloneString(lm,gt->id); + else + { + int sz = strlen(gt->id) + 3; + hBits->ids = lmAlloc(lm,sz); + safef(hBits->ids,sz,"%s-%c",gt->id,'a' + haploIx); + } + slAddHead(&hBitsList,hBits); + } + //else + // haploBitsFree(&hBits); // lm so just abandon + } + } +slReverse(&hBitsList); + +return hBitsList; +} + +int vcfHaploBitsOnCmp(const void *va, const void *vb) +// Compare to sort haploBits based upon how many bits are on +{ +const struct haploBits *a = *((struct haploBits **)va); +const struct haploBits *b = *((struct haploBits **)vb); +int ret = (a->bitsOn - b->bitsOn); +if (ret == 0) + { // sort deterministically + ret = (a->genomeIx - b->genomeIx); + if (ret == 0) + ret = (a->haploidIx - b->haploidIx); + } +return ret; +} + +int vcfHaploMemCmp(const void *va, const void *vb) +// Compare to sort haploBits based upon how many bits are on +{ +const struct haploBits *a = *((struct haploBits **)va); +const struct haploBits *b = *((struct haploBits **)vb); +int ret = memcmp(a->bits, b->bits, bitToByteSize(hBitsSlotCount(a))); +if (ret == 0) + { // sort deterministically + ret = (a->genomeIx - b->genomeIx); + if (ret == 0) + ret = (a->haploidIx - b->haploidIx); + } +return ret; +} + +static struct haploBits *hBitsCollapsePair(struct lm *lm, struct haploBits *hBits, + struct haploBits *dupMap) +// Adds Ids of duplicate bitmap to hBits and abandons the duplicate. +{ +char *ids = lmAlloc(lm,strlen(dupMap->ids) + strlen(hBits->ids) + 2); +// NOTE: Putting Dup Ids first could leave these in lowest genomeIx/haploidIx order +sprintf(ids,"%s,%s",dupMap->ids,hBits->ids); +hBits->ids = ids; +hBits->haploGenomes++; +hBits->genomeIx = dupMap->genomeIx; // Arbitrary but could leave lowest first +hBits->haploidIx = dupMap->haploidIx; +return hBits; +} + +int vcfHaploBitsListCollapseIdentical(struct vcfFile *vcff, struct haploBits **phBitsList, + int haploGenomeMin) +// Collapses a list of haploBits based upon identical bit arrays. +// If haploGenomeMin > 1, will drop all hBits structs covering less than N haploGenomes. +// Returns count of hBit structs removed. +{ +struct lm *lm = vcfFileLm(vcff); +int collapsed = 0; +struct haploBits *hBitsPassed = NULL; +struct haploBits *hBitsToTest = *phBitsList; +if (hBitsToTest->next == NULL) + return 0; + +// Sort early bits first +slSort(&hBitsToTest, vcfHaploMemCmp); + +// Walk through those of matching size and compare +int bitCount = hBitsSlotCount(hBitsToTest); +int memSize = bitToByteSize(bitCount); +struct haploBits *hBitsQ = NULL; +while ((hBitsQ = slPopHead(&hBitsToTest)) != NULL) + { + struct haploBits *hBitsT = hBitsToTest; //hBitsQ->next; is already null. + for (;hBitsT != NULL;hBitsT = hBitsT->next) + { + // If the 2 bitmaps do not have the same number of bits on, they are not identical + if (hBitsT->bitsOn != hBitsQ->bitsOn) + break; + + // If 2 bitmaps are identical, then collapse into one + if (memcmp(hBitsT->bits,hBitsQ->bits,memSize) == 0) + { // collapse + hBitsCollapsePair(lm,hBitsT,hBitsQ); + hBitsQ = NULL; // abandon lm memory + collapsed++; + break; // hBitsQ used up so make hBitsT the new Q. + } + } + + // If hBitsQ survived the collapsing, then don't loose it. + if (hBitsQ != NULL) + slAddHead(&hBitsPassed,hBitsQ); + } + +// Now that we have collapsed hBits, we can drop any that are not found in enough subjects +if (haploGenomeMin > 1) + { + hBitsToTest = hBitsPassed; + hBitsPassed = NULL; + while ((hBitsQ = slPopHead(hBitsToTest)) != NULL) + { + if (hBitsQ->haploGenomes >= haploGenomeMin) + slAddHead(&hBitsPassed,hBitsQ); + else // drop memory, since this is on lm + collapsed++; + } + // double pop passes means no reverse required. + } +else + slReverse(&hBitsPassed); + +*phBitsList = hBitsPassed; + +return collapsed; +} + +unsigned char vcfHaploBitsToVariantIx(struct haploBits *hBits,int bitIx) +// Given a hBits struct and bitIx, what is the actual variant allele ix +// to use when accessing the vcfRecord? NOTE: here 0 is reference allele +{ +if (hBits->alleleSlots == 1) + return bitReadOne(hBits->bits,bitIx); // 0 or 1 +else + { + assert(hBits->alleleSlots == 2); // Only supports 1-3 variants encoded as 2 bits + unsigned char varIx = 0; + int bit = variantSlotFromBitIx(hBits,bitIx); + if (bitReadOne(hBits->bits,bit++)) + varIx = 1; // Note that bitmaps represent 1,2,3 as 10,01,11 + if (bitReadOne(hBits->bits,bit)) + varIx += 2; + return varIx; + } +} + +enum elmNodeOverlap vcfHaploBitsCmp(const struct slList *elA, const struct slList *elB, + int *matchWeight, void *extra) +// HaploBits compare routine for building tree of relations using elmTreeGrow(). +{ +const struct haploBits *a = (struct haploBits *)elA; +const struct haploBits *b = (struct haploBits *)elB; + +int bitCount = hBitsSlotCount(a); +int and = bitAndCount(a->bits,b->bits,bitCount); +if (matchWeight) + *matchWeight = and; +if (and == 0)// && a->bitsOn > 0) + return enoExcluding; // nothing in common +if (a->bitsOn == and) + { + if (b->bitsOn == and) + return enoEqual; // perfect match + else + return enoSubset; + } +// super and mixed? +if (b->bitsOn == and) + return enoSuperset; +return enoMixed; +} + +struct slList *vcfHaploBitsMatching(const struct slList *elA, const struct slList *elB, + void *extra) +// Returns a HaploBits structure representing the common parts of elements A and B. +// Used with elmTreeGrow() to create nodes that are the common parts between leaves/branches. +{ +struct haploBits *a = (struct haploBits *)elA; +struct haploBits *b = (struct haploBits *)elB; +struct lm *lm = extra; + +// If the 2 bitmaps are equal, then collapse into a single struct +if (enoEqual == vcfHaploBitsCmp(elA,elB,NULL,extra)) + { + if (hBitsIsReal(a)) // bitmap represents variants on at least one subject chromosome + { + assert(!hBitsIsReal(b)); // Won't always work but hBits have been precollapsed + if (hBitsIsReal(b)) // Should not be if these have already been collapsed. + { + hBitsCollapsePair(lm,a,b); // a is real, b is real so collapse into one + //haploBitsFreeList(&b); // lm so just abandon + return (struct slList *)a; + } + else + return (struct slList *)a; // a is real, b is not so just use a + } + else if (hBitsIsReal(b)) + return (struct slList *)b; // b is real, a is not so just use b + } + +// Must make non "real" bitmap which is the common bits of a and b +struct haploBits *match; +lmAllocVar(lm,match); +match->alleleSlots = a->alleleSlots; +match->variantSlots = a->variantSlots; +match->haploGenomes = 0; // Marking this as a subset + +int bitCount = hBitsSlotCount(match); +match->bits = lmBitClone(lm,a->bits, bitCount); +bitAnd(match->bits, b->bits, bitCount); +match->bitsOn = bitCountRange(match->bits, 0, bitCount); + +return (struct slList *)match; +} + + + + + +