f53f70b9f131634b0eb32706a98f525fefa55cce tdreszer Tue Feb 5 17:36:46 2013 -0800 Some changes requested by Angie in code review, along with a bug fix. Next checkin will break out the vcfBits routines into separate .c/.h files diff --git src/inc/vcf.h src/inc/vcf.h index 2d7a826..28c14a6 100644 --- src/inc/vcf.h +++ src/inc/vcf.h @@ -194,48 +194,45 @@ * vcff->records; if maxErr >= zero, then continue to parse until * there are maxErr+1 errors. A maxErr less than zero does not stop * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence. */ struct vcfFile *vcfTabixFileMayOpen(char *fileOrUrl, char *chrom, int start, int end, int maxErr, int maxRecords); /* Open a VCF file that has been compressed and indexed by tabix and * parse VCF header, or return NULL if unable. If chrom is non-NULL, * seek to the position range and parse all lines in range into * vcff->records. If maxErr >= zero, then continue to parse until * there are maxErr+1 errors. A maxErr less than zero does not stop * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence. */ int vcfTabixBatchRead(struct vcfFile *vcff, char *chrom, int start, int end, int maxErr, int maxRecords); -// Reads a batch of records from an opened and indexed VCF file, returning number -// of records in batch. Seeks to the start position and parses all lines in range, -// adding them to vcff->records. Note: vcff->records will continue to be sorted, -// even if batches are loaded out of order. If maxErr >= zero, then continue to -// parse until there are maxErr+1 errors. A maxErr less than zero does not stop -// and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence. +// Reads a batch of records from an opened and indexed VCF file, adding them to +// vcff->records and returning the count of new records added in this batch. +// Note: vcff->records will continue to be sorted, even if batches are loaded +// out of order. 
Additionally, resulting vcff->records will contain no duplicates +// so returned count reflects only the new records added, as opposed to all records +// in range. If maxErr >= zero, then continue to parse until there are maxErr+1 +// errors. A maxErr less than zero does not stop and reports all errors. Set +// maxErr to VCF_IGNORE_ERRS for silence. void vcfFileMakeReusePool(struct vcfFile *vcff, int initialSize); // Creates a separate memory pool for records. Establishing this pool allows // using vcfFileFlushRecords to abandon previously read records and free // the associated memory. Very useful when reading an entire file in batches. #define vcfFileLm(vcff) ((vcff)->reusePool ? (vcff)->reusePool : (vcff)->pool->lm) -void vcfFileAbandonReusePool(struct vcfFile *vcff); -// Abandons all previously allocated data from the reuse pool and reverts to -// common pool. The vcf->records set will also be abandoned as pointers are invalid. -// USE WITH CAUTION. All previously allocated pointers from this pool are now invalid. - void vcfFileFlushRecords(struct vcfFile *vcff); // Abandons all previously read vcff->records and flushes the reuse pool (if it exists). // USE WITH CAUTION. All previously allocated record pointers are now invalid. struct vcfRecord *vcfNextRecord(struct vcfFile *vcff); /* Parse the words in the next line from vcff into a vcfRecord. Return NULL at end of file. * Note: this does not store record in vcff->records! */ struct vcfRecord *vcfRecordFromRow(struct vcfFile *vcff, char **words); /* Parse words from a VCF data line into a VCF record structure. */ unsigned int vcfRecordTrimIndelLeftBase(struct vcfRecord *rec); /* For indels, VCF includes the left neighboring base; for example, if the alleles are * AA/- following a G base, then the VCF record will start one base to the left and have * "GAA" and "G" as the alleles. 
That is not nice for display for two reasons: @@ -359,29 +356,29 @@ int vcfHaploBitsListCollapseIdentical(struct vcfFile *vcff, struct haploBits **phBitsList, int haploGenomeMin); // Collapses a list of haploBits based upon identical bit arrays. // If haploGenomeMin > 1, will drop all hBits structs covering less than N haploGenomes. // Returns count of hBit structs removed. INLINE struct variantBits *vcfHaploBitIxToVariantBits(struct haploBits *hBits, int bitIx, struct variantBits *vBitsList) // Returns appropriate vBits from vBits list associated with a given bit in an hBits struct. // Assumes vBitsList is in same order as hBits bit array. Note vBits->record has full vcf details. { return slElementFromIx(vBitsList,vcfRecordIxFromBitIx(hBits,bitIx)); } -unsigned char vcfHaploBitsToVariantIx(struct haploBits *hBits,int bitIx); +unsigned char vcfHaploBitsToVariantAlleleIx(struct haploBits *hBits,int bitIx); // Given a hBits struct and bitIx, what is the actual variant allele ix // to use when accessing the vcfRecord? enum elmNodeOverlap vcfHaploBitsCmp(const struct slList *elA, const struct slList *elB, int *matchWeight, void *extra); // HaploBits compare routine for building tree of relations using elmTreeGrow(). struct slList *vcfHaploBitsMatching(const struct slList *elA, const struct slList *elB, void *extra); // Returns a HaploBits structure representing the common parts of elements A and B. // Used with elmTreeGrow() to create nodes that are the common parts between leaves/branches. #endif // vcf_h