376a3438b1a96761c019bbe128908fc69b5083df angie Thu Jul 30 18:04:05 2020 -0700 Adding VCF_NUM_COLS_BEFORE_GENOTYPES to vcf.h to reduce the proliferation of magic number 9's in the code. diff --git src/inc/vcf.h src/inc/vcf.h index 60af930..c9b547f 100644 --- src/inc/vcf.h +++ src/inc/vcf.h @@ -1,368 +1,369 @@ /* VCF: Variant Call Format, version 4.0 / 4.1 * http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-40 * http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41 * The vcfFile object borrows many memory handling and error reporting tricks from MarkD's * gff3File; any local deficiencies are not to reflect poorly on Mark's fine work! :) */ #ifndef vcf_h #define vcf_h #include "limits.h" #include "hash.h" #include "linefile.h" #include "asParse.h" enum vcfInfoType /* VCF header defines INFO column components; each component has one of these types: */ { vcfInfoNoType, // uninitialized value (0) or unrecognized type name vcfInfoInteger, vcfInfoFloat, vcfInfoFlag, vcfInfoCharacter, vcfInfoString, }; union vcfDatum /* Container for a value whose type is specified by an enum vcfInfoType. */ { int datInt; double datFloat; boolean datFlag; char datChar; char *datString; }; struct vcfInfoDef /* Definition of INFO column component from VCF header: */ { struct vcfInfoDef *next; char *key; // A short identifier, e.g. MQ for mapping quality int fieldCount; // The number of values to follow the id, or -1 if it varies enum vcfInfoType type; // The type of values that follow the id char *description; // Brief description of info }; struct vcfInfoElement /* A single INFO column component; each row's INFO column may contain multiple components. */ { char *key; // An identifier described by a struct vcfInfoDef int count; // Number of data values following id union vcfDatum *values; // Array of data values following id bool *missingData; // Array of flags for missing data values ("." instead of number) }; struct vcfGenotype /* A single component of the optional GENOTYPE column. */ { char *id; // Name of individual/sample (pointer to vcfFile genotypeIds) or . char hapIxA; // Index of one haplotype's allele: 0=reference, 1=alt, 2=other alt // *or* if negative, missing data char hapIxB; // Index of other haplotype's allele, or if negative, missing data bool isPhased; // True if haplotypes are phased bool isHaploid; // True if there is only one haplotype (e.g. chrY) int infoCount; // Number of components named in FORMAT column struct vcfInfoElement *infoElements; // Array of info components for this genotype call }; struct vcfRecord /* A VCF data row (or list of rows). */ { struct vcfRecord *next; char *chrom; // Reference assembly sequence name unsigned int chromStart; // Start offset in chrom unsigned int chromEnd; // End offset in chrom char *name; // Variant name from ID column int alleleCount; // Number of alleles (reference + alternates) char **alleles; // Alleles: reference first then alternate alleles char *qual; // . or Phred-scaled score, i.e. -10log_10 P(call in ALT is wrong) int filterCount; // Number of ;-separated filter codes in FILTER column char **filters; // Code(s) described in header for failed filters (or PASS or .) int infoCount; // Number of components of INFO column struct vcfInfoElement *infoElements; // Array of INFO column components char *format; // Optional column containing ordered list of genotype components char **genotypeUnparsedStrings; // Temporary array of unparsed optional genotype columns struct vcfGenotype *genotypes; // If built, array of parsed genotype components; // call vcfParseGenotypes(record) to build. struct vcfFile *file; // Pointer back to parent vcfFile }; struct vcfFile /* Info extracted from a VCF file. Manages all memory for contents. * Clearly borrowing structure from MarkD's gff3File. :) */ { char *fileOrUrl; // VCF local file path or URL char *headerString; // Complete original header including newlines. int majorVersion; // 4 etc. int minorVersion; // 0, 1 etc. struct vcfInfoDef *infoDefs; // Header's definitions of INFO column components struct vcfInfoDef *filterDefs; // Header's definitions of FILTER column failure codes struct vcfInfoDef *altDefs; // Header's defs of symbolic alternate alleles (e.g. DEL, INS) struct vcfInfoDef *gtFormatDefs; // Header's defs of GENOTYPE compnts. listed in FORMAT col. bool allPhased; // True if all record->genotypes have been phased int genotypeCount; // Number of optional genotype columns described in header char **genotypeIds; // Array of optional genotype column names described in header struct vcfRecord *records; // VCF data rows, sorted by position struct hash *byName; // Hash records by name -- not populated until needed. struct hash *pool; // Used to allocate string values that tend to // be repeated in the files. hash's localMem is also used to // allocated memory for all other objects (if recordPool null) struct lm *reusePool; // If created with vcfFileMakeReusePool, non-shared record data is // allocated from this pool. Useful when walking through huge files. struct lineFile *lf; // Used only during parsing int maxErr; // Maximum number of errors before aborting int errCnt; // Error count }; /* Reserved but optional INFO keys: */ extern const char *vcfInfoAncestralAllele; extern const char *vcfInfoPerAlleleGtCount; // allele count in genotypes, for each ALT allele, // in the same order as listed extern const char *vcfInfoAlleleFrequency; // allele frequency for each ALT allele in the same // order as listed: use this when estimated from // primary data, not called genotypes extern const char *vcfInfoNumAlleles; // total number of alleles in called genotypes extern const char *vcfInfoBaseQuality; // RMS base quality at this position extern const char *vcfInfoCigar; // cigar string describing how to align an // alternate allele to the reference allele extern const char *vcfInfoIsDbSnp; // dbSNP membership extern const char *vcfInfoDepth; // combined depth across samples, e.g. DP=154 extern const char *vcfInfoEnd; // end position of the variant described in this // record (esp. for CNVs) extern const char *vcfInfoIsHapMap2; // membership in hapmap2 extern const char *vcfInfoIsHapMap3; // membership in hapmap3 extern const char *vcfInfoIs1000Genomes; // membership in 1000 Genomes extern const char *vcfInfoMappingQuality; // RMS mapping quality, e.g. MQ=52 extern const char *vcfInfoMapQual0Count; // number of MAPQ == 0 reads covering this record extern const char *vcfInfoNumSamples; // Number of samples with data extern const char *vcfInfoStrandBias; // strand bias at this position extern const char *vcfInfoIsSomatic; // indicates that the record is a somatic mutation, // for cancer genomics extern const char *vcfInfoIsValidated; // validated by follow-up experiment /* Reserved but optional per-genotype keys: */ extern const char *vcfGtGenotype; // numeric allele values separated by "/" (unphased) // or "|" (phased). Allele values are 0 for // reference allele, 1 for the first allele in ALT, // 2 for the second allele in ALT and so on. extern const char *vcfGtDepth; // read depth at this position for this sample extern const char *vcfGtFilter; // analogous to variant's FILTER field extern const char *vcfGtLikelihoods; // three floating point log10-scaled likelihoods for // AA,AB,BB genotypes where A=ref and B=alt; // not applicable if site is not biallelic. extern const char *vcfGtPhred; // Phred-scaled genotype likelihoods rounded to closest int extern const char *vcfGtConditionalQual; // Conditional genotype quality // i.e. phred quality -10log_10 P(genotype call is wrong, // conditioned on the site's being variant) extern const char *vcfGtHaplotypeQualities; // Two phred qualities comma separated extern const char *vcfGtPhaseSet; // Set of phased genotypes to which this genotype belongs extern const char *vcfGtPhasingQuality; // Phred-scaled P(alleles ordered wrongly in heterozygote) extern const char *vcfGtExpectedAltAlleleCount; // Typically used in association analyses INLINE void vcfPrintDatum(FILE *f, const union vcfDatum datum, const enum vcfInfoType type) /* Print datum to f in the format determined by type. */ { switch (type) { case vcfInfoInteger: fprintf(f, "%d", datum.datInt); break; case vcfInfoFloat: fprintf(f, "%f", datum.datFloat); break; case vcfInfoFlag: fprintf(f, "%s", datum.datString); // Flags could have values in older VCF break; case vcfInfoCharacter: fprintf(f, "%c", datum.datChar); break; case vcfInfoString: if (startsWith("http", datum.datString)) fprintf(f, "%s", datum.datString, datum.datString); else fprintf(f, "%s", datum.datString); break; default: errAbort("vcfPrintDatum: Unrecognized type %d", type); break; } } #define VCF_IGNORE_ERRS (INT_MAX - 1) struct vcfFile *vcfFileNew(); /* Return a new, empty vcfFile object. */ struct vcfFile *vcfFileFromHeader(char *name, char *headerString, int maxErr); /* Parse the VCF header string into a vcfFile object with no rows. * name is for error reporting. * If maxErr is non-negative then continue to parse until maxErr+1 errors have been found. * A maxErr less than zero does not stop and reports all errors. * Set maxErr to VCF_IGNORE_ERRS for silence. */ struct vcfFile *vcfFileMayOpen(char *fileOrUrl, char *chrom, int start, int end, int maxErr, int maxRecords, boolean parseAll); /* Open fileOrUrl and parse VCF header; return NULL if unable. * If chrom is non-NULL, scan past any variants that precede {chrom, chromStart}. * Note: this is very inefficient -- it's better to use vcfTabix if possible! * If parseAll, then read in all lines in region, parse and store in * vcff->records; if maxErr >= zero, then continue to parse until * there are maxErr+1 errors. A maxErr less than zero does not stop * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence. */ struct vcfFile *vcfTabixFileAndIndexMayOpen(char *fileOrUrl, char *tbiFileOrUrl, char *chrom, int start, int end, int maxErr, int maxRecords); /* Open a VCF file that has been compressed and indexed by tabix and * parse VCF header, or return NULL if unable. tbiFileOrUrl can be NULL. * If chrom is non-NULL, seek to the position range and parse all lines in * range into vcff->records. If maxErr >= zero, then continue to parse until * there are maxErr+1 errors. A maxErr less than zero does not stop * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence */ struct vcfFile *vcfTabixFileMayOpen(char *fileOrUrl, char *chrom, int start, int end, int maxErr, int maxRecords); /* Open a VCF file that has been compressed and indexed by tabix and * parse VCF header, or return NULL if unable. If chrom is non-NULL, * seek to the position range and parse all lines in range into * vcff->records. If maxErr >= zero, then continue to parse until * there are maxErr+1 errors. A maxErr less than zero does not stop * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence. */ long long vcfTabixItemCount(char *fileOrUrl, char *tbiFileOrUrl); /* Return the total number of items across all sequences in fileOrUrl, using index file. * If tbiFileOrUrl is NULL, the index file is assumed to be fileOrUrl.tbi. * NOTE: not all tabix index files include mapped item counts, so this may return 0 even for * large files. */ int vcfTabixBatchRead(struct vcfFile *vcff, char *chrom, int start, int end, int maxErr, int maxRecords); // Reads a batch of records from an opened and indexed VCF file, adding them to // vcff->records and returning the count of new records added in this batch. // Note: vcff->records will continue to be sorted, even if batches are loaded // out of order. Additionally, resulting vcff->records will contain no duplicates // so returned count refects only the new records added, as opposed to all records // in range. If maxErr >= zero, then continue to parse until there are maxErr+1 // errors. A maxErr less than zero does not stop and reports all errors. Set // maxErr to VCF_IGNORE_ERRS for silence. void vcfFileMakeReusePool(struct vcfFile *vcff, int initialSize); // Creates a separate memory pool for records. Establishing this pool allows // using vcfFileFlushRecords to abandon previously read records and free // the associated memory. Very useful when reading an entire file in batches. #define vcfFileLm(vcff) ((vcff)->reusePool ? (vcff)->reusePool : (vcff)->pool->lm) void vcfFileFlushRecords(struct vcfFile *vcff); // Abandons all previously read vcff->records and flushes the reuse pool (if it exists). // USE WITH CAUTION. All previously allocated record pointers are now invalid. struct vcfRecord *vcfNextRecord(struct vcfFile *vcff); /* Parse the words in the next line from vcff into a vcfRecord. Return NULL at end of file. * Note: this does not store record in vcff->records! */ struct vcfRecord *vcfRecordFromRow(struct vcfFile *vcff, char **words); /* Parse words from a VCF data line into a VCF record structure. */ boolean allelesHavePaddingBase(char **alleles, int alleleCount); /* Examine alleles to see if they either a) all start with the same base or * b) include a symbolic or 0-length allele. In either of those cases, there * must be an initial padding base that we'll need to trim from non-symbolic * alleles. */ unsigned int vcfRecordTrimIndelLeftBase(struct vcfRecord *rec); /* For indels, VCF includes the left neighboring base; for example, if the alleles are * AA/- following a G base, then the VCF record will start one base to the left and have * "GAA" and "G" as the alleles. That is not nice for display for two reasons: * 1. Indels appear one base wider than their dbSNP entries. * 2. In pgSnp display mode, the two alleles are always the same color. * However, for hgTracks' mapBox we need the correct chromStart for identifying the * record in hgc -- so return the original chromStart. */ unsigned int vcfRecordTrimAllelesRight(struct vcfRecord *rec); /* Some tools output indels with extra base to the right, for example ref=ACC, alt=ACCC * which should be ref=A, alt=AC. When the extra bases make the variant extend from an * intron (or gap) into an exon, it can cause a false appearance of a frameshift. * To avoid this, when all alleles have identical base(s) at the end, trim all of them, * and update rec->chromEnd. * For hgTracks' mapBox we need the correct chromStart for identifying the record in hgc, * so return the original chromEnd. */ int vcfRecordCmp(const void *va, const void *vb); /* Compare to sort based on position. */ void vcfFileFree(struct vcfFile **vcffPtr); /* Free a vcfFile object. */ const struct vcfRecord *vcfFileFindVariant(struct vcfFile *vcff, char *variantId); /* Return all records with name=variantId, or NULL if not found. */ const struct vcfInfoElement *vcfRecordFindInfo(const struct vcfRecord *record, char *key); /* Find an INFO element, or NULL. */ struct vcfInfoDef *vcfInfoDefForKey(struct vcfFile *vcff, const char *key); /* Return infoDef for key, or NULL if it wasn't specified in the header or VCF spec. */ void vcfParseGenotypes(struct vcfRecord *record); /* Translate record->genotypesUnparsedStrings[] into proper struct vcfGenotype[]. * This destroys genotypesUnparsedStrings. */ void vcfParseGenotypesGtOnly(struct vcfRecord *record); /* Translate record->genotypesUnparsedStrings[] into proper struct vcfGenotype[], but ignore * genotype info elements, IDs, etc; parse only the actual genotypes (e.g. for quick display * in hgTracks). This destroys genotypesUnparsedStrings. */ const struct vcfGenotype *vcfRecordFindGenotype(struct vcfRecord *record, char *sampleId); /* Find the genotype and associated info for the individual, or return NULL. * This calls vcfParseGenotypes if it has not already been called. */ struct vcfInfoDef *vcfInfoDefForGtKey(struct vcfFile *vcff, const char *key); /* Look up the type of genotype FORMAT component key, in the definitions from the header, * and failing that, from the keys reserved in the spec. */ const struct vcfInfoElement *vcfGenotypeFindInfo(const struct vcfGenotype *gt, char *key); /* Find the genotype infoElement for key, or return NULL. */ int vcfGenotypeIndex(int h0Ix, int h1Ix); /* Return the index in a linear array of distinct genotypes, given two 0-based allele indexes. * This follows the following convention used by GnomAD (GATK?), that has the advantage that * gt indexes of small numbers don't change as the number of alleles increases, and also matches * the ref/ref, ref/alt, alt/alt convention for biallelic variants: * 0/0, * 0/1, 1/1, * 0/2, 1/2, 2/2, * 0/3, 1/3, 2/3, 3/3, * ... */ void vcfCountGenotypes(struct vcfRecord *rec, int **retGtCounts, int **retAlleleCounts, int *retPhasedCount, int *retDiploidCount); /* Tally genotypes and alleles for summary, adding 1 to rec->alleleCount to represent missing data */ char *vcfFilePooledStr(struct vcfFile *vcff, char *str); /* Allocate memory for a string from vcff's shared string pool. */ #define VCF_NUM_COLS 10 +#define VCF_NUM_COLS_BEFORE_GENOTYPES 9 struct asObject *vcfAsObj(); // Return asObject describing fields of VCF char *vcfGetSlashSepAllelesFromWords(char **words, struct dyString *dy); /* Overwrite dy with a /-separated allele string from VCF words, * skipping the extra initial base that VCF requires for indel alleles if necessary. * Return dy->string for convenience. */ void vcfRecordWriteNoGt(FILE *f, struct vcfRecord *rec); /* Write the first 8 columns of VCF rec to f. Genotype data will be ignored if present. */ // Characters we expect to see in |-separated parts of an ##INFO description that specifies // tabular contents: #define COL_DESC_WORD_REGEX "[A-Za-z_0-9.-]+" // Series of |-separated words: #define COL_DESC_REGEX COL_DESC_WORD_REGEX"(\\|"COL_DESC_WORD_REGEX")+" // Minimum number of |-separated values for interpreting descriptions and values as tabular: #define MIN_COLUMN_COUNT 3 boolean looksTabular(const struct vcfInfoDef *def, const struct vcfInfoElement *el); /* Return TRUE if def->description seems to contain a |-separated description of columns * and el's first non-empty string value has the same number of |-separated parts. */ #endif // vcf_h