11e45667d4e291b3038ccda729a1cdf5bcaf004a braney Mon Jul 11 15:46:54 2016 -0700 incorporate htslib in kent src, remove USE_BAM, USE_SAMTABIX, USE_TABIX defines, modify a bunch of makefiles to include kentSrc variable pointing to top of the tree. diff --git src/htslib/htslib/vcf.h src/htslib/htslib/vcf.h new file mode 100644 index 0000000..f5681ad --- /dev/null +++ src/htslib/htslib/vcf.h @@ -0,0 +1,907 @@ +/* vcf.h -- VCF/BCF API functions. + + Copyright (C) 2012, 2013 Broad Institute. + Copyright (C) 2012-2014 Genome Research Ltd. + + Author: Heng Li <lh3@sanger.ac.uk> + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +/* + todo: + - make the function names consistent + - provide calls to abstract away structs as much as possible + */ + +#ifndef HTSLIB_VCF_H +#define HTSLIB_VCF_H + +#include <stdint.h> +#include <limits.h> +#include <assert.h> +#include "hts.h" +#include "kstring.h" +#include "hts_defs.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/***************** + * Header struct * + *****************/ + +#define BCF_HL_FLT 0 // header line +#define BCF_HL_INFO 1 +#define BCF_HL_FMT 2 +#define BCF_HL_CTG 3 +#define BCF_HL_STR 4 // structured header line TAG=<A=..,B=..> +#define BCF_HL_GEN 5 // generic header line + +#define BCF_HT_FLAG 0 // header type +#define BCF_HT_INT 1 +#define BCF_HT_REAL 2 +#define BCF_HT_STR 3 + +#define BCF_VL_FIXED 0 // variable length +#define BCF_VL_VAR 1 +#define BCF_VL_A 2 +#define BCF_VL_G 3 +#define BCF_VL_R 4 + +/* === Dictionary === + + The header keeps three dictonaries. The first keeps IDs in the + "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths + in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[] + is the actual hash table, which is opaque to the end users. In the hash + table, the key is the ID or sample name as a C string and the value is a + bcf_idinfo_t struct. bcf_hdr_t::id[] points to key-value pairs in the hash + table in the order that they appear in the VCF header. bcf_hdr_t::n[] is the + size of the hash table or, equivalently, the length of the id[] arrays. +*/ + +#define BCF_DT_ID 0 // dictionary type +#define BCF_DT_CTG 1 +#define BCF_DT_SAMPLE 2 + +// Complete textual representation of a header line +typedef struct { + int type; // One of the BCF_HL_* type + char *key; // The part before '=', i.e. FILTER/INFO/FORMAT/contig/fileformat etc. + char *value; // Set only for generic lines, NULL for FILTER/INFO, etc. + int nkeys; // Number of structured fields + char **keys, **vals; // The key=value pairs +} bcf_hrec_t; + +typedef struct { + uint32_t info[3]; // stores Number:20, var:4, Type:4, ColType:4 in info[0..2] + // for BCF_HL_FLT,INFO,FMT and contig length in info[0] for BCF_HL_CTG + bcf_hrec_t *hrec[3]; + int id; +} bcf_idinfo_t; + +typedef struct { + const char *key; + const bcf_idinfo_t *val; +} bcf_idpair_t; + +// Note that bcf_hdr_t structs must always be created via bcf_hdr_init() +typedef struct { + int32_t n[3]; // n:the size of the dictionary block in use, (allocated size, m, is below to preserve ABI) + bcf_idpair_t *id[3]; + void *dict[3]; // ID dictionary, contig dict and sample dict + char **samples; + bcf_hrec_t **hrec; + int nhrec, dirty; + int ntransl, *transl[2]; // for bcf_translate() + int nsamples_ori; // for bcf_hdr_set_samples() + uint8_t *keep_samples; + kstring_t mem; + int32_t m[3]; // m: allocated size of the dictionary block in use (see n above) +} bcf_hdr_t; + +extern uint8_t bcf_type_shift[]; + +/************** + * VCF record * + **************/ + +#define BCF_BT_NULL 0 +#define BCF_BT_INT8 1 +#define BCF_BT_INT16 2 +#define BCF_BT_INT32 3 +#define BCF_BT_FLOAT 5 +#define BCF_BT_CHAR 7 + +#define VCF_REF 0 +#define VCF_SNP 1 +#define VCF_MNP 2 +#define VCF_INDEL 4 +#define VCF_OTHER 8 + +typedef struct { + int type, n; // variant type and the number of bases affected, negative for deletions +} variant_t; + +typedef struct { + int id; // id: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$id].key + int n, size, type; // n: number of values per-sample; size: number of bytes per-sample; type: one of BCF_BT_* types + uint8_t *p; // same as vptr and vptr_* in bcf_info_t below + uint32_t p_len; + uint32_t p_off:31, p_free:1; +} bcf_fmt_t; + +typedef struct { + int key; // key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key + int type, len; // type: one of BCF_BT_* types; len: vector length, 1 for scalars + union { + int32_t i; // integer value + float f; // float value + } v1; // only set if $len==1; for easier access + uint8_t *vptr; // pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes + uint32_t vptr_len; // length of the vptr block or, when set, of the vptr_mod block, excluding offset + uint32_t vptr_off:31, // vptr offset, i.e., the size of the INFO key plus size+type bytes + vptr_free:1; // indicates that vptr-vptr_off must be freed; set only when modified and the new + // data block is bigger than the original +} bcf_info_t; + + +#define BCF1_DIRTY_ID 1 +#define BCF1_DIRTY_ALS 2 +#define BCF1_DIRTY_FLT 4 +#define BCF1_DIRTY_INF 8 + +typedef struct { + int m_fmt, m_info, m_id, m_als, m_allele, m_flt; // allocated size (high-water mark); do not change + int n_flt; // Number of FILTER fields + int *flt; // FILTER keys in the dictionary + char *id, *als; // ID and REF+ALT block (\0-seperated) + char **allele; // allele[0] is the REF (allele[] pointers to the als block); all null terminated + bcf_info_t *info; // INFO + bcf_fmt_t *fmt; // FORMAT and individual sample + variant_t *var; // $var and $var_type set only when set_variant_types called + int n_var, var_type; + int shared_dirty; // if set, shared.s must be recreated on BCF output + int indiv_dirty; // if set, indiv.s must be recreated on BCF output +} bcf_dec_t; + + +#define BCF_ERR_CTG_UNDEF 1 +#define BCF_ERR_TAG_UNDEF 2 +#define BCF_ERR_NCOLS 4 +#define BCF_ERR_LIMITS 8 + +/* + The bcf1_t structure corresponds to one VCF/BCF line. Reading from VCF file + is slower because the string is first to be parsed, packed into BCF line + (done in vcf_parse), then unpacked into internal bcf1_t structure. If it + is known in advance that some of the fields will not be required (notably + the sample columns), parsing of these can be skipped by setting max_unpack + appropriately. + Similarly, it is fast to output a BCF line because the columns (kept in + shared.s, indiv.s, etc.) are written directly by bcf_write, whereas a VCF + line must be formatted in vcf_format. + */ +typedef struct { + int32_t rid; // CHROM + int32_t pos; // POS + int32_t rlen; // length of REF + float qual; // QUAL + uint32_t n_info:16, n_allele:16; + uint32_t n_fmt:8, n_sample:24; + kstring_t shared, indiv; + bcf_dec_t d; // lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack() + int max_unpack; // Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed + int unpacked; // remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work + int unpack_size[3]; // the original block size of ID, REF+ALT and FILTER + int errcode; // one of BCF_ERR_* codes +} bcf1_t; + +/******* + * API * + *******/ + + /*********************************************************************** + * BCF and VCF I/O + * + * A note about naming conventions: htslib internally represents VCF + * records as bcf1_t data structures, therefore most functions are + * prefixed with bcf_. There are a few exceptions where the functions must + * be aware of both BCF and VCF worlds, such as bcf_parse vs vcf_parse. In + * these cases, functions prefixed with bcf_ are more general and work + * with both BCF and VCF. + * + ***********************************************************************/ + + /** These macros are defined only for consistency with other parts of htslib */ + #define bcf_init1() bcf_init() + #define bcf_read1(fp,h,v) bcf_read((fp),(h),(v)) + #define vcf_read1(fp,h,v) vcf_read((fp),(h),(v)) + #define bcf_write1(fp,h,v) bcf_write((fp),(h),(v)) + #define vcf_write1(fp,h,v) vcf_write((fp),(h),(v)) + #define bcf_destroy1(v) bcf_destroy(v) + #define bcf_empty1(v) bcf_empty(v) + #define vcf_parse1(s,h,v) vcf_parse((s),(h),(v)) + #define bcf_clear1(v) bcf_clear(v) + #define vcf_format1(h,v,s) vcf_format((h),(v),(s)) + + /** + * bcf_hdr_init() - create an empty BCF header. + * @param mode "r" or "w" + * + * When opened for writing, the mandatory fileFormat and + * FILTER=PASS lines are added automatically. + */ + bcf_hdr_t *bcf_hdr_init(const char *mode); + + /** Destroy a BCF header struct */ + void bcf_hdr_destroy(bcf_hdr_t *h); + + /** Initialize a bcf1_t object; equivalent to calloc(1, sizeof(bcf1_t)) */ + bcf1_t *bcf_init(void); + + /** Deallocate a bcf1_t object */ + void bcf_destroy(bcf1_t *v); + + /** + * Same as bcf_destroy() but frees only the memory allocated by bcf1_t, + * not the bcf1_t object itself. + */ + void bcf_empty(bcf1_t *v); + + /** + * Make the bcf1_t object ready for next read. Intended mostly for + * internal use, the user should rarely need to call this function + * directly. + */ + void bcf_clear(bcf1_t *v); + + + /** bcf_open and vcf_open mode: please see hts_open() in hts.h */ + typedef htsFile vcfFile; + #define bcf_open(fn, mode) hts_open((fn), (mode)) + #define vcf_open(fn, mode) hts_open((fn), (mode)) + #define bcf_close(fp) hts_close(fp) + #define vcf_close(fp) hts_close(fp) + + /** Reads VCF or BCF header */ + bcf_hdr_t *bcf_hdr_read(htsFile *fp); + + /** + * bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed + * @samples: samples to include or exclude from file or as a comma-separated string. + * LIST|FILE .. select samples in list/file + * ^LIST|FILE .. exclude samples from list/file + * - .. include all samples + * NULL .. exclude all samples + * @is_file: @samples is a file (1) or a comma-separated list (0) + * + * The bottleneck of VCF reading is parsing of genotype fields. If the + * reader knows in advance that only subset of samples is needed (possibly + * no samples at all), the performance of bcf_read() can be significantly + * improved by calling bcf_hdr_set_samples after bcf_hdr_read(). + * The function bcf_read() will subset the VCF/BCF records automatically + * with the notable exception when reading records via bcf_itr_next(). + * In this case, bcf_subset_format() must be called explicitly, because + * bcf_readrec() does not see the header. + * + * Returns 0 on success, -1 on error or a positive integer if the list + * contains samples not present in the VCF header. In such a case, the + * return value is the index of the offending sample. + */ + int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file); + int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec); + + + /** Writes VCF or BCF header */ + int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h); + + /** Parse VCF line contained in kstring and populate the bcf1_t struct */ + int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v); + + /** The opposite of vcf_parse. It should rarely be called directly, see vcf_write */ + int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s); + + /** + * bcf_read() - read next VCF or BCF record + * + * Returns -1 on critical errors, 0 otherwise. On errors which are not + * critical for reading, such as missing header definitions, v->errcode is + * set to one of BCF_ERR* code and must be checked before calling + * vcf_write(). + */ + int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v); + + /** + * bcf_unpack() - unpack/decode a BCF record (fills the bcf1_t::d field) + * + * Note that bcf_unpack() must be called even when reading VCF. It is safe + * to call the function repeatedly, it will not unpack the same field + * twice. + */ + #define BCF_UN_STR 1 // up to ALT inclusive + #define BCF_UN_FLT 2 // up to FILTER + #define BCF_UN_INFO 4 // up to INFO + #define BCF_UN_SHR (BCF_UN_STR|BCF_UN_FLT|BCF_UN_INFO) // all shared information + #define BCF_UN_FMT 8 // unpack format and each sample + #define BCF_UN_IND BCF_UN_FMT // a synonymo of BCF_UN_FMT + #define BCF_UN_ALL (BCF_UN_SHR|BCF_UN_FMT) // everything + int bcf_unpack(bcf1_t *b, int which); + + /* + * bcf_dup() - create a copy of BCF record. + * + * Note that bcf_unpack() must be called on the returned copy as if it was + * obtained from bcf_read(). Also note that bcf_dup() calls bcf_sync1(src) + * internally to reflect any changes made by bcf_update_* functions. + */ + bcf1_t *bcf_dup(bcf1_t *src); + bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src); + + /** + * bcf_write() - write one VCF or BCF record. The type is determined at the open() call. + */ + int bcf_write(htsFile *fp, bcf_hdr_t *h, bcf1_t *v); + + /** + * The following functions work only with VCFs and should rarely be called + * directly. Usually one wants to use their bcf_* alternatives, which work + * transparently with both VCFs and BCFs. + */ + bcf_hdr_t *vcf_hdr_read(htsFile *fp); + int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h); + int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v); + int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v); + + /** Helper function for the bcf_itr_next() macro; internal use, ignore it */ + int bcf_readrec(BGZF *fp, void *null, void *v, int *tid, int *beg, int *end); + + + + /************************************************************************** + * Header querying and manipulation routines + **************************************************************************/ + + /** Create a new header using the supplied template */ + bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr); + + /** + * Copy header lines from src to dst if not already present in dst. See also bcf_translate(). + * Returns 0 on success or sets a bit on error: + * 1 .. conflicting definitions of tag length + * // todo + */ + int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src) HTS_DEPRECATED("Please use bcf_hdr_merge instead"); + + /** + * bcf_hdr_merge() - copy header lines from src to dst, see also bcf_translate() + * @param dst: the destination header to be merged into, NULL on the first pass + * @param src: the source header + * + * Notes: + * - use as: + * bcf_hdr_t *dst = NULL; + * for (i=0; i<nsrc; i++) dst = bcf_hdr_merge(dst,src[i]); + * + * - bcf_hdr_merge() replaces bcf_hdr_combine() which had a problem when + * combining multiple BCF headers. The current bcf_hdr_combine() + * does not have this problem, but became slow when used for many files. + */ + bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src); + + /** + * bcf_hdr_add_sample() - add a new sample. + * @param sample: sample name to be added + */ + int bcf_hdr_add_sample(bcf_hdr_t *hdr, const char *sample); + + /** Read VCF header from a file and update the header */ + int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname); + + /** Returns formatted header (newly allocated string) and its length, + * excluding the terminating \0. If is_bcf parameter is unset, IDX + * fields are discarded. + */ + char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len); + + /** Append new VCF header line, returns 0 on success */ + int bcf_hdr_append(bcf_hdr_t *h, const char *line); + int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...); + + /** VCF version, e.g. VCFv4.2 */ + const char *bcf_hdr_get_version(const bcf_hdr_t *hdr); + void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version); + + /** + * bcf_hdr_remove() - remove VCF header tag + * @param type: one of BCF_HL_* + * @param key: tag name or NULL to remove all tags of the given type + */ + void bcf_hdr_remove(bcf_hdr_t *h, int type, const char *key); + + /** + * bcf_hdr_subset() - creates a new copy of the header removing unwanted samples + * @param n: number of samples to keep + * @param samples: names of the samples to keep + * @param imap: mapping from index in @samples to the sample index in the original file + * + * Sample names not present in h0 are ignored. The number of unmatched samples can be checked + * by comparing n and bcf_hdr_nsamples(out_hdr). + * This function can be used to reorder samples. + * See also bcf_subset() which subsets individual records. + */ + bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap); + + /** Creates a list of sequence names. It is up to the caller to free the list (but not the sequence names) */ + const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *nseqs); + + /** Get number of samples */ + #define bcf_hdr_nsamples(hdr) (hdr)->n[BCF_DT_SAMPLE] + + + /** The following functions are for internal use and should rarely be called directly */ + int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt); + int bcf_hdr_sync(bcf_hdr_t *h); + bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len); + void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str); + int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec); + /** + * bcf_hdr_get_hrec() - get header line info + * @param type: one of the BCF_HL_* types: FLT,INFO,FMT,CTG,STR,GEN + * @param key: the header key for generic lines (e.g. "fileformat"), any field + * for structured lines, typically "ID". + * @param value: the value which pairs with key. Can be be NULL for BCF_HL_GEN + * @param str_class: the class of BCF_HL_STR line (e.g. "ALT" or "SAMPLE"), otherwise NULL + */ + bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class); + bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec); + void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len); + void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted); + int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key); + void hrec_add_idx(bcf_hrec_t *hrec, int idx); + void bcf_hrec_destroy(bcf_hrec_t *hrec); + + + + /************************************************************************** + * Individual record querying and manipulation routines + **************************************************************************/ + + /** See the description of bcf_hdr_subset() */ + int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap); + + /** + * bcf_translate() - translate tags ids to be consistent with different header. This function + * is useful when lines from multiple VCF need to be combined. + * @dst_hdr: the destination header, to be used in bcf_write(), see also bcf_hdr_combine() + * @src_hdr: the source header, used in bcf_read() + * @src_line: line obtained by bcf_read() + */ + int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line); + + /** + * bcf_get_variant_type[s]() - returns one of VCF_REF, VCF_SNP, etc + */ + int bcf_get_variant_types(bcf1_t *rec); + int bcf_get_variant_type(bcf1_t *rec, int ith_allele); + int bcf_is_snp(bcf1_t *v); + + /** + * bcf_update_filter() - sets the FILTER column + * @flt_ids: The filter IDs to set, numeric IDs returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS") + * @n: Number of filters. If n==0, all filters are removed + */ + int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n); + /** + * bcf_add_filter() - adds to the FILTER column + * @flt_id: filter ID to add, numeric ID returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS") + * + * If flt_id is PASS, all existing filters are removed first. If other than PASS, existing PASS is removed. + */ + int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id); + /** + * bcf_remove_filter() - removes from the FILTER column + * @flt_id: filter ID to remove, numeric ID returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS") + * @pass: when set to 1 and no filters are present, set to PASS + */ + int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass); + /** + * Returns 1 if present, 0 if absent, or -1 if filter does not exist. "PASS" and "." can be used interchangeably. + */ + int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter); + /** + * bcf_update_alleles() and bcf_update_alleles_str() - update REF and ALLT column + * @alleles: Array of alleles + * @nals: Number of alleles + * @alleles_string: Comma-separated alleles, starting with the REF allele + */ + int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals); + int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string); + + /** + * bcf_update_id() - sets new ID string + * bcf_add_id() - adds to the ID string checking for duplicates + */ + int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id); + int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id); + + /* + * bcf_update_info_*() - functions for updating INFO fields + * @hdr: the BCF header + * @line: VCF line to be edited + * @key: the INFO tag to be updated + * @values: pointer to the array of values. Pass NULL to remove the tag. + * @n: number of values in the array. When set to 0, the INFO tag is removed + * + * The @string in bcf_update_info_flag() is optional, @n indicates whether + * the flag is set or removed. + * + * Returns 0 on success or negative value on error. + */ + #define bcf_update_info_int32(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_INT) + #define bcf_update_info_float(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_REAL) + #define bcf_update_info_flag(hdr,line,key,string,n) bcf_update_info((hdr),(line),(key),(string),(n),BCF_HT_FLAG) + #define bcf_update_info_string(hdr,line,key,string) bcf_update_info((hdr),(line),(key),(string),1,BCF_HT_STR) + int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type); + + /* + * bcf_update_format_*() - functions for updating FORMAT fields + * @values: pointer to the array of values, the same number of elements + * is expected for each sample. Missing values must be padded + * with bcf_*_missing or bcf_*_vector_end values. + * @n: number of values in the array. If n==0, existing tag is removed. + * + * The function bcf_update_format_string() is a higher-level (slower) variant of + * bcf_update_format_char(). The former accepts array of \0-terminated strings + * whereas the latter requires that the strings are collapsed into a single array + * of fixed-length strings. In case of strings with variable length, shorter strings + * can be \0-padded. Note that the collapsed strings passed to bcf_update_format_char() + * are not \0-terminated. + * + * Returns 0 on success or negative value on error. + */ + #define bcf_update_format_int32(hdr,line,key,values,n) bcf_update_format((hdr),(line),(key),(values),(n),BCF_HT_INT) + #define bcf_update_format_float(hdr,line,key,values,n) bcf_update_format((hdr),(line),(key),(values),(n),BCF_HT_REAL) + #define bcf_update_format_char(hdr,line,key,values,n) bcf_update_format((hdr),(line),(key),(values),(n),BCF_HT_STR) + #define bcf_update_genotypes(hdr,line,gts,n) bcf_update_format((hdr),(line),"GT",(gts),(n),BCF_HT_INT) // See bcf_gt_ macros below + int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n); + int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type); + + // Macros for setting genotypes correctly, for use with bcf_update_genotypes only; idx corresponds + // to VCF's GT (1-based index to ALT or 0 for the reference allele) and val is the opposite, obtained + // from bcf_get_genotypes() below. + #define bcf_gt_phased(idx) (((idx)+1)<<1|1) + #define bcf_gt_unphased(idx) (((idx)+1)<<1) + #define bcf_gt_missing 0 + #define bcf_gt_is_missing(val) ((val)>>1 ? 0 : 1) + #define bcf_gt_is_phased(idx) ((idx)&1) + #define bcf_gt_allele(val) (((val)>>1)-1) + + /** Conversion between alleles indexes to Number=G genotype index (assuming diploid, all 0-based) */ + #define bcf_alleles2gt(a,b) ((a)>(b)?((a)*((a)+1)/2+(b)):((b)*((b)+1)/2+(a))) + static inline void bcf_gt2alleles(int igt, int *a, int *b) + { + int k = 0, dk = 1; + while ( k<igt ) { dk++; k += dk; } + *b = dk - 1; *a = igt - k + *b; + } + + /** + * bcf_get_fmt() - returns pointer to FORMAT's field data + * @header: for access to BCF_DT_ID dictionary + * @line: VCF line obtained from vcf_parse1 + * @fmt: one of GT,PL,... + * + * Returns bcf_fmt_t* if the call succeeded, or returns NULL when the field + * is not available. + */ + bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key); + bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key); + + /** + * bcf_get_*_id() - returns pointer to FORMAT/INFO field data given the header index instead of the string ID + * @line: VCF line obtained from vcf_parse1 + * @id: The header index for the tag, obtained from bcf_hdr_id2int() + * + * Returns bcf_fmt_t* / bcf_info_t*. These functions do not check if the index is valid + * as their goal is to avoid the header lookup. + */ + bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id); + bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id); + + /** + * bcf_get_info_*() - get INFO values, integers or floats + * @hdr: BCF header + * @line: BCF record + * @tag: INFO tag to retrieve + * @dst: *dst is pointer to a memory location, can point to NULL + * @ndst: pointer to the size of allocated memory + * + * Returns negative value on error or the number of written values on + * success. bcf_get_info_string() returns on success the number of + * characters written excluding the null-terminating byte. bcf_get_info_flag() + * returns 1 when flag is set or 0 if not. + * + * List of return codes: + * -1 .. no such INFO tag defined in the header + * -2 .. clash between types defined in the header and encountered in the VCF record + * -3 .. tag is not present in the VCF record + */ + #define bcf_get_info_int32(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_INT) + #define bcf_get_info_float(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_REAL) + #define bcf_get_info_string(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_STR) + #define bcf_get_info_flag(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_FLAG) + int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type); + + /** + * bcf_get_format_*() - same as bcf_get_info*() above + * + * The function bcf_get_format_string() is a higher-level (slower) variant of bcf_get_format_char(). + * see the description of bcf_update_format_string() and bcf_update_format_char() above. + * Unlike other bcf_get_format__*() functions, bcf_get_format_string() allocates two arrays: + * a single block of \0-terminated strings collapsed into a single array and an array of pointers + * to these strings. Both arrays must be cleaned by the user. + * + * Returns negative value on error or the number of written values on success. + * + * Example: + * int ndst = 0; char **dst = NULL; + * if ( bcf_get_format_string(hdr, line, "XX", &dst, &ndst) > 0 ) + * for (i=0; i<bcf_hdr_nsamples(hdr); i++) printf("%s\n", dst[i]); + * free(dst[0]); free(dst); + * + * Example: + * int ngt, *gt_arr = NULL, ngt_arr = 0; + * ngt = bcf_get_genotypes(hdr, line, >_arr, &ngt_arr); + */ + #define bcf_get_format_int32(hdr,line,tag,dst,ndst) bcf_get_format_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_INT) + #define bcf_get_format_float(hdr,line,tag,dst,ndst) bcf_get_format_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_REAL) + #define bcf_get_format_char(hdr,line,tag,dst,ndst) bcf_get_format_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_STR) + #define bcf_get_genotypes(hdr,line,dst,ndst) bcf_get_format_values(hdr,line,"GT",(void**)(dst),ndst,BCF_HT_INT) + int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst); + int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type); + + + + /************************************************************************** + * Helper functions + **************************************************************************/ + + /** + * bcf_hdr_id2int() - Translates string into numeric ID + * bcf_hdr_int2id() - Translates numeric ID into string + * @type: one of BCF_DT_ID, BCF_DT_CTG, BCF_DT_SAMPLE + * @id: tag name, such as: PL, DP, GT, etc. + * + * Returns -1 if string is not in dictionary, otherwise numeric ID which identifies + * fields in BCF records. + */ + int bcf_hdr_id2int(const bcf_hdr_t *hdr, int type, const char *id); + #define bcf_hdr_int2id(hdr,type,int_id) ((hdr)->id[type][int_id].key) + + /** + * bcf_hdr_name2id() - Translates sequence names (chromosomes) into numeric ID + * bcf_hdr_id2name() - Translates numeric ID to sequence name + */ + static inline int bcf_hdr_name2id(const bcf_hdr_t *hdr, const char *id) { return bcf_hdr_id2int(hdr, BCF_DT_CTG, id); } + static inline const char *bcf_hdr_id2name(const bcf_hdr_t *hdr, int rid) { return hdr->id[BCF_DT_CTG][rid].key; } + static inline const char *bcf_seqname(const bcf_hdr_t *hdr, bcf1_t *rec) { return hdr->id[BCF_DT_CTG][rec->rid].key; } + + /** + * bcf_hdr_id2*() - Macros for accessing bcf_idinfo_t + * @type: one of BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT + * @int_id: return value of bcf_hdr_id2int, must be >=0 + * + * The returned values are: + * bcf_hdr_id2length .. whether the number of values is fixed or variable, one of BCF_VL_* + * bcf_hdr_id2number .. the number of values, 0xfffff for variable length fields + * bcf_hdr_id2type .. the field type, one of BCF_HT_* + * bcf_hdr_id2coltype .. the column type, one of BCF_HL_* + * + * Notes: Prior to using the macros, the presence of the info should be + * tested with bcf_hdr_idinfo_exists(). + */ + #define bcf_hdr_id2length(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>8 & 0xf) + #define bcf_hdr_id2number(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>12) + #define bcf_hdr_id2type(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>4 & 0xf) + #define bcf_hdr_id2coltype(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type] & 0xf) + #define bcf_hdr_idinfo_exists(hdr,type,int_id) ((int_id<0 || bcf_hdr_id2coltype(hdr,type,int_id)==0xf) ? 0 : 1) + #define bcf_hdr_id2hrec(hdr,dict_type,col_type,int_id) ((hdr)->id[(dict_type)==BCF_DT_CTG?BCF_DT_CTG:BCF_DT_ID][int_id].val->hrec[(dict_type)==BCF_DT_CTG?0:(col_type)]) + + void bcf_fmt_array(kstring_t *s, int n, int type, void *data); + uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr); + + void bcf_enc_vchar(kstring_t *s, int l, const char *a); + void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize); + void bcf_enc_vfloat(kstring_t *s, int n, float *a); + + + /************************************************************************** + * BCF index + * + * Note that these functions work with BCFs only. See synced_bcf_reader.h + * which provides (amongst other things) an API to work transparently with + * both indexed BCFs and VCFs. + **************************************************************************/ + + #define bcf_itr_destroy(iter) hts_itr_destroy(iter) + #define bcf_itr_queryi(idx, tid, beg, end) hts_itr_query((idx), (tid), (beg), (end), bcf_readrec) + #define bcf_itr_querys(idx, hdr, s) hts_itr_querys((idx), (s), (hts_name2id_f)(bcf_hdr_name2id), (hdr), hts_itr_query, bcf_readrec) + #define bcf_itr_next(htsfp, itr, r) hts_itr_next((htsfp)->fp.bgzf, (itr), (r), 0) + #define bcf_index_load(fn) hts_idx_load(fn, HTS_FMT_CSI) + #define bcf_index_seqnames(idx, hdr, nptr) hts_idx_seqnames((idx),(nptr),(hts_id2name_f)(bcf_hdr_id2name),(hdr)) + + hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx); + int bcf_index_build(const char *fn, int min_shift); + int bcf_index_build2(const char *fn, const char *fnidx, int min_shift); + +/******************* + * Typed value I/O * + *******************/ + +/* + Note that in contrast with BCFv2.1 specification, HTSlib implementation + allows missing values in vectors. For integer types, the values 0x80, + 0x8000, 0x80000000 are interpreted as missing values and 0x81, 0x8001, + 0x80000001 as end-of-vector indicators. Similarly for floats, the value of + 0x7F800001 is interpreted as a missing value and 0x7F800002 as an + end-of-vector indicator. + Note that the end-of-vector byte is not part of the vector. + + This trial BCF version (v2.2) is compatible with the VCF specification and + enables to handle correctly vectors with different ploidy in presence of + missing values. + */ +#define bcf_int8_vector_end (INT8_MIN+1) +#define bcf_int16_vector_end (INT16_MIN+1) +#define bcf_int32_vector_end (INT32_MIN+1) +#define bcf_str_vector_end 0 +#define bcf_int8_missing INT8_MIN +#define bcf_int16_missing INT16_MIN +#define bcf_int32_missing INT32_MIN +#define bcf_str_missing 0x07 +extern uint32_t bcf_float_vector_end; +extern uint32_t bcf_float_missing; +static inline void bcf_float_set(float *ptr, uint32_t value) +{ + union { uint32_t i; float f; } u; + u.i = value; + *ptr = u.f; +} +#define bcf_float_set_vector_end(x) bcf_float_set(&(x),bcf_float_vector_end) +#define bcf_float_set_missing(x) bcf_float_set(&(x),bcf_float_missing) +static inline int bcf_float_is_missing(float f) +{ + union { uint32_t i; float f; } u; + u.f = f; + return u.i==bcf_float_missing ? 1 : 0; +} +static inline int bcf_float_is_vector_end(float f) +{ + union { uint32_t i; float f; } u; + u.f = f; + return u.i==bcf_float_vector_end ? 1 : 0; +} + +static inline void bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) +{ + #define BRANCH(type_t, missing, vector_end) { \ + type_t *ptr = (type_t*) (fmt->p + isample*fmt->size); \ + int i; \ + for (i=0; i<fmt->n && ptr[i]!=vector_end; i++) \ + { \ + if ( i ) kputc("/|"[ptr[i]&1], str); \ + if ( !(ptr[i]>>1) ) kputc('.', str); \ + else kputw((ptr[i]>>1) - 1, str); \ + } \ + if (i == 0) kputc('.', str); \ + } + switch (fmt->type) { + case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; + default: fprintf(stderr,"FIXME: type %d in bcf_format_gt?\n", fmt->type); abort(); break; + } + #undef BRANCH +} + +static inline void bcf_enc_size(kstring_t *s, int size, int type) +{ + if (size >= 15) { + kputc(15<<4|type, s); + if (size >= 128) { + if (size >= 32768) { + int32_t x = size; + kputc(1<<4|BCF_BT_INT32, s); + kputsn((char*)&x, 4, s); + } else { + int16_t x = size; + kputc(1<<4|BCF_BT_INT16, s); + kputsn((char*)&x, 2, s); + } + } else { + kputc(1<<4|BCF_BT_INT8, s); + kputc(size, s); + } + } else kputc(size<<4|type, s); +} + +static inline int bcf_enc_inttype(long x) +{ + if (x <= INT8_MAX && x > bcf_int8_missing) return BCF_BT_INT8; + if (x <= INT16_MAX && x > bcf_int16_missing) return BCF_BT_INT16; + return BCF_BT_INT32; +} + +static inline void bcf_enc_int1(kstring_t *s, int32_t x) +{ + if (x == bcf_int32_vector_end) { + bcf_enc_size(s, 1, BCF_BT_INT8); + kputc(bcf_int8_vector_end, s); + } else if (x == bcf_int32_missing) { + bcf_enc_size(s, 1, BCF_BT_INT8); + kputc(bcf_int8_missing, s); + } else if (x <= INT8_MAX && x > bcf_int8_missing) { + bcf_enc_size(s, 1, BCF_BT_INT8); + kputc(x, s); + } else if (x <= INT16_MAX && x > bcf_int16_missing) { + int16_t z = x; + bcf_enc_size(s, 1, BCF_BT_INT16); + kputsn((char*)&z, 2, s); + } else { + int32_t z = x; + bcf_enc_size(s, 1, BCF_BT_INT32); + kputsn((char*)&z, 4, s); + } +} + +static inline int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q) +{ + if (type == BCF_BT_INT8) { + *q = (uint8_t*)p + 1; + return *(int8_t*)p; + } else if (type == BCF_BT_INT16) { + *q = (uint8_t*)p + 2; + return *(int16_t*)p; + } else { + *q = (uint8_t*)p + 4; + return *(int32_t*)p; + } +} + +static inline int32_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q) +{ + return bcf_dec_int1(p + 1, *p&0xf, q); +} + +static inline int32_t bcf_dec_size(const uint8_t *p, uint8_t **q, int *type) +{ + *type = *p & 0xf; + if (*p>>4 != 15) { + *q = (uint8_t*)p + 1; + return *p>>4; + } else return bcf_dec_typed_int1(p + 1, q); +} + +#ifdef __cplusplus +} +#endif + +#endif