7df6e18265341f87a69fba808aa1f92f8ebca841 markd Wed Apr 15 13:39:42 2026 -0700 move copy of htslib diff --git src/htslib/cram/sam_header.h src/htslib/cram/sam_header.h deleted file mode 100644 index e312df4e336..00000000000 --- src/htslib/cram/sam_header.h +++ /dev/null @@ -1,459 +0,0 @@ -/* -Copyright (c) 2013-2014 Genome Research Ltd. -Author: James Bonfield - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - - 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger -Institute nor the names of its contributors may be used to endorse or promote -products derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -/*! \file - * SAM header parsing. - * - * These functions can be shared between SAM, BAM and CRAM file - * formats as all three internally use the same string encoding for - * header fields. - */ - -/* - * TODO. - * - * - Sort order (parse to struct, enum type, updating funcs) - * - Removal of lines. - * - Updating of lines - */ - -#ifndef _SAM_HDR_H_ -#define _SAM_HDR_H_ - -#include - -#include "cram/string_alloc.h" -#include "cram/pooled_alloc.h" - -#include "htslib/khash.h" -#include "htslib/kstring.h" - -#ifdef __cplusplus -extern "C" { -#endif - -// For structure assignment. Eg kstring_t s = KS_INITIALIZER; -#define KS_INITIALIZER {0,0,0} - -// For initialisation elsewhere. Eg KS_INIT(x->str); -#define KS_INIT(ks) ((ks)->l = 0, (ks)->m = 0, (ks)->s = NULL) - -// Frees the string subfield only. Assumes 's' itself is static. -#define KS_FREE(ks) do { if ((ks)->s) free((ks)->s); } while(0) - -/* - * Proposed new SAM header parsing - -1 @SQ ID:foo LN:100 -2 @SQ ID:bar LN:200 -3 @SQ ID:ram LN:300 UR:xyz -4 @RG ID:r ... -5 @RG ID:s ... - -Hash table for 2-char @keys without dup entries. -If dup lines, we form a circular linked list. Ie hash keys = {RG, SQ}. - -HASH("SQ")--\ - | - (3) <-> 1 <-> 2 <-> 3 <-> (1) - -HASH("RG")--\ - | - (5) <-> 4 <-> 5 <-> (4) - -Items stored in the hash values also form their own linked lists: -Ie SQ->ID(foo)->LN(100) - SQ->ID(bar)->LN(200) - SQ->ID(ram)->LN(300)->UR(xyz) - RG->ID(r) - */ - -/*! A single key:value pair on a header line - * - * These form a linked list and hold strings. The strings are - * allocated from a string_alloc_t pool referenced in the master - * SAM_hdr structure. Do not attempt to free, malloc or manipulate - * these strings directly. - */ -typedef struct SAM_hdr_tag_s { - struct SAM_hdr_tag_s *next; - char *str; - int len; -} SAM_hdr_tag; - -/*! The parsed version of the SAM header string. - * - * Each header type (SQ, RG, HD, etc) points to its own SAM_hdr_type - * struct via the main hash table h in the SAM_hdr struct. - * - * These in turn consist of circular bi-directional linked lists (ie - * rings) to hold the multiple instances of the same header type - * code. For example if we have 5 \@SQ lines the primary hash table - * will key on \@SQ pointing to the first SAM_hdr_type and that in turn - * will be part of a ring of 5 elements. - * - * For each SAM_hdr_type structure we also point to a SAM_hdr_tag - * structure which holds the tokenised attributes; the tab separated - * key:value pairs per line. - */ -typedef struct SAM_hdr_item_s { - struct SAM_hdr_item_s *next; // cirular - struct SAM_hdr_item_s *prev; - SAM_hdr_tag *tag; // first tag - int order; // 0 upwards -} SAM_hdr_type; - -/*! Parsed \@SQ lines */ -typedef struct { - char *name; - uint32_t len; - SAM_hdr_type *ty; - SAM_hdr_tag *tag; -} SAM_SQ; - -/*! Parsed \@RG lines */ -typedef struct { - char *name; - SAM_hdr_type *ty; - SAM_hdr_tag *tag; - int name_len; - int id; // numerical ID -} SAM_RG; - -/*! Parsed \@PG lines */ -typedef struct { - char *name; - SAM_hdr_type *ty; - SAM_hdr_tag *tag; - int name_len; - int id; // numerical ID - int prev_id; // -1 if none -} SAM_PG; - -/*! Sort order parsed from @HD line */ -enum sam_sort_order { - ORDER_UNKNOWN =-1, - ORDER_UNSORTED = 0, - ORDER_NAME = 1, - ORDER_COORD = 2, - //ORDER_COLLATE = 3 // maybe one day! -}; - -KHASH_MAP_INIT_INT(sam_hdr, SAM_hdr_type*) -KHASH_MAP_INIT_STR(m_s2i, int) - -/*! Primary structure for header manipulation - * - * The initial header text is held in the text kstring_t, but is also - * parsed out into SQ, RG and PG arrays. These have a hash table - * associated with each to allow lookup by ID or SN fields instead of - * their numeric array indices. Additionally PG has an array to hold - * the linked list start points (the last in a PP chain). - * - * Use the appropriate sam_hdr_* functions to edit the header, and - * call sam_hdr_rebuild() any time the textual form needs to be - * updated again. - */ -typedef struct { - kstring_t text; //!< concatenated text, indexed by SAM_hdr_tag - khash_t(sam_hdr) *h; - string_alloc_t *str_pool; //!< Pool of SAM_hdr_tag->str strings - pool_alloc_t *type_pool;//!< Pool of SAM_hdr_type structs - pool_alloc_t *tag_pool; //!< Pool of SAM_hdr_tag structs - - // @SQ lines / references - int nref; //!< Number of \@SQ lines - SAM_SQ *ref; //!< Array of parsed \@SQ lines - khash_t(m_s2i) *ref_hash; //!< Maps SQ SN field to sq[] index - - // @RG lines / read-groups - int nrg; //!< Number of \@RG lines - SAM_RG *rg; //!< Array of parsed \@RG lines - khash_t(m_s2i) *rg_hash; //!< Maps RG ID field to rg[] index - - // @PG lines / programs - int npg; //!< Number of \@PG lines - int npg_end; //!< Number of terminating \@PG lines - int npg_end_alloc; //!< Size of pg_end field - SAM_PG *pg; //!< Array of parsed \@PG lines - khash_t(m_s2i) *pg_hash; //!< Maps PG ID field to pg[] index - int *pg_end; //!< \@PG chain termination IDs - - // @HD data - enum sam_sort_order sort_order; //!< @HD SO: field - - // @cond internal - char ID_buf[1024]; // temporary buffer - int ID_cnt; - int ref_count; // number of uses of this SAM_hdr - // @endcond -} SAM_hdr; - -/*! Creates an empty SAM header, ready to be populated. - * - * @return - * Returns a SAM_hdr struct on success (free with sam_hdr_free()) - * NULL on failure - */ -SAM_hdr *sam_hdr_new(void); - -/*! Tokenises a SAM header into a hash table. - * - * Also extracts a few bits on specific data types, such as @RG lines. - * - * @return - * Returns a SAM_hdr struct on success (free with sam_hdr_free()); - * NULL on failure - */ -SAM_hdr *sam_hdr_parse_(const char *hdr, int len); - - -/*! Produces a duplicate copy of hdr and returns it. - * @return - * Returns NULL on failure - */ -SAM_hdr *sam_hdr_dup(SAM_hdr *hdr); - - -/*! Increments a reference count on hdr. - * - * This permits multiple files to share the same header, all calling - * sam_hdr_free when done, without causing errors for other open files. - */ -void sam_hdr_incr_ref(SAM_hdr *hdr); - - -/*! Increments a reference count on hdr. - * - * This permits multiple files to share the same header, all calling - * sam_hdr_free when done, without causing errors for other open files. - * - * If the reference count hits zero then the header is automatically - * freed. This makes it a synonym for sam_hdr_free(). - */ -void sam_hdr_decr_ref(SAM_hdr *hdr); - - -/*! Deallocates all storage used by a SAM_hdr struct. - * - * This also decrements the header reference count. If after decrementing - * it is still non-zero then the header is assumed to be in use by another - * caller and the free is not done. - * - * This is a synonym for sam_hdr_dec_ref(). - */ -void sam_hdr_free(SAM_hdr *hdr); - -/*! Returns the current length of the SAM_hdr in text form. - * - * Call sam_hdr_rebuild() first if editing has taken place. - */ -int sam_hdr_length(SAM_hdr *hdr); - -/*! Returns the string form of the SAM_hdr. - * - * Call sam_hdr_rebuild() first if editing has taken place. - */ -char *sam_hdr_str(SAM_hdr *hdr); - -/*! Appends a formatted line to an existing SAM header. - * - * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with - * optional new-line. If it contains more than 1 line then multiple lines - * will be added in order. - * - * Input text is of maximum length len or as terminated earlier by a NUL. - * Len may be 0 if unknown, in which case lines must be NUL-terminated. - * - * @return - * Returns 0 on success; - * -1 on failure - */ -int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len); - -/*! Adds a single line to a SAM header. - * - * Specify type and one or more key,value pairs, ending with the NULL key. - * Eg. sam_hdr_add(h, "SQ", "ID", "foo", "LN", "100", NULL). - * - * @return - * Returns 0 on success; - * -1 on failure - */ -int sam_hdr_add(SAM_hdr *sh, const char *type, ...); - -/*! Adds a single line to a SAM header. - * - * This is much like sam_hdr_add() but with the additional va_list - * argument. This is followed by specifying type and one or more - * key,value pairs, ending with the NULL key. - * - * Eg. sam_hdr_vadd(h, "SQ", args, "ID", "foo", "LN", "100", NULL). - * - * The purpose of the additional va_list parameter is to permit other - * varargs functions to call this while including their own additional - * parameters; an example is in sam_hdr_add_PG(). - * - * Note: this function invokes va_arg at least once, making the value - * of ap indeterminate after the return. The caller should call - * va_start/va_end before/after calling this function or use va_copy. - * - * @return - * Returns 0 on success; - * -1 on failure - */ -int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...); - -/*! - * @return - * Returns the first header item matching 'type'. If ID is non-NULL it checks - * for the tag ID: and compares against the specified ID. - * - * Returns NULL if no type/ID is found - */ -SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type, - char *ID_key, char *ID_value); - -/*! - * - * As per SAM_hdr_type, but returns a complete line of formatted text - * for a specific head type/ID combination. If ID is NULL then it returns - * the first line of the specified type. - * - * The returned string is malloced and should be freed by the calling - * function with free(). - * - * @return - * Returns NULL if no type/ID is found. - */ -char *sam_hdr_find_line(SAM_hdr *hdr, char *type, - char *ID_key, char *ID_value); - -/*! Looks for a specific key in a single sam header line. - * - * If prev is non-NULL it also fills this out with the previous tag, to - * permit use in key removal. *prev is set to NULL when the tag is the first - * key in the list. When a tag isn't found, prev (if non NULL) will be the last - * tag in the existing list. - * - * @return - * Returns the tag pointer on success; - * NULL on failure - */ -SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh, - SAM_hdr_type *type, - char *key, - SAM_hdr_tag **prev); - -/*! Adds or updates tag key,value pairs in a header line. - * - * Eg for adding M5 tags to @SQ lines or updating sort order for the - * @HD line (although use the sam_hdr_sort_order() function for - * HD manipulation, which is a wrapper around this funuction). - * - * Specify multiple key,value pairs ending in NULL. - * - * @return - * Returns 0 on success; - * -1 on failure - */ -int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...); - -/*! Returns the sort order from the @HD SO: field */ -enum sam_sort_order sam_hdr_sort_order(SAM_hdr *hdr); - -/*! Reconstructs the kstring from the header hash table. - * @return - * Returns 0 on success; - * -1 on failure - */ -int sam_hdr_rebuild(SAM_hdr *hdr); - -/*! Looks up a reference sequence by name and returns the numerical ID. - * @return - * Returns -1 if unknown reference. - */ -int sam_hdr_name2ref(SAM_hdr *hdr, const char *ref); - -/*! Looks up a read-group by name and returns a pointer to the start of the - * associated tag list. - * - * @return - * Returns NULL on failure - */ -SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, const char *rg); - -/*! Fixes any PP links in @PG headers. - * - * If the entries are in order then this doesn't need doing, but incase - * our header is out of order this goes through the sh->pg[] array - * setting the prev_id field. - * - * @return - * Returns 0 on sucess; - * -1 on failure (indicating broken PG/PP records) - */ -int sam_hdr_link_pg(SAM_hdr *hdr); - - -/*! Add an @PG line. - * - * If we wish complete control over this use sam_hdr_add() directly. This - * function uses that, but attempts to do a lot of tedious house work for - * you too. - * - * - It will generate a suitable ID if the supplied one clashes. - * - It will generate multiple @PG records if we have multiple PG chains. - * - * Call it as per sam_hdr_add() with a series of key,value pairs ending - * in NULL. - * - * @return - * Returns 0 on success; - * -1 on failure - */ -int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...); - -/*! - * A function to help with construction of CL tags in @PG records. - * Takes an argc, argv pair and returns a single space-separated string. - * This string should be deallocated by the calling function. - * - * @return - * Returns malloced char * on success; - * NULL on failure - */ -char *stringify_argv(int argc, char *argv[]); - -#ifdef __cplusplus -} -#endif - -#endif /* _SAM_HDR_H_ */