7df6e18265341f87a69fba808aa1f92f8ebca841 markd Wed Apr 15 13:39:42 2026 -0700 move copy of htslib diff --git src/htslib/cram/sam_header.c src/htslib/cram/sam_header.c deleted file mode 100644 index 0a8f62053c0..00000000000 --- src/htslib/cram/sam_header.c +++ /dev/null @@ -1,1268 +0,0 @@ -/* -Copyright (c) 2013 Genome Research Ltd. -Author: James Bonfield <jkb@sanger.ac.uk> - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - - 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger -Institute nor the names of its contributors may be used to endorse or promote -products derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#include <config.h> - -#include <string.h> -#include <assert.h> - -#include "cram/sam_header.h" -#include "cram/string_alloc.h" - -static void sam_hdr_error(char *msg, char *line, int len, int lno) { - int j; - - for (j = 0; j < len && line[j] != '\n'; j++) - ; - fprintf(stderr, "%s at line %d: \"%.*s\"\n", msg, lno, j, line); -} - -void sam_hdr_dump(SAM_hdr *hdr) { - khint_t k; - int i; - - printf("===DUMP===\n"); - for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) { - SAM_hdr_type *t1, *t2; - char c[2]; - - if (!kh_exist(hdr->h, k)) - continue; - - t1 = t2 = kh_val(hdr->h, k); - c[0] = kh_key(hdr->h, k)>>8; - c[1] = kh_key(hdr->h, k)&0xff; - printf("Type %.2s, count %d\n", c, t1->prev->order+1); - - do { - SAM_hdr_tag *tag; - printf(">>>%d ", t1->order); - for (tag = t1->tag; tag; tag=tag->next) { - printf("\"%.2s\":\"%.*s\"\t", - tag->str, tag->len-3, tag->str+3); - } - putchar('\n'); - t1 = t1->next; - } while (t1 != t2); - } - - /* Dump out PG chains */ - printf("\n@PG chains:\n"); - for (i = 0; i < hdr->npg_end; i++) { - int j; - printf(" %d:", i); - for (j = hdr->pg_end[i]; j != -1; j = hdr->pg[j].prev_id) { - printf("%s%d(%.*s)", - j == hdr->pg_end[i] ? " " : "->", - j, hdr->pg[j].name_len, hdr->pg[j].name); - } - printf("\n"); - } - - puts("===END DUMP==="); -} - -/* Updates the hash tables in the SAM_hdr structure. - * - * Returns 0 on success; - * -1 on failure - */ -static int sam_hdr_update_hashes(SAM_hdr *sh, - int type, - SAM_hdr_type *h_type) { - /* Add to reference hash? */ - if ((type>>8) == 'S' && (type&0xff) == 'Q') { - SAM_hdr_tag *tag; - SAM_SQ *new_ref; - int nref = sh->nref; - - new_ref = realloc(sh->ref, (sh->nref+1)*sizeof(*sh->ref)); - if (!new_ref) - return -1; - sh->ref = new_ref; - - tag = h_type->tag; - sh->ref[nref].name = NULL; - sh->ref[nref].len = 0; - sh->ref[nref].ty = h_type; - sh->ref[nref].tag = tag; - - while (tag) { - if (tag->str[0] == 'S' && tag->str[1] == 'N') { - if (!(sh->ref[nref].name = malloc(tag->len))) - return -1; - strncpy(sh->ref[nref].name, tag->str+3, tag->len-3); - sh->ref[nref].name[tag->len-3] = 0; - } else if (tag->str[0] == 'L' && tag->str[1] == 'N') { - sh->ref[nref].len = atoi(tag->str+3); - } - tag = tag->next; - } - - if (sh->ref[nref].name) { - khint_t k; - int r; - k = kh_put(m_s2i, sh->ref_hash, sh->ref[nref].name, &r); - if (-1 == r) return -1; - kh_val(sh->ref_hash, k) = nref; - } else { - return -1; // SN should be present, according to spec. - } - - sh->nref++; - } - - /* Add to read-group hash? */ - if ((type>>8) == 'R' && (type&0xff) == 'G') { - SAM_hdr_tag *tag; - SAM_RG *new_rg; - int nrg = sh->nrg; - - new_rg = realloc(sh->rg, (sh->nrg+1)*sizeof(*sh->rg)); - if (!new_rg) - return -1; - sh->rg = new_rg; - - tag = h_type->tag; - sh->rg[nrg].name = NULL; - sh->rg[nrg].name_len = 0; - sh->rg[nrg].ty = h_type; - sh->rg[nrg].tag = tag; - sh->rg[nrg].id = nrg; - - while (tag) { - if (tag->str[0] == 'I' && tag->str[1] == 'D') { - if (!(sh->rg[nrg].name = malloc(tag->len))) - return -1; - strncpy(sh->rg[nrg].name, tag->str+3, tag->len-3); - sh->rg[nrg].name[tag->len-3] = 0; - sh->rg[nrg].name_len = strlen(sh->rg[nrg].name); - } - tag = tag->next; - } - - if (sh->rg[nrg].name) { - khint_t k; - int r; - k = kh_put(m_s2i, sh->rg_hash, sh->rg[nrg].name, &r); - if (-1 == r) return -1; - kh_val(sh->rg_hash, k) = nrg; - } else { - return -1; // ID should be present, according to spec. - } - - sh->nrg++; - } - - /* Add to program hash? */ - if ((type>>8) == 'P' && (type&0xff) == 'G') { - SAM_hdr_tag *tag; - SAM_PG *new_pg; - int npg = sh->npg; - - new_pg = realloc(sh->pg, (sh->npg+1)*sizeof(*sh->pg)); - if (!new_pg) - return -1; - sh->pg = new_pg; - - tag = h_type->tag; - sh->pg[npg].name = NULL; - sh->pg[npg].name_len = 0; - sh->pg[npg].ty = h_type; - sh->pg[npg].tag = tag; - sh->pg[npg].id = npg; - sh->pg[npg].prev_id = -1; - - while (tag) { - if (tag->str[0] == 'I' && tag->str[1] == 'D') { - if (!(sh->pg[npg].name = malloc(tag->len))) - return -1; - strncpy(sh->pg[npg].name, tag->str+3, tag->len-3); - sh->pg[npg].name[tag->len-3] = 0; - sh->pg[npg].name_len = strlen(sh->pg[npg].name); - } else if (tag->str[0] == 'P' && tag->str[1] == 'P') { - // Resolve later if needed - khint_t k; - char tmp = tag->str[tag->len]; tag->str[tag->len] = 0; - k = kh_get(m_s2i, sh->pg_hash, tag->str+3); - tag->str[tag->len] = tmp; - - if (k != kh_end(sh->pg_hash)) { - int p_id = kh_val(sh->pg_hash, k); - sh->pg[npg].prev_id = sh->pg[p_id].id; - - /* Unmark previous entry as a PG termination */ - if (sh->npg_end > 0 && - sh->pg_end[sh->npg_end-1] == p_id) { - sh->npg_end--; - } else { - int i; - for (i = 0; i < sh->npg_end; i++) { - if (sh->pg_end[i] == p_id) { - memmove(&sh->pg_end[i], &sh->pg_end[i+1], - (sh->npg_end-i-1)*sizeof(*sh->pg_end)); - sh->npg_end--; - } - } - } - } else { - sh->pg[npg].prev_id = -1; - } - } - tag = tag->next; - } - - if (sh->pg[npg].name) { - khint_t k; - int r; - k = kh_put(m_s2i, sh->pg_hash, sh->pg[npg].name, &r); - if (-1 == r) return -1; - kh_val(sh->pg_hash, k) = npg; - } else { - return -1; // ID should be present, according to spec. - } - - /* Add to npg_end[] array. Remove later if we find a PP line */ - if (sh->npg_end >= sh->npg_end_alloc) { - int *new_pg_end; - int new_alloc = sh->npg_end_alloc ? sh->npg_end_alloc*2 : 4; - - new_pg_end = realloc(sh->pg_end, new_alloc * sizeof(int)); - if (!new_pg_end) - return -1; - sh->npg_end_alloc = new_alloc; - sh->pg_end = new_pg_end; - } - sh->pg_end[sh->npg_end++] = npg; - - sh->npg++; - } - - return 0; -} - -/* - * Appends a formatted line to an existing SAM header. - * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with - * optional new-line. If it contains more than 1 line then multiple lines - * will be added in order. - * - * Input text is of maximum length len or as terminated earlier by a NUL. - * Len may be 0 if unknown, in which case lines must be NUL-terminated. - * - * Returns 0 on success - * -1 on failure - */ -int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len) { - int i, lno, text_offset; - char *hdr; - - if (!len) - len = strlen(lines); - - text_offset = ks_len(&sh->text); - if (EOF == kputsn(lines, len, &sh->text)) - return -1; - hdr = ks_str(&sh->text) + text_offset; - - for (i = 0, lno = 1; i < len && hdr[i] != '\0'; i++, lno++) { - khint32_t type; - khint_t k; - - int l_start = i, new; - SAM_hdr_type *h_type; - SAM_hdr_tag *h_tag, *last; - - if (hdr[i] != '@') { - int j; - for (j = i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++) - ; - sam_hdr_error("Header line does not start with '@'", - &hdr[l_start], len - l_start, lno); - return -1; - } - - type = (hdr[i+1]<<8) | hdr[i+2]; - if (hdr[i+1] < 'A' || hdr[i+1] > 'z' || - hdr[i+2] < 'A' || hdr[i+2] > 'z') { - sam_hdr_error("Header line does not have a two character key", - &hdr[l_start], len - l_start, lno); - return -1; - } - - i += 3; - if (hdr[i] == '\n') - continue; - - // Add the header line type - if (!(h_type = pool_alloc(sh->type_pool))) - return -1; - if (-1 == (k = kh_put(sam_hdr, sh->h, type, &new))) - return -1; - - // Form the ring, either with self or other lines of this type - if (!new) { - SAM_hdr_type *t = kh_val(sh->h, k), *p; - p = t->prev; - - assert(p->next == t); - p->next = h_type; - h_type->prev = p; - - t->prev = h_type; - h_type->next = t; - h_type->order = p->order+1; - } else { - kh_val(sh->h, k) = h_type; - h_type->prev = h_type->next = h_type; - h_type->order = 0; - } - - // Parse the tags on this line - last = NULL; - if ((type>>8) == 'C' && (type&0xff) == 'O') { - int j; - if (hdr[i] != '\t') { - sam_hdr_error("Missing tab", - &hdr[l_start], len - l_start, lno); - return -1; - } - - for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++) - ; - - if (!(h_type->tag = h_tag = pool_alloc(sh->tag_pool))) - return -1; - h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i); - h_tag->len = j-i; - h_tag->next = NULL; - if (!h_tag->str) - return -1; - - i = j; - - } else { - do { - int j; - if (hdr[i] != '\t') { - sam_hdr_error("Missing tab", - &hdr[l_start], len - l_start, lno); - return -1; - } - - for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n' && hdr[j] != '\t'; j++) - ; - - if (!(h_tag = pool_alloc(sh->tag_pool))) - return -1; - h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i); - h_tag->len = j-i; - h_tag->next = NULL; - if (!h_tag->str) - return -1; - - if (h_tag->len < 3 || h_tag->str[2] != ':') { - sam_hdr_error("Malformed key:value pair", - &hdr[l_start], len - l_start, lno); - return -1; - } - - if (last) - last->next = h_tag; - else - h_type->tag = h_tag; - - last = h_tag; - i = j; - } while (i < len && hdr[i] != '\0' && hdr[i] != '\n'); - } - - /* Update RG/SQ hashes */ - if (-1 == sam_hdr_update_hashes(sh, type, h_type)) - return -1; - } - - return 0; -} - -/* - * Adds a single line to a SAM header. - * Specify type and one or more key,value pairs, ending with the NULL key. - * Eg. sam_hdr_add(h, "SQ", "ID", "foo", "LN", "100", NULL). - * - * Returns index for specific entry on success (eg 2nd SQ, 4th RG) - * -1 on failure - */ -int sam_hdr_add(SAM_hdr *sh, const char *type, ...) { - va_list args; - va_start(args, type); - return sam_hdr_vadd(sh, type, args, NULL); -} - -/* - * sam_hdr_add with a va_list interface. - * - * Note: this function invokes va_arg at least once, making the value - * of ap indeterminate after the return. The caller should call - * va_start/va_end before/after calling this function or use va_copy. - */ -int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...) { - va_list args; - SAM_hdr_type *h_type; - SAM_hdr_tag *h_tag, *last; - int new; - khint32_t type_i = (type[0]<<8) | type[1], k; - - if (EOF == kputc_('@', &sh->text)) - return -1; - if (EOF == kputsn(type, 2, &sh->text)) - return -1; - - if (!(h_type = pool_alloc(sh->type_pool))) - return -1; - if (-1 == (k = kh_put(sam_hdr, sh->h, type_i, &new))) - return -1; - - // Form the ring, either with self or other lines of this type - if (!new) { - SAM_hdr_type *t = kh_val(sh->h, k), *p; - p = t->prev; - - assert(p->next == t); - p->next = h_type; - h_type->prev = p; - - t->prev = h_type; - h_type->next = t; - h_type->order = p->order + 1; - } else { - kh_val(sh->h, k) = h_type; - h_type->prev = h_type->next = h_type; - h_type->order = 0; - } - - last = NULL; - - // Any ... varargs - va_start(args, ap); - for (;;) { - char *k, *v; - int idx; - - if (!(k = (char *)va_arg(args, char *))) - break; - v = va_arg(args, char *); - - if (EOF == kputc_('\t', &sh->text)) - return -1; - - if (!(h_tag = pool_alloc(sh->tag_pool))) - return -1; - idx = ks_len(&sh->text); - - if (EOF == kputs(k, &sh->text)) - return -1; - if (EOF == kputc_(':', &sh->text)) - return -1; - if (EOF == kputs(v, &sh->text)) - return -1; - - h_tag->len = ks_len(&sh->text) - idx; - h_tag->str = string_ndup(sh->str_pool, - ks_str(&sh->text) + idx, - h_tag->len); - h_tag->next = NULL; - if (!h_tag->str) - return -1; - - if (last) - last->next = h_tag; - else - h_type->tag = h_tag; - - last = h_tag; - } - va_end(args); - - // Plus the specified va_list params - for (;;) { - char *k, *v; - int idx; - - if (!(k = (char *)va_arg(ap, char *))) - break; - v = va_arg(ap, char *); - - if (EOF == kputc_('\t', &sh->text)) - return -1; - - if (!(h_tag = pool_alloc(sh->tag_pool))) - return -1; - idx = ks_len(&sh->text); - - if (EOF == kputs(k, &sh->text)) - return -1; - if (EOF == kputc_(':', &sh->text)) - return -1; - if (EOF == kputs(v, &sh->text)) - return -1; - - h_tag->len = ks_len(&sh->text) - idx; - h_tag->str = string_ndup(sh->str_pool, - ks_str(&sh->text) + idx, - h_tag->len); - h_tag->next = NULL; - if (!h_tag->str) - return -1; - - if (last) - last->next = h_tag; - else - h_type->tag = h_tag; - - last = h_tag; - } - va_end(ap); - - if (EOF == kputc('\n', &sh->text)) - return -1; - - int itype = (type[0]<<8) | type[1]; - if (-1 == sam_hdr_update_hashes(sh, itype, h_type)) - return -1; - - return h_type->order; -} - -/* - * Returns the first header item matching 'type'. If ID is non-NULL it checks - * for the tag ID: and compares against the specified ID. - * - * Returns NULL if no type/ID is found - */ -SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type, - char *ID_key, char *ID_value) { - SAM_hdr_type *t1, *t2; - int itype = (type[0]<<8)|(type[1]); - khint_t k; - - /* Special case for types we have prebuilt hashes on */ - if (ID_key) { - if (type[0] == 'S' && type[1] == 'Q' && - ID_key[0] == 'S' && ID_key[1] == 'N') { - k = kh_get(m_s2i, hdr->ref_hash, ID_value); - return k != kh_end(hdr->ref_hash) - ? hdr->ref[kh_val(hdr->ref_hash, k)].ty - : NULL; - } - - if (type[0] == 'R' && type[1] == 'G' && - ID_key[0] == 'I' && ID_key[1] == 'D') { - k = kh_get(m_s2i, hdr->rg_hash, ID_value); - return k != kh_end(hdr->rg_hash) - ? hdr->rg[kh_val(hdr->rg_hash, k)].ty - : NULL; - } - - if (type[0] == 'P' && type[1] == 'G' && - ID_key[0] == 'I' && ID_key[1] == 'D') { - k = kh_get(m_s2i, hdr->pg_hash, ID_value); - return k != kh_end(hdr->pg_hash) - ? hdr->pg[kh_val(hdr->pg_hash, k)].ty - : NULL; - } - } - - k = kh_get(sam_hdr, hdr->h, itype); - if (k == kh_end(hdr->h)) - return NULL; - - if (!ID_key) - return kh_val(hdr->h, k); - - t1 = t2 = kh_val(hdr->h, k); - do { - SAM_hdr_tag *tag; - for (tag = t1->tag; tag; tag = tag->next) { - if (tag->str[0] == ID_key[0] && tag->str[1] == ID_key[1]) { - char *cp1 = tag->str+3; - char *cp2 = ID_value; - while (*cp1 && *cp1 == *cp2) - cp1++, cp2++; - if (*cp2 || *cp1) - continue; - return t1; - } - } - t1 = t1->next; - } while (t1 != t2); - - return NULL; -} - -/* - * As per SAM_hdr_type, but returns a complete line of formatted text - * for a specific head type/ID combination. If ID is NULL then it returns - * the first line of the specified type. - * - * The returned string is malloced and should be freed by the calling - * function with free(). - * - * Returns NULL if no type/ID is found. - */ -char *sam_hdr_find_line(SAM_hdr *hdr, char *type, - char *ID_key, char *ID_value) { - SAM_hdr_type *ty = sam_hdr_find(hdr, type, ID_key, ID_value); - kstring_t ks = KS_INITIALIZER; - SAM_hdr_tag *tag; - int r = 0; - - if (!ty) - return NULL; - - // Paste together the line from the hashed copy - r |= (kputc_('@', &ks) == EOF); - r |= (kputs(type, &ks) == EOF); - for (tag = ty->tag; tag; tag = tag->next) { - r |= (kputc_('\t', &ks) == EOF); - r |= (kputsn(tag->str, tag->len, &ks) == EOF); - } - - if (r) { - KS_FREE(&ks); - return NULL; - } - - return ks_str(&ks); -} - - -/* - * Looks for a specific key in a single sam header line. - * If prev is non-NULL it also fills this out with the previous tag, to - * permit use in key removal. *prev is set to NULL when the tag is the first - * key in the list. When a tag isn't found, prev (if non NULL) will be the last - * tag in the existing list. - * - * Returns the tag pointer on success - * NULL on failure - */ -SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh, - SAM_hdr_type *type, - char *key, - SAM_hdr_tag **prev) { - SAM_hdr_tag *tag, *p = NULL; - - for (tag = type->tag; tag; p = tag, tag = tag->next) { - if (tag->str[0] == key[0] && tag->str[1] == key[1]) { - if (prev) - *prev = p; - return tag; - } - } - - if (prev) - *prev = p; - - return NULL; -} - - -/* - * Adds or updates tag key,value pairs in a header line. - * Eg for adding M5 tags to @SQ lines or updating sort order for the - * @HD line (although use the sam_hdr_sort_order() function for - * HD manipulation, which is a wrapper around this funuction). - * - * Specify multiple key,value pairs ending in NULL. - * - * Returns 0 on success - * -1 on failure - */ -int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...) { - va_list ap; - - va_start(ap, type); - - for (;;) { - char *k, *v; - int idx; - SAM_hdr_tag *tag, *prev; - - if (!(k = (char *)va_arg(ap, char *))) - break; - v = va_arg(ap, char *); - - tag = sam_hdr_find_key(hdr, type, k, &prev); - if (!tag) { - if (!(tag = pool_alloc(hdr->tag_pool))) - return -1; - if (prev) - prev->next = tag; - else - type->tag = tag; - - tag->next = NULL; - } - - idx = ks_len(&hdr->text); - if (ksprintf(&hdr->text, "%2.2s:%s", k, v) < 0) - return -1; - tag->len = ks_len(&hdr->text) - idx; - tag->str = string_ndup(hdr->str_pool, - ks_str(&hdr->text) + idx, - tag->len); - if (!tag->str) - return -1; - } - - va_end(ap); - - return 0; -} - -#define K(a) (((a)[0]<<8)|((a)[1])) - -/* - * Returns the sort order: - */ -enum sam_sort_order sam_hdr_sort_order(SAM_hdr *hdr) { - return hdr->sort_order; -} - -static enum sam_sort_order sam_hdr_parse_sort_order(SAM_hdr *hdr) { - khint_t k; - enum sam_sort_order so; - - so = ORDER_UNKNOWN; - k = kh_get(sam_hdr, hdr->h, K("HD")); - if (k != kh_end(hdr->h)) { - SAM_hdr_type *ty = kh_val(hdr->h, k); - SAM_hdr_tag *tag; - for (tag = ty->tag; tag; tag = tag->next) { - if (tag->str[0] == 'S' && tag->str[1] == 'O') { - if (strcmp(tag->str+3, "unsorted") == 0) - so = ORDER_UNSORTED; - else if (strcmp(tag->str+3, "queryname") == 0) - so = ORDER_NAME; - else if (strcmp(tag->str+3, "coordinate") == 0) - so = ORDER_COORD; - else - fprintf(stderr, "Unknown sort order field: %s\n", - tag->str+3); - } - } - } - - return so; -} - - -/* - * Reconstructs the kstring from the header hash table. - * Returns 0 on success - * -1 on failure - */ -int sam_hdr_rebuild(SAM_hdr *hdr) { - /* Order: HD then others */ - kstring_t ks = KS_INITIALIZER; - khint_t k; - - - k = kh_get(sam_hdr, hdr->h, K("HD")); - if (k != kh_end(hdr->h)) { - SAM_hdr_type *ty = kh_val(hdr->h, k); - SAM_hdr_tag *tag; - if (EOF == kputs("@HD", &ks)) - return -1; - for (tag = ty->tag; tag; tag = tag->next) { - if (EOF == kputc_('\t', &ks)) - return -1; - if (EOF == kputsn_(tag->str, tag->len, &ks)) - return -1; - } - if (EOF == kputc('\n', &ks)) - return -1; - } - - for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) { - SAM_hdr_type *t1, *t2; - - if (!kh_exist(hdr->h, k)) - continue; - - if (kh_key(hdr->h, k) == K("HD")) - continue; - - t1 = t2 = kh_val(hdr->h, k); - do { - SAM_hdr_tag *tag; - char c[2]; - - if (EOF == kputc_('@', &ks)) - return -1; - c[0] = kh_key(hdr->h, k)>>8; - c[1] = kh_key(hdr->h, k)&0xff; - if (EOF == kputsn_(c, 2, &ks)) - return -1; - for (tag = t1->tag; tag; tag=tag->next) { - if (EOF == kputc_('\t', &ks)) - return -1; - if (EOF == kputsn_(tag->str, tag->len, &ks)) - return -1; - } - if (EOF == kputc('\n', &ks)) - return -1; - t1 = t1->next; - } while (t1 != t2); - } - - if (ks_str(&hdr->text)) - KS_FREE(&hdr->text); - - hdr->text = ks; - - return 0; -} - - -/* - * Creates an empty SAM header, ready to be populated. - * - * Returns a SAM_hdr struct on success (free with sam_hdr_free()) - * NULL on failure - */ -SAM_hdr *sam_hdr_new() { - SAM_hdr *sh = calloc(1, sizeof(*sh)); - - if (!sh) - return NULL; - - sh->h = kh_init(sam_hdr); - if (!sh->h) - goto err; - - sh->ID_cnt = 1; - sh->ref_count = 1; - - sh->nref = 0; - sh->ref = NULL; - if (!(sh->ref_hash = kh_init(m_s2i))) - goto err; - - sh->nrg = 0; - sh->rg = NULL; - if (!(sh->rg_hash = kh_init(m_s2i))) - goto err; - - sh->npg = 0; - sh->pg = NULL; - sh->npg_end = sh->npg_end_alloc = 0; - sh->pg_end = NULL; - if (!(sh->pg_hash = kh_init(m_s2i))) - goto err; - - KS_INIT(&sh->text); - - if (!(sh->tag_pool = pool_create(sizeof(SAM_hdr_tag)))) - goto err; - - if (!(sh->type_pool = pool_create(sizeof(SAM_hdr_type)))) - goto err; - - if (!(sh->str_pool = string_pool_create(8192))) - goto err; - - return sh; - - err: - if (sh->h) - kh_destroy(sam_hdr, sh->h); - - if (sh->tag_pool) - pool_destroy(sh->tag_pool); - - if (sh->type_pool) - pool_destroy(sh->type_pool); - - if (sh->str_pool) - string_pool_destroy(sh->str_pool); - - free(sh); - - return NULL; -} - - -/* - * Tokenises a SAM header into a hash table. - * Also extracts a few bits on specific data types, such as @RG lines. - * - * Returns a SAM_hdr struct on success (free with sam_hdr_free()) - * NULL on failure - */ -SAM_hdr *sam_hdr_parse_(const char *hdr, int len) { - /* Make an empty SAM_hdr */ - SAM_hdr *sh; - - sh = sam_hdr_new(); - if (NULL == sh) return NULL; - - if (NULL == hdr) return sh; // empty header is permitted - - /* Parse the header, line by line */ - if (-1 == sam_hdr_add_lines(sh, hdr, len)) { - sam_hdr_free(sh); - return NULL; - } - - /* Obtain sort order */ - sh->sort_order = sam_hdr_parse_sort_order(sh); - - //sam_hdr_dump(sh); - //sam_hdr_add(sh, "RG", "ID", "foo", "SM", "bar", NULL); - //sam_hdr_rebuild(sh); - //printf(">>%s<<", ks_str(sh->text)); - - //parse_references(sh); - //parse_read_groups(sh); - - sam_hdr_link_pg(sh); - //sam_hdr_dump(sh); - - return sh; -} - -/* - * Produces a duplicate copy of hdr and returns it. - * Returns NULL on failure - */ -SAM_hdr *sam_hdr_dup(SAM_hdr *hdr) { - if (-1 == sam_hdr_rebuild(hdr)) - return NULL; - - return sam_hdr_parse_(sam_hdr_str(hdr), sam_hdr_length(hdr)); -} - -/*! Increments a reference count on hdr. - * - * This permits multiple files to share the same header, all calling - * sam_hdr_free when done, without causing errors for other open files. - */ -void sam_hdr_incr_ref(SAM_hdr *hdr) { - hdr->ref_count++; -} - -/*! Increments a reference count on hdr. - * - * This permits multiple files to share the same header, all calling - * sam_hdr_free when done, without causing errors for other open files. - * - * If the reference count hits zero then the header is automatically - * freed. This makes it a synonym for sam_hdr_free(). - */ -void sam_hdr_decr_ref(SAM_hdr *hdr) { - sam_hdr_free(hdr); -} - -/*! Deallocates all storage used by a SAM_hdr struct. - * - * This also decrements the header reference count. If after decrementing - * it is still non-zero then the header is assumed to be in use by another - * caller and the free is not done. - * - * This is a synonym for sam_hdr_dec_ref(). - */ -void sam_hdr_free(SAM_hdr *hdr) { - if (!hdr) - return; - - if (--hdr->ref_count > 0) - return; - - if (ks_str(&hdr->text)) - KS_FREE(&hdr->text); - - if (hdr->h) - kh_destroy(sam_hdr, hdr->h); - - if (hdr->ref_hash) - kh_destroy(m_s2i, hdr->ref_hash); - - if (hdr->ref) { - int i; - for (i = 0; i < hdr->nref; i++) - if (hdr->ref[i].name) - free(hdr->ref[i].name); - free(hdr->ref); - } - - if (hdr->rg_hash) - kh_destroy(m_s2i, hdr->rg_hash); - - if (hdr->rg) { - int i; - for (i = 0; i < hdr->nrg; i++) - if (hdr->rg[i].name) - free(hdr->rg[i].name); - free(hdr->rg); - } - - if (hdr->pg_hash) - kh_destroy(m_s2i, hdr->pg_hash); - - if (hdr->pg) { - int i; - for (i = 0; i < hdr->npg; i++) - if (hdr->pg[i].name) - free(hdr->pg[i].name); - free(hdr->pg); - } - - if (hdr->pg_end) - free(hdr->pg_end); - - if (hdr->type_pool) - pool_destroy(hdr->type_pool); - - if (hdr->tag_pool) - pool_destroy(hdr->tag_pool); - - if (hdr->str_pool) - string_pool_destroy(hdr->str_pool); - - free(hdr); -} - -int sam_hdr_length(SAM_hdr *hdr) { - return ks_len(&hdr->text); -} - -char *sam_hdr_str(SAM_hdr *hdr) { - return ks_str(&hdr->text); -} - -/* - * Looks up a reference sequence by name and returns the numerical ID. - * Returns -1 if unknown reference. - */ -int sam_hdr_name2ref(SAM_hdr *hdr, const char *ref) { - khint_t k = kh_get(m_s2i, hdr->ref_hash, ref); - return k == kh_end(hdr->ref_hash) ? -1 : kh_val(hdr->ref_hash, k); -} - -/* - * Looks up a read-group by name and returns a pointer to the start of the - * associated tag list. - * - * Returns NULL on failure - */ -SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, const char *rg) { - khint_t k = kh_get(m_s2i, hdr->rg_hash, rg); - return k == kh_end(hdr->rg_hash) - ? NULL - : &hdr->rg[kh_val(hdr->rg_hash, k)]; -} - - -/* - * Fixes any PP links in @PG headers. - * If the entries are in order then this doesn't need doing, but incase - * our header is out of order this goes through the sh->pg[] array - * setting the prev_id field. - * - * Note we can have multiple complete chains. This code should identify the - * tails of these chains as these are the entries we have to link to in - * subsequent PP records. - * - * Returns 0 on sucess - * -1 on failure (indicating broken PG/PP records) - */ -int sam_hdr_link_pg(SAM_hdr *hdr) { - int i, j, ret = 0; - - hdr->npg_end_alloc = hdr->npg; - hdr->pg_end = realloc(hdr->pg_end, hdr->npg * sizeof(*hdr->pg_end)); - if (!hdr->pg_end) - return -1; - - for (i = 0; i < hdr->npg; i++) - hdr->pg_end[i] = i; - - for (i = 0; i < hdr->npg; i++) { - khint_t k; - SAM_hdr_tag *tag; - char tmp; - - for (tag = hdr->pg[i].tag; tag; tag = tag->next) { - if (tag->str[0] == 'P' && tag->str[1] == 'P') - break; - } - if (!tag) { - /* Chain start points */ - continue; - } - - tmp = tag->str[tag->len]; tag->str[tag->len] = 0; - k = kh_get(m_s2i, hdr->pg_hash, tag->str+3); - tag->str[tag->len] = tmp; - - if (k == kh_end(hdr->pg_hash)) { - ret = -1; - continue; - } - - hdr->pg[i].prev_id = hdr->pg[kh_val(hdr->pg_hash, k)].id; - hdr->pg_end[kh_val(hdr->pg_hash, k)] = -1; - } - - for (i = j = 0; i < hdr->npg; i++) { - if (hdr->pg_end[i] != -1) - hdr->pg_end[j++] = hdr->pg_end[i]; - } - hdr->npg_end = j; - - return ret; -} - -/* - * Returns a unique ID from a base name. - * - * The value returned is valid until the next call to - * this function. - */ -const char *sam_hdr_PG_ID(SAM_hdr *sh, const char *name) { - khint_t k = kh_get(m_s2i, sh->pg_hash, name); - if (k == kh_end(sh->pg_hash)) - return name; - - do { - sprintf(sh->ID_buf, "%.1000s.%d", name, sh->ID_cnt++); - k = kh_get(m_s2i, sh->pg_hash, sh->ID_buf); - } while (k != kh_end(sh->pg_hash)); - - return sh->ID_buf; -} - -/* - * Add an @PG line. - * - * If we wish complete control over this use sam_hdr_add() directly. This - * function uses that, but attempts to do a lot of tedious house work for - * you too. - * - * - It will generate a suitable ID if the supplied one clashes. - * - It will generate multiple @PG records if we have multiple PG chains. - * - * Call it as per sam_hdr_add() with a series of key,value pairs ending - * in NULL. - * - * Returns 0 on success - * -1 on failure - */ -int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...) { - va_list args; - - if (sh->npg_end) { - /* Copy ends array to avoid us looping while modifying it */ - int *end = malloc(sh->npg_end * sizeof(int)); - int i, nends = sh->npg_end; - - if (!end) - return -1; - - memcpy(end, sh->pg_end, nends * sizeof(*end)); - - for (i = 0; i < nends; i++) { - va_start(args, name); - if (-1 == sam_hdr_vadd(sh, "PG", args, - "ID", sam_hdr_PG_ID(sh, name), - "PN", name, - "PP", sh->pg[end[i]].name, - NULL)) { - free(end); - return -1; - } - va_end(args); - } - - free(end); - } else { - va_start(args, name); - if (-1 == sam_hdr_vadd(sh, "PG", args, - "ID", sam_hdr_PG_ID(sh, name), - "PN", name, - NULL)) - return -1; - va_end(args); - } - - //sam_hdr_dump(sh); - - return 0; -} - -/* - * A function to help with construction of CL tags in @PG records. - * Takes an argc, argv pair and returns a single space-separated string. - * This string should be deallocated by the calling function. - * - * Returns malloced char * on success - * NULL on failure - */ -char *stringify_argv(int argc, char *argv[]) { - char *str, *cp; - size_t nbytes = 1; - int i, j; - - /* Allocate */ - for (i = 0; i < argc; i++) { - nbytes += strlen(argv[i]) + 1; - } - if (!(str = malloc(nbytes))) - return NULL; - - /* Copy */ - cp = str; - for (i = 0; i < argc; i++) { - j = 0; - while (argv[i][j]) { - if (argv[i][j] == '\t') - *cp++ = ' '; - else - *cp++ = argv[i][j]; - j++; - } - *cp++ = ' '; - } - *cp++ = 0; - - return str; -}