7df6e18265341f87a69fba808aa1f92f8ebca841
markd
  Wed Apr 15 13:39:42 2026 -0700
move copy of htslib

diff --git src/htslib/cram/cram_index.c src/htslib/cram/cram_index.c
deleted file mode 100644
index d20c767529b..00000000000
--- src/htslib/cram/cram_index.c
+++ /dev/null
@@ -1,583 +0,0 @@
-/*
-Copyright (c) 2013-2014 Genome Research Ltd.
-Author: James Bonfield <jkb@sanger.ac.uk>
-
-Redistribution and use in source and binary forms, with or without 
-modification, are permitted provided that the following conditions are met:
-
-   1. Redistributions of source code must retain the above copyright notice, 
-this list of conditions and the following disclaimer.
-
-   2. Redistributions in binary form must reproduce the above copyright notice, 
-this list of conditions and the following disclaimer in the documentation 
-and/or other materials provided with the distribution.
-
-   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
-Institute nor the names of its contributors may be used to endorse or promote
-products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND 
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
-DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-/*
- * The index is a gzipped tab-delimited text file with one line per slice.
- * The columns are:
- * 1: reference number (0 to N-1, as per BAM ref_id)
- * 2: reference position of 1st read in slice (1..?)
- * 3: number of reads in slice
- * 4: offset of container start (relative to end of SAM header, so 1st
- *    container is offset 0).
- * 5: slice number within container (ie which landmark).
- *
- * In memory, we hold this in a nested containment list. Each list element is
- * a cram_index struct. Each element in turn can contain its own list of
- * cram_index structs.
- *
- * Any start..end range which is entirely contained within another (and
- * earlier as it is sorted) range will be held within it. This ensures that
- * the outer list will never have containments and we can safely do a
- * binary search to find the first range which overlaps any given coordinate.
- */
-
-#include <config.h>
-
-#include <stdio.h>
-#include <errno.h>
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-#include <zlib.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <math.h>
-#include <ctype.h>
-
-#include "htslib/hfile.h"
-#include "hts_internal.h"
-#include "cram/cram.h"
-#include "cram/os.h"
-#include "cram/zfio.h"
-
-#if 0
-static void dump_index_(cram_index *e, int level) {
-    int i, n;
-    n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end);
-    printf("%*soffset %"PRId64"\n", MAX(0,50-n), "", e->offset);
-    for (i = 0; i < e->nslice; i++) {
-	dump_index_(&e->e[i], level+1);
-    }
-}
-
-static void dump_index(cram_fd *fd) {
-    int i;
-    for (i = 0; i < fd->index_sz; i++) {
-	dump_index_(&fd->index[i], 0);
-    }
-}
-#endif
-
-static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) {
-    int sign = 1;
-    int32_t val = 0;
-    size_t p = *pos;
-
-    while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
-	   p++;
-
-    if (p < k->l && k->s[p] == '-')
-	sign = -1, p++;
-
-    if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
-	return -1;
-
-    while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9')
-	val = val*10 + k->s[p++]-'0';
-    
-    *pos = p;
-    *val_p = sign*val;
-
-    return 0;
-}
-
-static int kget_int64(kstring_t *k, size_t *pos, int64_t *val_p) {
-    int sign = 1;
-    int64_t val = 0;
-    size_t p = *pos;
-
-    while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
-	   p++;
-
-    if (p < k->l && k->s[p] == '-')
-	sign = -1, p++;
-
-    if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
-	return -1;
-
-    while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9')
-	val = val*10 + k->s[p++]-'0';
-    
-    *pos = p;
-    *val_p = sign*val;
-
-    return 0;
-}
-
-/*
- * Loads a CRAM .crai index into memory.
- *
- * Returns 0 for success
- *        -1 for failure
- */
-int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) {
-    char *fn2 = NULL;
-    char buf[65536];
-    ssize_t len;
-    kstring_t kstr = {0};
-    hFILE *fp;
-    cram_index *idx;
-    cram_index **idx_stack = NULL, *ep, e;
-    int idx_stack_alloc = 0, idx_stack_ptr = 0;
-    size_t pos = 0;
-
-    /* Check if already loaded */
-    if (fd->index)
-	return 0;
-
-    fd->index = calloc((fd->index_sz = 1), sizeof(*fd->index));
-    if (!fd->index)
-	return -1;
-
-    idx = &fd->index[0];
-    idx->refid = -1;
-    idx->start = INT_MIN;
-    idx->end   = INT_MAX;
-
-    idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack));
-    idx_stack[idx_stack_ptr] = idx;
-
-    if (!fn_idx) {
-	fn2 = hts_idx_getfn(fn, ".crai");
-	if (!fn2) {
-	    free(idx_stack);
-	    return -1;
-	}
-	fn_idx = fn2;
-    }
-
-    if (!(fp = hopen(fn_idx, "r"))) {
-	perror(fn_idx);
-	free(idx_stack);
-	free(fn2);
-	return -1;
-    }
-
-    // Load the file into memory
-    //while ((len = fread(buf, 1, 65536, fp)) > 0)
-    while ((len = hread(fp, buf, 65536)) > 0)
-	kputsn(buf, len, &kstr);
-    if (len < 0 || kstr.l < 2) {
-	if (kstr.s)
-	    free(kstr.s);
-	free(idx_stack);
-	free(fn2);
-	return -1;
-    }
-
-    if (hclose(fp)) {
-	if (kstr.s)
-	    free(kstr.s);
-	free(idx_stack);
-	free(fn2);
-	return -1;
-    }
-	
-
-    // Uncompress if required
-    if (kstr.s[0] == 31 && (uc)kstr.s[1] == 139) {
-	size_t l;
-	char *s = zlib_mem_inflate(kstr.s, kstr.l, &l);
-	free(kstr.s);
-	if (!s) {
-	    free(idx_stack);
-	    free(fn2);
-	    return -1;
-	}
-	kstr.s = s;
-	kstr.l = l;
-	kstr.m = l; // conservative estimate of the size allocated
-	kputsn("", 0, &kstr); // ensure kstr.s is NUL-terminated
-    }
-
-
-    // Parse it line at a time
-    do {
-	/* 1.1 layout */
-	if (kget_int32(&kstr, &pos, &e.refid) == -1) {
-	    free(kstr.s); free(idx_stack); free(fn2); return -1;
-	}
-	if (kget_int32(&kstr, &pos, &e.start) == -1) {
-	    free(kstr.s); free(idx_stack); free(fn2); return -1;
-	}
-	if (kget_int32(&kstr, &pos, &e.end) == -1) {
-	    free(kstr.s); free(idx_stack); free(fn2); return -1;
-	}
-	if (kget_int64(&kstr, &pos, &e.offset) == -1) {
-	    free(kstr.s); free(idx_stack); free(fn2); return -1;
-	}
-	if (kget_int32(&kstr, &pos, &e.slice) == -1) {
-	    free(kstr.s); free(idx_stack); free(fn2); return -1;
-	}
-	if (kget_int32(&kstr, &pos, &e.len) == -1) {
-	    free(kstr.s); free(idx_stack); free(fn2); return -1;
-	}
-
-	e.end += e.start-1;
-	//printf("%d/%d..%d\n", e.refid, e.start, e.end);
-
-	if (e.refid < -1) {
-	    free(kstr.s);
-	    free(idx_stack);
-	    free(fn2);
-	    fprintf(stderr, "Malformed index file, refid %d\n", e.refid);
-	    return -1;
-	}
-
-	if (e.refid != idx->refid) {
-	    if (fd->index_sz < e.refid+2) {
-		size_t index_end = fd->index_sz * sizeof(*fd->index);
-		fd->index_sz = e.refid+2;
-		fd->index = realloc(fd->index,
-				    fd->index_sz * sizeof(*fd->index));
-		memset(((char *)fd->index) + index_end, 0,
-		       fd->index_sz * sizeof(*fd->index) - index_end);
-	    }
-	    idx = &fd->index[e.refid+1];
-	    idx->refid = e.refid;
-	    idx->start = INT_MIN;
-	    idx->end   = INT_MAX;
-	    idx->nslice = idx->nalloc = 0;
-	    idx->e = NULL;
-	    idx_stack[(idx_stack_ptr = 0)] = idx;
-	}
-
-	while (!(e.start >= idx->start && e.end <= idx->end)) {
-	    idx = idx_stack[--idx_stack_ptr];
-	}
-
-	// Now contains, so append
-	if (idx->nslice+1 >= idx->nalloc) {
-	    idx->nalloc = idx->nalloc ? idx->nalloc*2 : 16;
-	    idx->e = realloc(idx->e, idx->nalloc * sizeof(*idx->e));
-	}
-
-	e.nalloc = e.nslice = 0; e.e = NULL;
-	*(ep = &idx->e[idx->nslice++]) = e;
-	idx = ep;
-
-	if (++idx_stack_ptr >= idx_stack_alloc) {
-	    idx_stack_alloc *= 2;
-	    idx_stack = realloc(idx_stack, idx_stack_alloc*sizeof(*idx_stack));
-	}
-	idx_stack[idx_stack_ptr] = idx;
-
-	while (pos < kstr.l && kstr.s[pos] != '\n')
-	    pos++;
-	pos++;
-    } while (pos < kstr.l);
-
-    free(idx_stack);
-    free(kstr.s);
-    free(fn2);
-
-    // dump_index(fd);
-
-    return 0;
-}
-
-static void cram_index_free_recurse(cram_index *e) {
-    if (e->e) {
-	int i;
-	for (i = 0; i < e->nslice; i++) {
-	    cram_index_free_recurse(&e->e[i]);
-	}
-	free(e->e);
-    }
-}
-
-void cram_index_free(cram_fd *fd) {
-    int i;
-
-    if (!fd->index)
-	return;
-    
-    for (i = 0; i < fd->index_sz; i++) {
-	cram_index_free_recurse(&fd->index[i]);
-    }
-    free(fd->index);
-
-    fd->index = NULL;
-}
-
-/*
- * Searches the index for the first slice overlapping a reference ID
- * and position, or one immediately preceding it if none is found in
- * the index to overlap this position. (Our index may have missing
- * entries, but we require at least one per reference.)
- *
- * If the index finds multiple slices overlapping this position we
- * return the first one only. Subsequent calls should specifying
- * "from" as the last slice we checked to find the next one. Otherwise
- * set "from" to be NULL to find the first one.
- *
- * Returns the cram_index pointer on sucess
- *         NULL on failure
- */
-cram_index *cram_index_query(cram_fd *fd, int refid, int pos, 
-			     cram_index *from) {
-    int i, j, k;
-    cram_index *e;
-
-    if (refid+1 < 0 || refid+1 >= fd->index_sz)
-	return NULL;
-
-    if (!from)
-	from = &fd->index[refid+1];
-
-    // Ref with nothing aligned against it.
-    if (!from->e)
-	return NULL;
-
-    // This sequence is covered by the index, so binary search to find
-    // the optimal starting block.
-    i = 0, j = fd->index[refid+1].nslice-1;
-    for (k = j/2; k != i; k = (j-i)/2 + i) {
-	if (from->e[k].refid > refid) {
-	    j = k;
-	    continue;
-	}
-
-	if (from->e[k].refid < refid) {
-	    i = k;
-	    continue;
-	}
-
-	if (from->e[k].start >= pos) {
-	    j = k;
-	    continue;
-	}
-
-	if (from->e[k].start < pos) {
-	    i = k;
-	    continue;
-	}
-    }
-    // i==j or i==j-1. Check if j is better.
-    if (j >= 0 && from->e[j].start < pos && from->e[j].refid == refid)
-	i = j;
-
-    /* The above found *a* bin overlapping, but not necessarily the first */
-    while (i > 0 && from->e[i-1].end >= pos)
-	i--;
-
-    /* We may be one bin before the optimum, so check */
-    while (i+1 < from->nslice &&
-	   (from->e[i].refid < refid ||
-	    from->e[i].end < pos))
-	i++;
-
-    e = &from->e[i];
-
-    return e;
-}
-
-
-/*
- * Skips to a container overlapping the start coordinate listed in
- * cram_range.
- *
- * In theory we call cram_index_query multiple times, once per slice
- * overlapping the range. However slices may be absent from the index
- * which makes this problematic. Instead we find the left-most slice
- * and then read from then on, skipping decoding of slices and/or
- * whole containers when they don't overlap the specified cram_range.
- *
- * Returns 0 on success
- *        -1 on general failure
- *        -2 on no-data (empty chromosome)
- */
-int cram_seek_to_refpos(cram_fd *fd, cram_range *r) {
-    cram_index *e;
-
-    // Ideally use an index, so see if we have one.
-    if ((e = cram_index_query(fd, r->refid, r->start, NULL))) {
-	if (0 != cram_seek(fd, e->offset, SEEK_SET))
-	    if (0 != cram_seek(fd, e->offset - fd->first_container, SEEK_CUR))
-		return -1;
-    } else {
-	// Absent from index, but this most likely means it simply has no data.
-	return -2;
-    }
-
-    if (fd->ctr) {
-	cram_free_container(fd->ctr);
-	fd->ctr = NULL;
-	fd->ooc = 0;
-    }
-
-    return 0;
-}
-
-
-/*
- * A specialised form of cram_index_build (below) that deals with slices
- * having multiple references in this (ref_id -2). In this scenario we
- * decode the slice to look at the RI data series instead.
- *
- * Returns 0 on success
- *        -1 on failure
- */
-static int cram_index_build_multiref(cram_fd *fd,
-				     cram_container *c,
-				     cram_slice *s,
-				     zfp *fp,
-				     off_t cpos,
-				     int32_t landmark,
-				     int sz) {
-    int i, ref = -2, ref_start = 0, ref_end;
-    char buf[1024];
-
-    if (0 != cram_decode_slice(fd, c, s, fd->header))
-	return -1;
-
-    ref_end = INT_MIN;
-    for (i = 0; i < s->hdr->num_records; i++) {
-	if (s->crecs[i].ref_id == ref) {
-	    if (ref_end < s->crecs[i].aend)
-		ref_end = s->crecs[i].aend;
-	    continue;
-	}
-
-	if (ref != -2) {
-	    sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
-		    ref, ref_start, ref_end - ref_start + 1,
-		    (int64_t)cpos, landmark, sz);
-	    zfputs(buf, fp);
-	}
-
-	ref = s->crecs[i].ref_id;
-	ref_start = s->crecs[i].apos;
-	ref_end = INT_MIN;
-    }
-
-    if (ref != -2) {
-	sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
-		ref, ref_start, ref_end - ref_start + 1,
-		(int64_t)cpos, landmark, sz);
-	zfputs(buf, fp);
-    }
-
-    return 0;
-}
-
-/*
- * Builds an index file.
- *
- * fd is a newly opened cram file that we wish to index.
- * fn_base is the filename of the associated CRAM file.
- * fn_idx is the filename of the index file to be written;
- * if NULL, we add ".crai" to fn_base to get the index filename.
- *
- * Returns 0 on success
- *        -1 on failure
- */
-int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) {
-    cram_container *c;
-    off_t cpos, spos, hpos;
-    zfp *fp;
-    kstring_t fn_idx_str = {0};
-
-    if (! fn_idx) {
-        kputs(fn_base, &fn_idx_str);
-        kputs(".crai", &fn_idx_str);
-        fn_idx = fn_idx_str.s;
-    }
-
-    if (!(fp = zfopen(fn_idx, "wz"))) {
-        perror(fn_idx);
-        free(fn_idx_str.s);
-        return -1;
-    }
-
-    free(fn_idx_str.s);
-
-    cpos = htell(fd->fp);
-    while ((c = cram_read_container(fd))) {
-        int j;
-
-        if (fd->err) {
-            perror("Cram container read");
-            return -1;
-        }
-
-        hpos = htell(fd->fp);
-
-        if (!(c->comp_hdr_block = cram_read_block(fd)))
-            return -1;
-        assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER);
-
-        c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
-        if (!c->comp_hdr)
-            return -1;
-
-        // 2.0 format
-        for (j = 0; j < c->num_landmarks; j++) {
-            char buf[1024];
-            cram_slice *s;
-            int sz;
-
-            spos = htell(fd->fp);
-            assert(spos - cpos - c->offset == c->landmark[j]);
-
-            if (!(s = cram_read_slice(fd))) {
-		zfclose(fp);
-		return -1;
-	    }
-
-            sz = (int)(htell(fd->fp) - spos);
-
-	    if (s->hdr->ref_seq_id == -2) {
-		cram_index_build_multiref(fd, c, s, fp,
-					  cpos, c->landmark[j], sz);
-	    } else {
-		sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
-			s->hdr->ref_seq_id, s->hdr->ref_seq_start,
-			s->hdr->ref_seq_span, (int64_t)cpos,
-			c->landmark[j], sz);
-		zfputs(buf, fp);
-	    }
-
-            cram_free_slice(s);
-        }
-
-        cpos = htell(fd->fp);
-        assert(cpos == hpos + c->length);
-
-        cram_free_container(c);
-    }
-    if (fd->err) {
-	zfclose(fp);
-	return -1;
-    }
-	
-
-    return (zfclose(fp) >= 0)? 0 : -1;
-}