7df6e18265341f87a69fba808aa1f92f8ebca841 markd Wed Apr 15 13:39:42 2026 -0700 move copy of htslib diff --git src/htslib/cram/cram_stats.c src/htslib/cram/cram_stats.c deleted file mode 100644 index 700b3926056..00000000000 --- src/htslib/cram/cram_stats.c +++ /dev/null @@ -1,448 +0,0 @@ -/* -Copyright (c) 2012-2013 Genome Research Ltd. -Author: James Bonfield <jkb@sanger.ac.uk> - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - - 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger -Institute nor the names of its contributors may be used to endorse or promote -products derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#include <config.h> - -#include <stdio.h> -#include <errno.h> -#include <assert.h> -#include <stdlib.h> -#include <string.h> -#include <zlib.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <math.h> -#include <ctype.h> - -#include "cram/cram.h" -#include "cram/os.h" - -cram_stats *cram_stats_create(void) { - return calloc(1, sizeof(cram_stats)); -} - -void cram_stats_add(cram_stats *st, int32_t val) { - st->nsamp++; - - //assert(val >= 0); - - if (val < MAX_STAT_VAL && val >= 0) { - st->freqs[val]++; - } else { - khint_t k; - int r; - - if (!st->h) { - st->h = kh_init(m_i2i); - } - - k = kh_put(m_i2i, st->h, val, &r); - if (r == 0) - kh_val(st->h, k)++; - else if (r != -1) - kh_val(st->h, k) = 1; - //else - //; // FIXME: handle error - } -} - -void cram_stats_del(cram_stats *st, int32_t val) { - st->nsamp--; - - //assert(val >= 0); - - if (val < MAX_STAT_VAL && val >= 0) { - st->freqs[val]--; - assert(st->freqs[val] >= 0); - } else if (st->h) { - khint_t k = kh_get(m_i2i, st->h, val); - - if (k != kh_end(st->h)) { - if (--kh_val(st->h, k) == 0) - kh_del(m_i2i, st->h, k); - } else { - fprintf(stderr, "Failed to remove val %d from cram_stats\n", val); - st->nsamp++; - } - } else { - fprintf(stderr, "Failed to remove val %d from cram_stats\n", val); - st->nsamp++; - } -} - -void cram_stats_dump(cram_stats *st) { - int i; - fprintf(stderr, "cram_stats:\n"); - for (i = 0; i < MAX_STAT_VAL; i++) { - if (!st->freqs[i]) - continue; - fprintf(stderr, "\t%d\t%d\n", i, st->freqs[i]); - } - if (st->h) { - khint_t k; - for (k = kh_begin(st->h); k != kh_end(st->h); k++) { - if (!kh_exist(st->h, k)) - continue; - - fprintf(stderr, "\t%d\t%d\n", kh_key(st->h, k), kh_val(st->h, k)); - } - } -} - -#if 1 -/* Returns the number of bits set in val; it the highest bit used */ -static int nbits(int v) { - static const int MultiplyDeBruijnBitPosition[32] = { - 1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31, - 9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32 - }; - - v |= v >> 1; // first up to set all bits 1 after the first 1 */ - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - - // DeBruijn magic to find top bit - return MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27]; -} -#endif - -/* - * Computes entropy from integer frequencies for various encoding methods and - * picks the best encoding. - * - * FIXME: we could reuse some of the code here for the actual encoding - * parameters too. Eg the best 'k' for SUBEXP or the code lengths for huffman. - * - * Returns the best codec to use. - */ -enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st) { - enum cram_encoding best_encoding = E_NULL; - int best_size = INT_MAX, bits; - int nvals, i, ntot = 0, max_val = 0, min_val = INT_MAX, k; - int *vals = NULL, *freqs = NULL, vals_alloc = 0, *codes; - - //cram_stats_dump(st); - - /* Count number of unique symbols */ - for (nvals = i = 0; i < MAX_STAT_VAL; i++) { - if (!st->freqs[i]) - continue; - if (nvals >= vals_alloc) { - vals_alloc = vals_alloc ? vals_alloc*2 : 1024; - vals = realloc(vals, vals_alloc * sizeof(int)); - freqs = realloc(freqs, vals_alloc * sizeof(int)); - if (!vals || !freqs) { - if (vals) free(vals); - if (freqs) free(freqs); - return E_HUFFMAN; // Cannot do much else atm - } - } - vals[nvals] = i; - freqs[nvals] = st->freqs[i]; - ntot += freqs[nvals]; - if (max_val < i) max_val = i; - if (min_val > i) min_val = i; - nvals++; - } - if (st->h) { - khint_t k; - int i; - - for (k = kh_begin(st->h); k != kh_end(st->h); k++) { - if (!kh_exist(st->h, k)) - continue; - - if (nvals >= vals_alloc) { - vals_alloc = vals_alloc ? vals_alloc*2 : 1024; - vals = realloc(vals, vals_alloc * sizeof(int)); - freqs = realloc(freqs, vals_alloc * sizeof(int)); - if (!vals || !freqs) - return E_HUFFMAN; // Cannot do much else atm - } - i = kh_key(st->h, k); - vals[nvals]=i; - freqs[nvals] = kh_val(st->h, k); - ntot += freqs[nvals]; - if (max_val < i) max_val = i; - if (min_val > i) min_val = i; - nvals++; - } - } - - st->nvals = nvals; - assert(ntot == st->nsamp); - - if (nvals <= 1) { - free(vals); - free(freqs); - return E_HUFFMAN; - } - - if (fd->verbose > 1) - fprintf(stderr, "Range = %d..%d, nvals=%d, ntot=%d\n", - min_val, max_val, nvals, ntot); - - /* Theoretical entropy */ -// if (fd->verbose > 1) { -// double dbits = 0; -// for (i = 0; i < nvals; i++) { -// dbits += freqs[i] * log((double)freqs[i]/ntot); -// } -// dbits /= -log(2); -// if (fd->verbose > 1) -// fprintf(stderr, "Entropy = %f\n", dbits); -// } - - if (nvals > 1 && ntot > 256) { -#if 0 - /* - * CRUDE huffman estimator. Round to closest and round up from 0 - * to 1 bit. - * - * With and without ITF8 incase we have a few discrete values but with - * large magnitude. - * - * Note rans0/arith0 and Z_HUFFMAN_ONLY vs internal huffman can be - * compared in this way, but order-1 (eg rans1) or maybe LZ77 modes - * may detect the correlation of high bytes to low bytes in multi- - * byte values. So this predictor breaks down. - */ - double dbits = 0; // entropy + ~huffman - double dbitsH = 0; - double dbitsE = 0; // external entropy + ~huffman - double dbitsEH = 0; - int F[256] = {0}, n = 0; - double e = 0; // accumulated error bits - for (i = 0; i < nvals; i++) { - double x; int X; - unsigned int v = vals[i]; - - //Better encoding would cope with sign. - //v = ABS(vals[i])*2+(vals[i]<0); - - if (!(v & ~0x7f)) { - F[v] += freqs[i], n+=freqs[i]; - } else if (!(v & ~0x3fff)) { - F[(v>>8) |0x80] += freqs[i]; - F[ v &0xff] += freqs[i], n+=2*freqs[i]; - } else if (!(v & ~0x1fffff)) { - F[(v>>16)|0xc0] += freqs[i]; - F[(v>>8 )&0xff] += freqs[i]; - F[ v &0xff] += freqs[i], n+=3*freqs[i]; - } else if (!(v & ~0x0fffffff)) { - F[(v>>24)|0xe0] += freqs[i]; - F[(v>>16)&0xff] += freqs[i]; - F[(v>>8 )&0xff] += freqs[i]; - F[ v &0xff] += freqs[i], n+=4*freqs[i]; - } else { - F[(v>>28)|0xf0] += freqs[i]; - F[(v>>20)&0xff] += freqs[i]; - F[(v>>12)&0xff] += freqs[i]; - F[(v>>4 )&0xff] += freqs[i]; - F[ v &0x0f] += freqs[i], n+=5*freqs[i]; - } - - x = -log((double)freqs[i]/ntot)/.69314718055994530941; - X = x+0.5; - if ((int)(x+((double)e/freqs[i])+.5)>X) { - X++; - } else if ((int)(x+((double)e/freqs[i])+.5)<X) { - X--; - } - e-=freqs[i]*(X-x); - X += (X==0); - - //fprintf(stderr, "Val %d = %d x %d (ent %f, %d) e %f\n", i, v, freqs[i], x, X, e); - - dbits += freqs[i] * x; - dbitsH += freqs[i] * X; - } - - for (i = 0; i < 256; i++) { - if (F[i]) { - double x = -log((double)F[i]/n)/.69314718055994530941; - int X = x+0.5; - X += (X==0); - dbitsE += F[i] * x; - dbitsEH += F[i] * X; - - //fprintf(stderr, "Val %d = %d x %d (e %f, %d)\n", i, i, F[i], x, X); - } - } - - //fprintf(stderr, "CORE Entropy = %f, %f\n", dbits/8, dbitsH/8); - //fprintf(stderr, "Ext. Entropy = %f, %f\n", dbitsE/8, dbitsEH/8); - - if (dbitsE < 1000 || dbitsE / dbits > 1.1) { - //fprintf(stderr, "=> %d < 200 ? E_HUFFMAN : E_BETA\n", nvals); - free(vals); free(freqs); - return nvals < 200 ? E_HUFFMAN : E_BETA; - } -#endif - free(vals); free(freqs); - return E_EXTERNAL; - } - - /* - * Avoid complex stats for now, just do heuristic of HUFFMAN for small - * alphabets and BETA for anything large. - */ - free(vals); free(freqs); - return nvals < 200 ? E_HUFFMAN : E_BETA; - //return E_HUFFMAN; - //return E_EXTERNAL; - - - /* We only support huffman now anyway... */ - //free(vals); free(freqs); return E_HUFFMAN; - - /* Beta */ - bits = nbits(max_val - min_val) * ntot; - if (fd->verbose > 1) - fprintf(stderr, "BETA = %d\n", bits); - if (best_size > bits) - best_size = bits, best_encoding = E_BETA; - -#if 0 - /* Unary */ - if (min_val >= 0) { - for (bits = i = 0; i < nvals; i++) - bits += freqs[i]*(vals[i]+1); - if (fd->verbose > 1) - fprintf(stderr, "UNARY = %d\n", bits); - if (best_size > bits) - best_size = bits, best_encoding = E_NULL; //E_UNARY; - } - - /* Gamma */ - for (bits = i = 0; i < nvals; i++) - bits += ((nbits(vals[i]-min_val+1)-1) + nbits(vals[i]-min_val+1)) * freqs[i]; - if (fd->verbose > 1) - fprintf(stderr, "GAMMA = %d\n", bits); - if (best_size > bits) - best_size = bits, best_encoding = E_GAMMA; - - /* Subexponential */ - for (k = 0; k < 10; k++) { - for (bits = i = 0; i < nvals; i++) { - if (vals[i]-min_val < (1<<k)) - bits += (1 + k)*freqs[i]; - else - bits += (nbits(vals[i]-min_val)*2-k)*freqs[i]; - } - - if (fd->verbose > 1) - fprintf(stderr, "SUBEXP%d = %d\n", k, bits); - if (best_size > bits) - best_size = bits, best_encoding = E_SUBEXP; - } -#endif - - /* byte array len */ - - /* byte array stop */ - - /* External? Guesswork! */ - - /* Huffman */ -// qsort(freqs, nvals, sizeof(freqs[0]), sort_freqs); -// for (i = 0; i < nvals; i++) { -// fprintf(stderr, "%d = %d\n", i, freqs[i]); -// vals[i] = 0; -// } - - /* Grow freqs to 2*freqs, to store sums */ - /* Vals holds link data */ - freqs = realloc(freqs, 2*nvals*sizeof(*freqs)); - codes = calloc(2*nvals, sizeof(*codes)); - if (!freqs || !codes) - return E_HUFFMAN; // Cannot do much else atm - - /* Inefficient, use pointers to form chain so we can insert and maintain - * a sorted list? This is currently O(nvals^2) complexity. - */ - for (;;) { - int low1 = INT_MAX, low2 = INT_MAX; - int ind1 = 0, ind2 = 0; - for (i = 0; i < nvals; i++) { - if (freqs[i] < 0) - continue; - if (low1 > freqs[i]) - low2 = low1, ind2 = ind1, low1 = freqs[i], ind1 = i; - else if (low2 > freqs[i]) - low2 = freqs[i], ind2 = i; - } - if (low2 == INT_MAX) - break; - - //fprintf(stderr, "Merge ind %d (%d), %d (%d) = %d+%d, => %d=%d\n", - // ind1, vals[ind1], ind2, vals[ind2], low1, low2, - // nvals, low1+low2); - - freqs[nvals] = low1 + low2; - codes[ind1] = nvals; - codes[ind2] = nvals; - freqs[ind1] *= -1; - freqs[ind2] *= -1; - nvals++; - } - nvals = nvals/2+1; - - for (i = 0; i < nvals; i++) { - int code_len = 0; - for (k = codes[i]; k; k = codes[k]) - code_len++; - codes[i] = code_len; - freqs[i] *= -1; - //fprintf(stderr, "%d / %d => %d\n", vals[i], freqs[i], codes[i]); - } - - for (bits = i = 0; i < nvals; i++) { - bits += freqs[i] * codes[i]; - } - if (fd->verbose > 1) - fprintf(stderr, "HUFFMAN = %d\n", bits); - if (best_size >= bits) - best_size = bits, best_encoding = E_HUFFMAN; - free(codes); - - free(vals); - free(freqs); - - return best_encoding; -} - -void cram_stats_free(cram_stats *st) { - if (st->h) - kh_destroy(m_i2i, st->h); - free(st); -}