7df6e18265341f87a69fba808aa1f92f8ebca841 markd Wed Apr 15 13:39:42 2026 -0700 move copy of htslib diff --git src/htslib/cram/cram_encode.c src/htslib/cram/cram_encode.c deleted file mode 100644 index b12ef61bd39..00000000000 --- src/htslib/cram/cram_encode.c +++ /dev/null @@ -1,3094 +0,0 @@ -/* -Copyright (c) 2012-2013 Genome Research Ltd. -Author: James Bonfield - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - - 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger -Institute nor the names of its contributors may be used to endorse or promote -products derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cram/cram.h" -#include "cram/os.h" -#include "htslib/hts.h" - -#define Z_CRAM_STRAT Z_FILTERED -//#define Z_CRAM_STRAT Z_RLE -//#define Z_CRAM_STRAT Z_HUFFMAN_ONLY -//#define Z_CRAM_STRAT Z_DEFAULT_STRATEGY - -static int process_one_read(cram_fd *fd, cram_container *c, - cram_slice *s, cram_record *cr, - bam_seq_t *b, int rnum); - -/* - * Returns index of val into key. - * Basically strchr(key, val)-key; - */ -static int sub_idx(char *key, char val) { - int i; - - for (i = 0; *key && *key++ != val; i++); - return i; -} - -/* - * Encodes a compression header block into a generic cram_block structure. - * - * Returns cram_block ptr on success - * NULL on failure - */ -cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, - cram_block_compression_hdr *h) { - cram_block *cb = cram_new_block(COMPRESSION_HEADER, 0); - cram_block *map = cram_new_block(COMPRESSION_HEADER, 0); - int i, mc; - - if (!cb || !map) - return NULL; - - /* - * This is a concatenation of several blocks of data: - * header + landmarks, preservation map, read encoding map, and the tag - * encoding map. - * All 4 are variable sized and we need to know how large these are - * before creating the compression header itself as this starts with - * the total size (stored as a variable length string). - */ - - // Duplicated from container itself, and removed in 1.1 - if (CRAM_MAJOR_VERS(fd->version) == 1) { - itf8_put_blk(cb, h->ref_seq_id); - itf8_put_blk(cb, h->ref_seq_start); - itf8_put_blk(cb, h->ref_seq_span); - itf8_put_blk(cb, h->num_records); - itf8_put_blk(cb, h->num_landmarks); - for (i = 0; i < h->num_landmarks; i++) { - itf8_put_blk(cb, h->landmark[i]); - } - } - - if (h->preservation_map) - kh_destroy(map, h->preservation_map); - - /* Create in-memory preservation map */ - /* FIXME: should create this when we create the container */ - { - khint_t k; - int r; - - if (!(h->preservation_map = kh_init(map))) - return NULL; - - k = kh_put(map, h->preservation_map, "RN", &r); - if (-1 == r) return NULL; - kh_val(h->preservation_map, k).i = 1; - - if (CRAM_MAJOR_VERS(fd->version) == 1) { - k = kh_put(map, h->preservation_map, "PI", &r); - if (-1 == r) return NULL; - kh_val(h->preservation_map, k).i = 0; - - k = kh_put(map, h->preservation_map, "UI", &r); - if (-1 == r) return NULL; - kh_val(h->preservation_map, k).i = 1; - - k = kh_put(map, h->preservation_map, "MI", &r); - if (-1 == r) return NULL; - kh_val(h->preservation_map, k).i = 1; - - } else { - // Technically SM was in 1.0, but wasn't in Java impl. - k = kh_put(map, h->preservation_map, "SM", &r); - if (-1 == r) return NULL; - kh_val(h->preservation_map, k).i = 0; - - k = kh_put(map, h->preservation_map, "TD", &r); - if (-1 == r) return NULL; - kh_val(h->preservation_map, k).i = 0; - - k = kh_put(map, h->preservation_map, "AP", &r); - if (-1 == r) return NULL; - kh_val(h->preservation_map, k).i = h->AP_delta; - - if (fd->no_ref || fd->embed_ref) { - // Reference Required == No - k = kh_put(map, h->preservation_map, "RR", &r); - if (-1 == r) return NULL; - kh_val(h->preservation_map, k).i = 0; - } - } - } - - /* Encode preservation map; could collapse this and above into one */ - mc = 0; - BLOCK_SIZE(map) = 0; - if (h->preservation_map) { - khint_t k; - - for (k = kh_begin(h->preservation_map); - k != kh_end(h->preservation_map); - k++) { - const char *key; - khash_t(map) *pmap = h->preservation_map; - - - if (!kh_exist(pmap, k)) - continue; - - key = kh_key(pmap, k); - BLOCK_APPEND(map, key, 2); - - switch(CRAM_KEY(key[0], key[1])) { - case CRAM_KEY('M','I'): - BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); - break; - - case CRAM_KEY('U','I'): - BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); - break; - - case CRAM_KEY('P','I'): - BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); - break; - - case CRAM_KEY('A','P'): - BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); - break; - - case CRAM_KEY('R','N'): - BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); - break; - - case CRAM_KEY('R','R'): - BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); - break; - - case CRAM_KEY('S','M'): { - char smat[5], *mp = smat; - *mp++ = - (sub_idx("CGTN", h->substitution_matrix[0][0]) << 6) | - (sub_idx("CGTN", h->substitution_matrix[0][1]) << 4) | - (sub_idx("CGTN", h->substitution_matrix[0][2]) << 2) | - (sub_idx("CGTN", h->substitution_matrix[0][3]) << 0); - *mp++ = - (sub_idx("AGTN", h->substitution_matrix[1][0]) << 6) | - (sub_idx("AGTN", h->substitution_matrix[1][1]) << 4) | - (sub_idx("AGTN", h->substitution_matrix[1][2]) << 2) | - (sub_idx("AGTN", h->substitution_matrix[1][3]) << 0); - *mp++ = - (sub_idx("ACTN", h->substitution_matrix[2][0]) << 6) | - (sub_idx("ACTN", h->substitution_matrix[2][1]) << 4) | - (sub_idx("ACTN", h->substitution_matrix[2][2]) << 2) | - (sub_idx("ACTN", h->substitution_matrix[2][3]) << 0); - *mp++ = - (sub_idx("ACGN", h->substitution_matrix[3][0]) << 6) | - (sub_idx("ACGN", h->substitution_matrix[3][1]) << 4) | - (sub_idx("ACGN", h->substitution_matrix[3][2]) << 2) | - (sub_idx("ACGN", h->substitution_matrix[3][3]) << 0); - *mp++ = - (sub_idx("ACGT", h->substitution_matrix[4][0]) << 6) | - (sub_idx("ACGT", h->substitution_matrix[4][1]) << 4) | - (sub_idx("ACGT", h->substitution_matrix[4][2]) << 2) | - (sub_idx("ACGT", h->substitution_matrix[4][3]) << 0); - BLOCK_APPEND(map, smat, 5); - break; - } - - case CRAM_KEY('T','D'): { - itf8_put_blk(map, BLOCK_SIZE(h->TD_blk)); - BLOCK_APPEND(map, - BLOCK_DATA(h->TD_blk), - BLOCK_SIZE(h->TD_blk)); - break; - } - - default: - fprintf(stderr, "Unknown preservation key '%.2s'\n", key); - break; - } - - mc++; - } - } - itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); - itf8_put_blk(cb, mc); - BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map)); - - /* rec encoding map */ - mc = 0; - BLOCK_SIZE(map) = 0; - if (h->codecs[DS_BF]) { - if (-1 == h->codecs[DS_BF]->store(h->codecs[DS_BF], map, "BF", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_CF]) { - if (-1 == h->codecs[DS_CF]->store(h->codecs[DS_CF], map, "CF", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_RL]) { - if (-1 == h->codecs[DS_RL]->store(h->codecs[DS_RL], map, "RL", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_AP]) { - if (-1 == h->codecs[DS_AP]->store(h->codecs[DS_AP], map, "AP", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_RG]) { - if (-1 == h->codecs[DS_RG]->store(h->codecs[DS_RG], map, "RG", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_MF]) { - if (-1 == h->codecs[DS_MF]->store(h->codecs[DS_MF], map, "MF", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_NS]) { - if (-1 == h->codecs[DS_NS]->store(h->codecs[DS_NS], map, "NS", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_NP]) { - if (-1 == h->codecs[DS_NP]->store(h->codecs[DS_NP], map, "NP", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_TS]) { - if (-1 == h->codecs[DS_TS]->store(h->codecs[DS_TS], map, "TS", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_NF]) { - if (-1 == h->codecs[DS_NF]->store(h->codecs[DS_NF], map, "NF", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_TC]) { - if (-1 == h->codecs[DS_TC]->store(h->codecs[DS_TC], map, "TC", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_TN]) { - if (-1 == h->codecs[DS_TN]->store(h->codecs[DS_TN], map, "TN", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_TL]) { - if (-1 == h->codecs[DS_TL]->store(h->codecs[DS_TL], map, "TL", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_FN]) { - if (-1 == h->codecs[DS_FN]->store(h->codecs[DS_FN], map, "FN", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_FC]) { - if (-1 == h->codecs[DS_FC]->store(h->codecs[DS_FC], map, "FC", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_FP]) { - if (-1 == h->codecs[DS_FP]->store(h->codecs[DS_FP], map, "FP", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_BS]) { - if (-1 == h->codecs[DS_BS]->store(h->codecs[DS_BS], map, "BS", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_IN]) { - if (-1 == h->codecs[DS_IN]->store(h->codecs[DS_IN], map, "IN", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_DL]) { - if (-1 == h->codecs[DS_DL]->store(h->codecs[DS_DL], map, "DL", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_BA]) { - if (-1 == h->codecs[DS_BA]->store(h->codecs[DS_BA], map, "BA", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_BB]) { - if (-1 == h->codecs[DS_BB]->store(h->codecs[DS_BB], map, "BB", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_MQ]) { - if (-1 == h->codecs[DS_MQ]->store(h->codecs[DS_MQ], map, "MQ", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_RN]) { - if (-1 == h->codecs[DS_RN]->store(h->codecs[DS_RN], map, "RN", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_QS]) { - if (-1 == h->codecs[DS_QS]->store(h->codecs[DS_QS], map, "QS", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_QQ]) { - if (-1 == h->codecs[DS_QQ]->store(h->codecs[DS_QQ], map, "QQ", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_RI]) { - if (-1 == h->codecs[DS_RI]->store(h->codecs[DS_RI], map, "RI", - fd->version)) - return NULL; - mc++; - } - if (CRAM_MAJOR_VERS(fd->version) != 1) { - if (h->codecs[DS_SC]) { - if (-1 == h->codecs[DS_SC]->store(h->codecs[DS_SC], map, "SC", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_RS]) { - if (-1 == h->codecs[DS_RS]->store(h->codecs[DS_RS], map, "RS", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_PD]) { - if (-1 == h->codecs[DS_PD]->store(h->codecs[DS_PD], map, "PD", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_HC]) { - if (-1 == h->codecs[DS_HC]->store(h->codecs[DS_HC], map, "HC", - fd->version)) - return NULL; - mc++; - } - } - if (h->codecs[DS_TM]) { - if (-1 == h->codecs[DS_TM]->store(h->codecs[DS_TM], map, "TM", - fd->version)) - return NULL; - mc++; - } - if (h->codecs[DS_TV]) { - if (-1 == h->codecs[DS_TV]->store(h->codecs[DS_TV], map, "TV", - fd->version)) - return NULL; - mc++; - } - itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); - itf8_put_blk(cb, mc); - BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map)); - - /* tag encoding map */ -#if 0 - mp = map; mc = 0; - if (h->tag_encoding_map) { - HashItem *hi; - HashIter *iter = HashTableIterCreate(); - if (!iter) - return NULL; - - while ((hi = HashTableIterNext(h->tag_encoding_map, iter))) { - cram_map *m = hi->data.p; - int sz; - - mp += itf8_put(mp, (hi->key[0]<<16)|(hi->key[1]<<8)|hi->key[2]); - if (-1 == (sz = m->codec->store(m->codec, mp, NULL, fd->version))) - return NULL; - mp += sz; - mc++; - } - - HashTableIterDestroy(iter); - } -#else - mc = 0; - BLOCK_SIZE(map) = 0; - if (c->tags_used) { - khint_t k; - -#define TAG_ID(a) ((#a[0]<<8)+#a[1]) - - for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) { - int key; - if (!kh_exist(c->tags_used, k)) - continue; - - mc++; - itf8_put_blk(map, kh_key(c->tags_used, k)); - - // use block content id 4 - switch((key = kh_key(c->tags_used, k)) & 0xff) { - case 'Z': case 'H': - // string as byte_array_stop - if (CRAM_MAJOR_VERS(fd->version) == 1) { - BLOCK_APPEND(map, - "\005" // BYTE_ARRAY_STOP - "\005" // len - "\t" // stop-byte is also SAM separator - DS_aux_S "\000\000\000", - 7); - } else { - if (key>>8 == TAG_ID(OQ)) - BLOCK_APPEND(map, - "\005" // BYTE_ARRAY_STOP - "\002" // len - "\t" // stop-byte is also SAM separator - DS_aux_OQ_S, - 4); - else if (key>>8 == TAG_ID(BQ)) - BLOCK_APPEND(map, - "\005" // BYTE_ARRAY_STOP - "\002" // len - "\t" // stop-byte is also SAM separator - DS_aux_BQ_S, - 4); - else if (key>>8 == TAG_ID(BD)) - BLOCK_APPEND(map, - "\005" // BYTE_ARRAY_STOP - "\002" // len - "\t" // stop-byte is also SAM separator - DS_aux_BD_S, - 4); - else if (key>>8 == TAG_ID(BI)) - BLOCK_APPEND(map, - "\005" // BYTE_ARRAY_STOP - "\002" // len - "\t" // stop-byte is also SAM separator - DS_aux_BI_S, - 4); - else if ((key>>8 == TAG_ID(Q2)) || - (key>>8 == TAG_ID(U2)) || - (key>>8 == TAG_ID(QT)) || - (key>>8 == TAG_ID(CQ))) - BLOCK_APPEND(map, - "\005" // BYTE_ARRAY_STOP - "\002" // len - "\t" // stop-byte is also SAM separator - DS_aux_oq_S, - 4); - else if ((key>>8 == TAG_ID(R2)) || - (key>>8 == TAG_ID(E2)) || - (key>>8 == TAG_ID(CS)) || - (key>>8 == TAG_ID(BC)) || - (key>>8 == TAG_ID(RT))) - BLOCK_APPEND(map, - "\005" // BYTE_ARRAY_STOP - "\002" // len - "\t" // stop-byte is also SAM separator - DS_aux_os_S, - 4); - else - BLOCK_APPEND(map, - "\005" // BYTE_ARRAY_STOP - "\002" // len - "\t" // stop-byte is also SAM separator - DS_aux_oz_S, - 4); - } - break; - - case 'A': case 'c': case 'C': - // byte array len, 1 byte - BLOCK_APPEND(map, - "\004" // BYTE_ARRAY_LEN - "\011" // length - "\003" // HUFFMAN (len) - "\004" // huffman-len - "\001" // 1 symbol - "\001" // symbol=1 byte value - "\001" // 1 length - "\000" // length=0 - "\001" // EXTERNAL (val) - "\001" // external-len - DS_aux_S,// content-id - 11); - break; - - case 's': case 'S': - // byte array len, 2 byte - BLOCK_APPEND(map, - "\004" // BYTE_ARRAY_LEN - "\011" // length - "\003" // HUFFMAN (len) - "\004" // huffman-len - "\001" // 1 symbol - "\002" // symbol=2 byte value - "\001" // 1 length - "\000" // length=0 - "\001" // EXTERNAL (val) - "\001" // external-len - DS_aux_S,// content-id - 11); - break; - - case 'i': case 'I': case 'f': - // byte array len, 4 byte - BLOCK_APPEND(map, - "\004" // BYTE_ARRAY_LEN - "\011" // length - "\003" // HUFFMAN (len) - "\004" // huffman-len - "\001" // 1 symbol - "\004" // symbol=4 byte value - "\001" // 1 length - "\000" // length=0 - "\001" // EXTERNAL (val) - "\001" // external-len - DS_aux_S,// content-id - 11); - break; - - case 'B': - // Byte array of variable size, but we generate our tag - // byte stream at the wrong stage (during reading and not - // after slice header construction). So we use - // BYTE_ARRAY_LEN with the length codec being external - // too. - if ((key>>8 == TAG_ID(FZ)) || (key>>8 == TAG_ID(ZM))) - BLOCK_APPEND(map, - "\004" // BYTE_ARRAY_LEN - "\006" // length - "\001" // EXTERNAL (len) - "\001" // external-len - DS_aux_FZ_S // content-id - "\001" // EXTERNAL (val) - "\001" // external-len - DS_aux_FZ_S,// content-id - 8); - else - BLOCK_APPEND(map, - "\004" // BYTE_ARRAY_LEN - "\006" // length - "\001" // EXTERNAL (len) - "\001" // external-len - DS_aux_S // content-id - "\001" // EXTERNAL (val) - "\001" // external-len - DS_aux_S,// content-id - 8); - break; - - default: - fprintf(stderr, "Unsupported SAM aux type '%c'\n", - kh_key(c->tags_used, k) & 0xff); - } - //mp += m->codec->store(m->codec, mp, NULL, fd->version); - } - } -#endif - itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); - itf8_put_blk(cb, mc); - BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map)); - - if (fd->verbose) - fprintf(stderr, "Wrote compression block header in %d bytes\n", - (int)BLOCK_SIZE(cb)); - - BLOCK_UPLEN(cb); - - cram_free_block(map); - - return cb; -} - - -/* - * Encodes a slice compression header. - * - * Returns cram_block on success - * NULL on failure - */ -cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) { - char *buf; - char *cp; - cram_block *b = cram_new_block(MAPPED_SLICE, 0); - int j; - - if (!b) - return NULL; - - if (NULL == (cp = buf = malloc(16+5*(8+s->hdr->num_blocks)))) { - cram_free_block(b); - return NULL; - } - - cp += itf8_put(cp, s->hdr->ref_seq_id); - cp += itf8_put(cp, s->hdr->ref_seq_start); - cp += itf8_put(cp, s->hdr->ref_seq_span); - cp += itf8_put(cp, s->hdr->num_records); - if (CRAM_MAJOR_VERS(fd->version) == 2) - cp += itf8_put(cp, s->hdr->record_counter); - else if (CRAM_MAJOR_VERS(fd->version) >= 3) - cp += ltf8_put(cp, s->hdr->record_counter); - cp += itf8_put(cp, s->hdr->num_blocks); - cp += itf8_put(cp, s->hdr->num_content_ids); - for (j = 0; j < s->hdr->num_content_ids; j++) { - cp += itf8_put(cp, s->hdr->block_content_ids[j]); - } - if (s->hdr->content_type == MAPPED_SLICE) - cp += itf8_put(cp, s->hdr->ref_base_id); - - if (CRAM_MAJOR_VERS(fd->version) != 1) { - memcpy(cp, s->hdr->md5, 16); cp += 16; - } - - assert(cp-buf <= 16+5*(8+s->hdr->num_blocks)); - - b->data = (unsigned char *)buf; - b->comp_size = b->uncomp_size = cp-buf; - - return b; -} - - -/* - * Encodes a single read. - * - * Returns 0 on success - * -1 on failure - */ -static int cram_encode_slice_read(cram_fd *fd, - cram_container *c, - cram_block_compression_hdr *h, - cram_slice *s, - cram_record *cr, - int *last_pos) { - int r = 0; - int32_t i32; - unsigned char uc; - - //fprintf(stderr, "Encode seq %d, %d/%d FN=%d, %s\n", rec, core->byte, core->bit, cr->nfeature, s->name_ds->str + cr->name); - - //printf("BF=0x%x\n", cr->flags); - // bf = cram_flag_swap[cr->flags]; - i32 = fd->cram_flag_swap[cr->flags & 0xfff]; - r |= h->codecs[DS_BF]->encode(s, h->codecs[DS_BF], (char *)&i32, 1); - - i32 = cr->cram_flags; - r |= h->codecs[DS_CF]->encode(s, h->codecs[DS_CF], (char *)&i32, 1); - - if (CRAM_MAJOR_VERS(fd->version) != 1 && s->hdr->ref_seq_id == -2) - r |= h->codecs[DS_RI]->encode(s, h->codecs[DS_RI], (char *)&cr->ref_id, 1); - - r |= h->codecs[DS_RL]->encode(s, h->codecs[DS_RL], (char *)&cr->len, 1); - - if (c->pos_sorted) { - i32 = cr->apos - *last_pos; - r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); - *last_pos = cr->apos; - } else { - i32 = cr->apos; - r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); - } - - r |= h->codecs[DS_RG]->encode(s, h->codecs[DS_RG], (char *)&cr->rg, 1); - - if (c->comp_hdr->read_names_included) { - // RN codec: Already stored in block[3]. - } - - if (cr->cram_flags & CRAM_FLAG_DETACHED) { - i32 = cr->mate_flags; - r |= h->codecs[DS_MF]->encode(s, h->codecs[DS_MF], (char *)&i32, 1); - - if (!c->comp_hdr->read_names_included) { - // RN codec: Already stored in block[3]. - } - - r |= h->codecs[DS_NS]->encode(s, h->codecs[DS_NS], - (char *)&cr->mate_ref_id, 1); - - r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP], - (char *)&cr->mate_pos, 1); - - r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], - (char *)&cr->tlen, 1); - } else if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) { - r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF], - (char *)&cr->mate_line, 1); - } - - /* Aux tags */ - if (CRAM_MAJOR_VERS(fd->version) == 1) { - int j; - uc = cr->ntags; - r |= h->codecs[DS_TC]->encode(s, h->codecs[DS_TC], (char *)&uc, 1); - - for (j = 0; j < cr->ntags; j++) { - uint32_t i32 = s->TN[cr->TN_idx + j]; // id - r |= h->codecs[DS_TN]->encode(s, h->codecs[DS_TN], (char *)&i32, 1); - } - } else { - r |= h->codecs[DS_TL]->encode(s, h->codecs[DS_TL], (char *)&cr->TL, 1); - } - - // qual - // QS codec : Already stored in block[2]. - - // features (diffs) - if (!(cr->flags & BAM_FUNMAP)) { - int prev_pos = 0, j; - - r |= h->codecs[DS_FN]->encode(s, h->codecs[DS_FN], - (char *)&cr->nfeature, 1); - for (j = 0; j < cr->nfeature; j++) { - cram_feature *f = &s->features[cr->feature + j]; - - uc = f->X.code; - r |= h->codecs[DS_FC]->encode(s, h->codecs[DS_FC], (char *)&uc, 1); - i32 = f->X.pos - prev_pos; - r |= h->codecs[DS_FP]->encode(s, h->codecs[DS_FP], (char *)&i32, 1); - prev_pos = f->X.pos; - - switch(f->X.code) { - //char *seq; - - case 'X': - //fprintf(stderr, " FC=%c FP=%d base=%d\n", f->X.code, i32, f->X.base); - - uc = f->X.base; - r |= h->codecs[DS_BS]->encode(s, h->codecs[DS_BS], - (char *)&uc, 1); - break; - case 'S': - // Already done -// r |= h->codecs[DS_SC]->encode(s, h->codecs[DS_SC], -// BLOCK_DATA(s->soft_blk) + f->S.seq_idx, -// f->S.len); - -// if (IS_CRAM_3_VERS(fd)) { -// r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB], -// BLOCK_DATA(s->seqs_blk) + f->S.seq_idx, -// f->S.len); -// } - break; - case 'I': - //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx; - //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN], - // seq, f->S.len); -// if (IS_CRAM_3_VERS(fd)) { -// r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB], -// BLOCK_DATA(s->seqs_blk) + f->I.seq_idx, -// f->I.len); -// } - break; - case 'i': - uc = f->i.base; - r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], - (char *)&uc, 1); - //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx; - //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN], - // seq, 1); - break; - case 'D': - i32 = f->D.len; - r |= h->codecs[DS_DL]->encode(s, h->codecs[DS_DL], - (char *)&i32, 1); - break; - - case 'B': - // // Used when we try to store a non ACGTN base or an N - // // that aligns against a non ACGTN reference - - uc = f->B.base; - r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], - (char *)&uc, 1); - - // Already added - // uc = f->B.qual; - // r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS], - // (char *)&uc, 1); - break; - - case 'b': - // string of bases - r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB], - (char *)BLOCK_DATA(s->seqs_blk) - + f->b.seq_idx, - f->b.len); - break; - - case 'Q': - // Already added - // uc = f->B.qual; - // r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS], - // (char *)&uc, 1); - break; - - case 'N': - i32 = f->N.len; - r |= h->codecs[DS_RS]->encode(s, h->codecs[DS_RS], - (char *)&i32, 1); - break; - - case 'P': - i32 = f->P.len; - r |= h->codecs[DS_PD]->encode(s, h->codecs[DS_PD], - (char *)&i32, 1); - break; - - case 'H': - i32 = f->H.len; - r |= h->codecs[DS_HC]->encode(s, h->codecs[DS_HC], - (char *)&i32, 1); - break; - - - default: - fprintf(stderr, "unhandled feature code %c\n", - f->X.code); - return -1; - } - } - - r |= h->codecs[DS_MQ]->encode(s, h->codecs[DS_MQ], - (char *)&cr->mqual, 1); - } else { - char *seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq; - if (cr->len) - r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], seq, cr->len); - } - - return r ? -1 : 0; -} - - -/* - * Applies various compression methods to specific blocks, depending on - * known observations of how data series compress. - * - * Returns 0 on success - * -1 on failure - */ -static int cram_compress_slice(cram_fd *fd, cram_slice *s) { - int level = fd->level, i; - int method = 1< 5 && s->block[0]->uncomp_size > 500) - cram_compress_block(fd, s->block[0], NULL, GZIP, 1); - - if (fd->use_bz2) - method |= 1<use_rans) - method |= (1<use_lzma) - method |= (1<= 6) - methodF = method; - - - /* Specific compression methods for certain block types */ - if (cram_compress_block(fd, s->block[DS_IN], fd->m[DS_IN], //IN (seq) - method, level)) - return -1; - - if (fd->level == 0) { - /* Do nothing */ - } else if (fd->level == 1) { - if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS], - methodF, 1)) - return -1; - for (i = DS_aux; i <= DS_aux_oz; i++) { - if (s->block[i]) - if (cram_compress_block(fd, s->block[i], fd->m[i], - method, 1)) - return -1; - } - } else if (fd->level < 3) { - if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS], - method, 1)) - return -1; - if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA], - method, 1)) - return -1; - if (s->block[DS_BB]) - if (cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB], - method, 1)) - return -1; - for (i = DS_aux; i <= DS_aux_oz; i++) { - if (s->block[i]) - if (cram_compress_block(fd, s->block[i], fd->m[i], - method, level)) - return -1; - } - } else { - if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS], - method, level)) - return -1; - if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA], - method, level)) - return -1; - if (s->block[DS_BB]) - if (cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB], - method, level)) - return -1; - for (i = DS_aux; i <= DS_aux_oz; i++) { - if (s->block[i]) - if (cram_compress_block(fd, s->block[i], fd->m[i], - method, level)) - return -1; - } - } - - // NAME: best is generally xz, bzip2, zlib then rans1 - // It benefits well from a little bit extra compression level. - if (cram_compress_block(fd, s->block[DS_RN], fd->m[DS_RN], - method & ~(1<block[DS_NS] != s->block[0]) - if (cram_compress_block(fd, s->block[DS_NS], fd->m[DS_NS], - method, level)) - return -1; - - - /* - * Minimal compression of any block still uncompressed, bar CORE - */ - { - int i; - for (i = 1; i < DS_END; i++) { - if (!s->block[i] || s->block[i] == s->block[0]) - continue; - - // fast methods only - if (s->block[i]->method == RAW) { - cram_compress_block(fd, s->block[i], fd->m[i], - methodF, level); - } - } - } - - return 0; -} - -/* - * Encodes a single slice from a container - * - * Returns 0 on success - * -1 on failure - */ -static int cram_encode_slice(cram_fd *fd, cram_container *c, - cram_block_compression_hdr *h, cram_slice *s) { - int rec, r = 0, last_pos; - int embed_ref; - enum cram_DS_ID id; - - embed_ref = fd->embed_ref && s->hdr->ref_seq_id != -1 ? 1 : 0; - - /* - * Slice external blocks: - * ID 0 => base calls (insertions, soft-clip) - * ID 1 => qualities - * ID 2 => names - * ID 3 => TS (insert size), NP (next frag) - * ID 4 => tag values - * ID 6 => tag IDs (TN), if CRAM_V1.0 - * ID 7 => TD tag dictionary, if !CRAM_V1.0 - */ - - /* Create cram slice header */ - s->hdr->ref_base_id = embed_ref ? DS_ref : -1; - s->hdr->record_counter = c->num_records + c->record_counter; - c->num_records += s->hdr->num_records; - - s->block = calloc(DS_END, sizeof(s->block[0])); - s->hdr->block_content_ids = malloc(DS_END * sizeof(int32_t)); - if (!s->block || !s->hdr->block_content_ids) - return -1; - - // Create first fixed blocks, always external. - // CORE - if (!(s->block[0] = cram_new_block(CORE, 0))) - return -1; - - // TN block for CRAM v1 - if (CRAM_MAJOR_VERS(fd->version) == 1) { - if (h->codecs[DS_TN]->codec == E_EXTERNAL) { - if (!(s->block[DS_TN] = cram_new_block(EXTERNAL,DS_TN))) return -1; - h->codecs[DS_TN]->external.content_id = DS_TN; - } else { - s->block[DS_TN] = s->block[0]; - } - s->block[DS_TN] = s->block[DS_TN]; - } - - // Embedded reference - if (embed_ref) { - if (!(s->block[DS_ref] = cram_new_block(EXTERNAL, DS_ref))) - return -1; - s->ref_id = DS_ref; // needed? - BLOCK_APPEND(s->block[DS_ref], - c->ref + c->first_base - c->ref_start, - c->last_base - c->first_base + 1); - } - - /* - * All the data-series blocks if appropriate. - */ - for (id = DS_BF; id < DS_TN; id++) { - if (h->codecs[id] && (h->codecs[id]->codec == E_EXTERNAL || - h->codecs[id]->codec == E_BYTE_ARRAY_STOP || - h->codecs[id]->codec == E_BYTE_ARRAY_LEN)) { - switch (h->codecs[id]->codec) { - case E_EXTERNAL: - if (!(s->block[id] = cram_new_block(EXTERNAL, id))) - return -1; - h->codecs[id]->external.content_id = id; - break; - - case E_BYTE_ARRAY_STOP: - if (!(s->block[id] = cram_new_block(EXTERNAL, id))) - return -1; - h->codecs[id]->byte_array_stop.content_id = id; - break; - - case E_BYTE_ARRAY_LEN: { - cram_codec *cc; - - cc = h->codecs[id]->e_byte_array_len.len_codec; - if (cc->codec == E_EXTERNAL) { - int eid = cc->external.content_id; - if (!(s->block[eid] = cram_new_block(EXTERNAL, eid))) - return -1; - cc->external.content_id = eid; - cc->out = s->block[eid]; - } - - cc = h->codecs[id]->e_byte_array_len.val_codec; - if (cc->codec == E_EXTERNAL) { - int eid = cc->external.content_id; - if (!s->block[eid]) - if (!(s->block[eid] = cram_new_block(EXTERNAL, eid))) - return -1; - cc->external.content_id = eid; - cc->out = s->block[eid]; - } - break; - } - default: - break; - } - } else { - if (!(id == DS_BB && !h->codecs[DS_BB])) - s->block[id] = s->block[0]; - } - if (h->codecs[id]) - h->codecs[id]->out = s->block[id]; - } - - /* Encode reads */ - last_pos = s->hdr->ref_seq_start; - for (rec = 0; rec < s->hdr->num_records; rec++) { - cram_record *cr = &s->crecs[rec]; - if (cram_encode_slice_read(fd, c, h, s, cr, &last_pos) == -1) - return -1; - } - - s->block[0]->uncomp_size = s->block[0]->byte + (s->block[0]->bit < 7); - s->block[0]->comp_size = s->block[0]->uncomp_size; - - // Make sure the fixed blocks point to the correct sources - s->block[DS_IN] = s->base_blk; s->base_blk = NULL; - s->block[DS_QS] = s->qual_blk; s->qual_blk = NULL; - s->block[DS_RN] = s->name_blk; s->name_blk = NULL; - s->block[DS_SC] = s->soft_blk; s->soft_blk = NULL; - s->block[DS_aux]= s->aux_blk; s->aux_blk = NULL; - s->block[DS_aux_OQ]= s->aux_OQ_blk; s->aux_OQ_blk = NULL; - s->block[DS_aux_BQ]= s->aux_BQ_blk; s->aux_BQ_blk = NULL; - s->block[DS_aux_BD]= s->aux_BD_blk; s->aux_BD_blk = NULL; - s->block[DS_aux_BI]= s->aux_BI_blk; s->aux_BI_blk = NULL; - s->block[DS_aux_FZ]= s->aux_FZ_blk; s->aux_FZ_blk = NULL; - s->block[DS_aux_oq]= s->aux_oq_blk; s->aux_oq_blk = NULL; - s->block[DS_aux_os]= s->aux_os_blk; s->aux_os_blk = NULL; - s->block[DS_aux_oz]= s->aux_oz_blk; s->aux_oz_blk = NULL; - - // Ensure block sizes are up to date. - for (id = 1; id < DS_END; id++) { - if (!s->block[id] || s->block[id] == s->block[0]) - continue; - - if (s->block[id]->uncomp_size == 0) - BLOCK_UPLEN(s->block[id]); - } - - // Compress it all - if (cram_compress_slice(fd, s) == -1) - return -1; - - // Collapse empty blocks and create hdr_block - { - int i, j; - for (i = j = 1; i < DS_END; i++) { - if (!s->block[i] || s->block[i] == s->block[0]) - continue; - if (s->block[i]->uncomp_size == 0) { - cram_free_block(s->block[i]); - s->block[i] = NULL; - continue; - } - s->block[j] = s->block[i]; - s->hdr->block_content_ids[j-1] = s->block[i]->content_id; - j++; - } - s->hdr->num_content_ids = j-1; - s->hdr->num_blocks = j; - - if (!(s->hdr_block = cram_encode_slice_header(fd, s))) - return -1; - } - - return r ? -1 : 0; -} - -/* - * Encodes all slices in a container into blocks. - * Returns 0 on success - * -1 on failure - */ -int cram_encode_container(cram_fd *fd, cram_container *c) { - int i, j, slice_offset; - cram_block_compression_hdr *h = c->comp_hdr; - cram_block *c_hdr; - int multi_ref = 0; - int r1, r2, sn, nref; - spare_bams *spares; - - /* Cache references up-front if we have unsorted access patterns */ - pthread_mutex_lock(&fd->ref_lock); - nref = fd->refs->nref; - pthread_mutex_unlock(&fd->ref_lock); - - if (!fd->no_ref && c->refs_used) { - for (i = 0; i < nref; i++) { - if (c->refs_used[i]) - cram_get_ref(fd, i, 1, 0); - } - } - - /* To create M5 strings */ - /* Fetch reference sequence */ - if (!fd->no_ref) { - bam_seq_t *b = c->bams[0]; - char *ref; - - ref = cram_get_ref(fd, bam_ref(b), 1, 0); - if (!ref && bam_ref(b) >= 0) { - fprintf(stderr, "Failed to load reference #%d\n", bam_ref(b)); - return -1; - } - if ((c->ref_id = bam_ref(b)) >= 0) { - c->ref_seq_id = c->ref_id; - c->ref = fd->refs->ref_id[c->ref_seq_id]->seq; - c->ref_start = 1; - c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length; - } else { - c->ref_seq_id = c->ref_id; // FIXME remove one var! - } - } else { - c->ref_id = bam_ref(c->bams[0]); - cram_ref_incr(fd->refs, c->ref_id); - c->ref_seq_id = c->ref_id; - } - - /* Turn bams into cram_records and gather basic stats */ - for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) { - cram_slice *s = c->slices[sn]; - int first_base = INT_MAX, last_base = INT_MIN; - - assert(sn < c->curr_slice); - - /* FIXME: we could create our slice objects here too instead of - * in cram_put_bam_seq. It's more natural here and also this is - * bit is threaded so it's less work in the main thread. - */ - - for (r2 = 0; r1 < c->curr_c_rec && r2 < c->max_rec; r1++, r2++) { - cram_record *cr = &s->crecs[r2]; - bam_seq_t *b = c->bams[r1]; - - /* If multi-ref we need to cope with changing reference per seq */ - if (c->multi_seq && !fd->no_ref) { - if (bam_ref(b) != c->ref_seq_id && bam_ref(b) >= 0) { - if (c->ref_seq_id >= 0) - cram_ref_decr(fd->refs, c->ref_seq_id); - - if (!cram_get_ref(fd, bam_ref(b), 1, 0)) { - fprintf(stderr, "Failed to load reference #%d\n", - bam_ref(b)); - return -1; - } - - c->ref_seq_id = bam_ref(b); // overwritten later by -2 - assert(fd->refs->ref_id[c->ref_seq_id]->seq); - c->ref = fd->refs->ref_id[c->ref_seq_id]->seq; - c->ref_start = 1; - c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length; - } - } - - if (process_one_read(fd, c, s, cr, b, r2) != 0) - return -1; - - if (first_base > cr->apos) - first_base = cr->apos; - - if (last_base < cr->aend) - last_base = cr->aend; - } - - if (c->multi_seq) { - s->hdr->ref_seq_id = -2; - s->hdr->ref_seq_start = 0; - s->hdr->ref_seq_span = 0; - } else { - s->hdr->ref_seq_id = c->ref_id; - s->hdr->ref_seq_start = first_base; - s->hdr->ref_seq_span = last_base - first_base + 1; - } - s->hdr->num_records = r2; - } - - if (c->multi_seq && !fd->no_ref) { - if (c->ref_seq_id >= 0) - cram_ref_decr(fd->refs, c->ref_seq_id); - } - - /* Link our bams[] array onto the spare bam list for reuse */ - spares = malloc(sizeof(*spares)); - pthread_mutex_lock(&fd->bam_list_lock); - spares->bams = c->bams; - spares->next = fd->bl; - fd->bl = spares; - pthread_mutex_unlock(&fd->bam_list_lock); - c->bams = NULL; - - /* Detect if a multi-seq container */ - cram_stats_encoding(fd, c->stats[DS_RI]); - multi_ref = c->stats[DS_RI]->nvals > 1; - - if (multi_ref) { - if (fd->verbose) - fprintf(stderr, "Multi-ref container\n"); - c->ref_seq_id = -2; - c->ref_seq_start = 0; - c->ref_seq_span = 0; - } - - - /* Compute MD5s */ - for (i = 0; i < c->curr_slice; i++) { - cram_slice *s = c->slices[i]; - - if (CRAM_MAJOR_VERS(fd->version) != 1) { - if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !fd->no_ref) { - hts_md5_context *md5 = hts_md5_init(); - if (!md5) - return -1; - hts_md5_update(md5, - c->ref + s->hdr->ref_seq_start - c->ref_start, - s->hdr->ref_seq_span); - hts_md5_final(s->hdr->md5, md5); - hts_md5_destroy(md5); - } else { - memset(s->hdr->md5, 0, 16); - } - } - } - - c->num_records = 0; - c->num_blocks = 1; // cram_block_compression_hdr - c->length = 0; - - //fprintf(stderr, "=== BF ===\n"); - h->codecs[DS_BF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BF]), - c->stats[DS_BF], E_INT, NULL, - fd->version); - - //fprintf(stderr, "=== CF ===\n"); - h->codecs[DS_CF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_CF]), - c->stats[DS_CF], E_INT, NULL, - fd->version); -// fprintf(stderr, "=== RN ===\n"); -// h->codecs[DS_RN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RN]), -// c->stats[DS_RN], E_BYTE_ARRAY, NULL, -// fd->version); - - //fprintf(stderr, "=== AP ===\n"); - if (c->pos_sorted) { - h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]), - c->stats[DS_AP], E_INT, NULL, - fd->version); - } else { - int p[2] = {0, c->max_apos}; - h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL, E_INT, p, - fd->version); - } - - //fprintf(stderr, "=== RG ===\n"); - h->codecs[DS_RG] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RG]), - c->stats[DS_RG], E_INT, NULL, - fd->version); - - //fprintf(stderr, "=== MQ ===\n"); - h->codecs[DS_MQ] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MQ]), - c->stats[DS_MQ], E_INT, NULL, - fd->version); - - //fprintf(stderr, "=== NS ===\n"); - h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]), - c->stats[DS_NS], E_INT, NULL, - fd->version); - - //fprintf(stderr, "=== MF ===\n"); - h->codecs[DS_MF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MF]), - c->stats[DS_MF], E_INT, NULL, - fd->version); - - //fprintf(stderr, "=== TS ===\n"); - h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]), - c->stats[DS_TS], E_INT, NULL, - fd->version); - //fprintf(stderr, "=== NP ===\n"); - h->codecs[DS_NP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NP]), - c->stats[DS_NP], E_INT, NULL, - fd->version); - //fprintf(stderr, "=== NF ===\n"); - h->codecs[DS_NF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NF]), - c->stats[DS_NF], E_INT, NULL, - fd->version); - - //fprintf(stderr, "=== RL ===\n"); - h->codecs[DS_RL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RL]), - c->stats[DS_RL], E_INT, NULL, - fd->version); - - //fprintf(stderr, "=== FN ===\n"); - h->codecs[DS_FN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FN]), - c->stats[DS_FN], E_INT, NULL, - fd->version); - - //fprintf(stderr, "=== FC ===\n"); - h->codecs[DS_FC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FC]), - c->stats[DS_FC], E_BYTE, NULL, - fd->version); - - //fprintf(stderr, "=== FP ===\n"); - h->codecs[DS_FP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FP]), - c->stats[DS_FP], E_INT, NULL, - fd->version); - - //fprintf(stderr, "=== DL ===\n"); - h->codecs[DS_DL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_DL]), - c->stats[DS_DL], E_INT, NULL, - fd->version); - - //fprintf(stderr, "=== BA ===\n"); - h->codecs[DS_BA] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BA]), - c->stats[DS_BA], E_BYTE, NULL, - fd->version); - - if (CRAM_MAJOR_VERS(fd->version) >= 3) { - cram_byte_array_len_encoder e; - - e.len_encoding = E_EXTERNAL; - e.len_dat = (void *)DS_BB_len; - //e.len_dat = (void *)DS_BB; - - e.val_encoding = E_EXTERNAL; - e.val_dat = (void *)DS_BB; - - h->codecs[DS_BB] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL, - E_BYTE_ARRAY, (void *)&e, - fd->version); - } else { - h->codecs[DS_BB] = NULL; - } - - //fprintf(stderr, "=== BS ===\n"); - h->codecs[DS_BS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BS]), - c->stats[DS_BS], E_BYTE, NULL, - fd->version); - - if (CRAM_MAJOR_VERS(fd->version) == 1) { - h->codecs[DS_TL] = NULL; - h->codecs[DS_RI] = NULL; - h->codecs[DS_RS] = NULL; - h->codecs[DS_PD] = NULL; - h->codecs[DS_HC] = NULL; - h->codecs[DS_SC] = NULL; - - //fprintf(stderr, "=== TC ===\n"); - h->codecs[DS_TC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TC]), - c->stats[DS_TC], E_BYTE, NULL, - fd->version); - - //fprintf(stderr, "=== TN ===\n"); - h->codecs[DS_TN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TN]), - c->stats[DS_TN], E_INT, NULL, - fd->version); - } else { - h->codecs[DS_TC] = NULL; - h->codecs[DS_TN] = NULL; - - //fprintf(stderr, "=== TL ===\n"); - h->codecs[DS_TL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TL]), - c->stats[DS_TL], E_INT, NULL, - fd->version); - - - //fprintf(stderr, "=== RI ===\n"); - h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]), - c->stats[DS_RI], E_INT, NULL, - fd->version); - - //fprintf(stderr, "=== RS ===\n"); - h->codecs[DS_RS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RS]), - c->stats[DS_RS], E_INT, NULL, - fd->version); - - //fprintf(stderr, "=== PD ===\n"); - h->codecs[DS_PD] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_PD]), - c->stats[DS_PD], E_INT, NULL, - fd->version); - - //fprintf(stderr, "=== HC ===\n"); - h->codecs[DS_HC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_HC]), - c->stats[DS_HC], E_INT, NULL, - fd->version); - - //fprintf(stderr, "=== SC ===\n"); - if (1) { - int i2[2] = {0, DS_SC}; - - h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, - E_BYTE_ARRAY, (void *)i2, - fd->version); - } else { - // Appears to be no practical benefit to using this method, - // but it may work better if we start mixing SC, IN and BB - // elements into the same external block. - cram_byte_array_len_encoder e; - - e.len_encoding = E_EXTERNAL; - e.len_dat = (void *)DS_SC_len; - - e.val_encoding = E_EXTERNAL; - e.val_dat = (void *)DS_SC; - - h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL, - E_BYTE_ARRAY, (void *)&e, - fd->version); - } - } - - //fprintf(stderr, "=== IN ===\n"); - { - int i2[2] = {0, DS_IN}; - h->codecs[DS_IN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, - E_BYTE_ARRAY, (void *)i2, - fd->version); - } - - h->codecs[DS_QS] = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE, - (void *)DS_QS, - fd->version); - { - int i2[2] = {0, DS_RN}; - h->codecs[DS_RN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, - E_BYTE_ARRAY, (void *)i2, - fd->version); - } - - - /* Encode slices */ - for (i = 0; i < c->curr_slice; i++) { - if (fd->verbose) - fprintf(stderr, "Encode slice %d\n", i); - if (cram_encode_slice(fd, c, h, c->slices[i]) != 0) - return -1; - } - - /* Create compression header */ - { - h->ref_seq_id = c->ref_seq_id; - h->ref_seq_start = c->ref_seq_start; - h->ref_seq_span = c->ref_seq_span; - h->num_records = c->num_records; - - h->mapped_qs_included = 0; // fixme - h->unmapped_qs_included = 0; // fixme - h->AP_delta = c->pos_sorted; - // h->... fixme - memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20); - - if (!(c_hdr = cram_encode_compression_header(fd, c, h))) - return -1; - } - - /* Compute landmarks */ - /* Fill out slice landmarks */ - c->num_landmarks = c->curr_slice; - c->landmark = malloc(c->num_landmarks * sizeof(*c->landmark)); - if (!c->landmark) - return -1; - - /* - * Slice offset starts after the first block, so we need to simulate - * writing it to work out the correct offset - */ - { - slice_offset = c_hdr->method == RAW - ? c_hdr->uncomp_size - : c_hdr->comp_size; - slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + - itf8_size(c_hdr->content_id) + - itf8_size(c_hdr->comp_size) + - itf8_size(c_hdr->uncomp_size); - } - - c->ref_seq_id = c->slices[0]->hdr->ref_seq_id; - c->ref_seq_start = c->slices[0]->hdr->ref_seq_start; - c->ref_seq_span = c->slices[0]->hdr->ref_seq_span; - for (i = 0; i < c->curr_slice; i++) { - cram_slice *s = c->slices[i]; - - c->num_blocks += s->hdr->num_blocks + 1; // slice header - c->landmark[i] = slice_offset; - - if (s->hdr->ref_seq_start + s->hdr->ref_seq_span > - c->ref_seq_start + c->ref_seq_span) { - c->ref_seq_span = s->hdr->ref_seq_start + s->hdr->ref_seq_span - - c->ref_seq_start; - } - - slice_offset += s->hdr_block->method == RAW - ? s->hdr_block->uncomp_size - : s->hdr_block->comp_size; - - slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + - itf8_size(s->hdr_block->content_id) + - itf8_size(s->hdr_block->comp_size) + - itf8_size(s->hdr_block->uncomp_size); - - for (j = 0; j < s->hdr->num_blocks; j++) { - slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + - itf8_size(s->block[j]->content_id) + - itf8_size(s->block[j]->comp_size) + - itf8_size(s->block[j]->uncomp_size); - - slice_offset += s->block[j]->method == RAW - ? s->block[j]->uncomp_size - : s->block[j]->comp_size; - } - } - c->length += slice_offset; // just past the final slice - - c->comp_hdr_block = c_hdr; - - if (c->ref_seq_id >= 0) { - cram_ref_decr(fd->refs, c->ref_seq_id); - } - - /* Cache references up-front if we have unsorted access patterns */ - if (!fd->no_ref && c->refs_used) { - for (i = 0; i < fd->refs->nref; i++) { - if (c->refs_used[i]) - cram_ref_decr(fd->refs, i); - } - } - - return 0; -} - - -/* - * Adds a feature code to a read within a slice. For purposes of minimising - * memory allocations and fragmentation we have one array of features for all - * reads within the slice. We return the index into this array for this new - * feature. - * - * Returns feature index on success - * -1 on failure. - */ -static int cram_add_feature(cram_container *c, cram_slice *s, - cram_record *r, cram_feature *f) { - if (s->nfeatures >= s->afeatures) { - s->afeatures = s->afeatures ? s->afeatures*2 : 1024; - s->features = realloc(s->features, s->afeatures*sizeof(*s->features)); - if (!s->features) - return -1; - } - - if (!r->nfeature++) { - r->feature = s->nfeatures; - cram_stats_add(c->stats[DS_FP], f->X.pos); - } else { - cram_stats_add(c->stats[DS_FP], - f->X.pos - s->features[r->feature + r->nfeature-2].X.pos); - } - cram_stats_add(c->stats[DS_FC], f->X.code); - - s->features[s->nfeatures++] = *f; - - return 0; -} - -static int cram_add_substitution(cram_fd *fd, cram_container *c, - cram_slice *s, cram_record *r, - int pos, char base, char qual, char ref) { - cram_feature f; - - // seq=ACGTN vs ref=ACGT or seq=ACGT vs ref=ACGTN - if (fd->L2[(uc)base]<4 || (fd->L2[(uc)base]<5 && fd->L2[(uc)ref]<4)) { - f.X.pos = pos+1; - f.X.code = 'X'; - f.X.base = fd->cram_sub_matrix[ref&0x1f][base&0x1f]; - cram_stats_add(c->stats[DS_BS], f.X.base); - } else { - f.B.pos = pos+1; - f.B.code = 'B'; - f.B.base = base; - f.B.qual = qual; - cram_stats_add(c->stats[DS_BA], f.B.base); - cram_stats_add(c->stats[DS_QS], f.B.qual); - BLOCK_APPEND_CHAR(s->qual_blk, qual); - } - return cram_add_feature(c, s, r, &f); -} - -static int cram_add_bases(cram_fd *fd, cram_container *c, - cram_slice *s, cram_record *r, - int pos, int len, char *base) { - cram_feature f; - - f.b.pos = pos+1; - f.b.code = 'b'; - f.b.seq_idx = base - (char *)BLOCK_DATA(s->seqs_blk); - f.b.len = len; - - return cram_add_feature(c, s, r, &f); -} - -static int cram_add_base(cram_fd *fd, cram_container *c, - cram_slice *s, cram_record *r, - int pos, char base, char qual) { - cram_feature f; - f.B.pos = pos+1; - f.B.code = 'B'; - f.B.base = base; - f.B.qual = qual; - cram_stats_add(c->stats[DS_BA], base); - cram_stats_add(c->stats[DS_QS], qual); - BLOCK_APPEND_CHAR(s->qual_blk, qual); - return cram_add_feature(c, s, r, &f); -} - -static int cram_add_quality(cram_fd *fd, cram_container *c, - cram_slice *s, cram_record *r, - int pos, char qual) { - cram_feature f; - f.Q.pos = pos+1; - f.Q.code = 'Q'; - f.Q.qual = qual; - cram_stats_add(c->stats[DS_QS], qual); - BLOCK_APPEND_CHAR(s->qual_blk, qual); - return cram_add_feature(c, s, r, &f); -} - -static int cram_add_deletion(cram_container *c, cram_slice *s, cram_record *r, - int pos, int len, char *base) { - cram_feature f; - f.D.pos = pos+1; - f.D.code = 'D'; - f.D.len = len; - cram_stats_add(c->stats[DS_DL], len); - return cram_add_feature(c, s, r, &f); -} - -static int cram_add_softclip(cram_container *c, cram_slice *s, cram_record *r, - int pos, int len, char *base, int version) { - cram_feature f; - f.S.pos = pos+1; - f.S.code = 'S'; - f.S.len = len; - switch (CRAM_MAJOR_VERS(version)) { - case 1: - f.S.seq_idx = BLOCK_SIZE(s->base_blk); - BLOCK_APPEND(s->base_blk, base, len); - BLOCK_APPEND_CHAR(s->base_blk, '\0'); - break; - - case 2: - default: - f.S.seq_idx = BLOCK_SIZE(s->soft_blk); - if (base) { - BLOCK_APPEND(s->soft_blk, base, len); - } else { - int i; - for (i = 0; i < len; i++) - BLOCK_APPEND_CHAR(s->soft_blk, 'N'); - } - BLOCK_APPEND_CHAR(s->soft_blk, '\0'); - break; - -// default: -// // v3.0 onwards uses BB data-series -// f.S.seq_idx = BLOCK_SIZE(s->soft_blk); - } - return cram_add_feature(c, s, r, &f); -} - -static int cram_add_hardclip(cram_container *c, cram_slice *s, cram_record *r, - int pos, int len, char *base) { - cram_feature f; - f.S.pos = pos+1; - f.S.code = 'H'; - f.S.len = len; - cram_stats_add(c->stats[DS_HC], len); - return cram_add_feature(c, s, r, &f); -} - -static int cram_add_skip(cram_container *c, cram_slice *s, cram_record *r, - int pos, int len, char *base) { - cram_feature f; - f.S.pos = pos+1; - f.S.code = 'N'; - f.S.len = len; - cram_stats_add(c->stats[DS_RS], len); - return cram_add_feature(c, s, r, &f); -} - -static int cram_add_pad(cram_container *c, cram_slice *s, cram_record *r, - int pos, int len, char *base) { - cram_feature f; - f.S.pos = pos+1; - f.S.code = 'P'; - f.S.len = len; - cram_stats_add(c->stats[DS_PD], len); - return cram_add_feature(c, s, r, &f); -} - -static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r, - int pos, int len, char *base) { - cram_feature f; - f.I.pos = pos+1; - if (len == 1) { - char b = base ? *base : 'N'; - f.i.code = 'i'; - f.i.base = b; - cram_stats_add(c->stats[DS_BA], b); - } else { - f.I.code = 'I'; - f.I.len = len; - f.S.seq_idx = BLOCK_SIZE(s->base_blk); - if (base) { - BLOCK_APPEND(s->base_blk, base, len); - } else { - int i; - for (i = 0; i < len; i++) - BLOCK_APPEND_CHAR(s->base_blk, 'N'); - } - BLOCK_APPEND_CHAR(s->base_blk, '\0'); - } - return cram_add_feature(c, s, r, &f); -} - -/* - * Encodes auxiliary data. - * Returns the read-group parsed out of the BAM aux fields on success - * NULL on failure or no rg present (FIXME) - */ -static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c, - cram_slice *s, cram_record *cr) { - char *aux, *tmp, *rg = NULL; - int aux_size = bam_blk_size(b) - - ((char *)bam_aux(b) - (char *)&bam_ref(b)); - - /* Worst case is 1 nul char on every ??:Z: string, so +33% */ - BLOCK_GROW(s->aux_blk, aux_size*1.34+1); - tmp = (char *)BLOCK_END(s->aux_blk); - - aux = (char *)bam_aux(b); - cr->TN_idx = s->nTN; - - while (aux[0] != 0) { - int32_t i32; - int r; - - if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') { - rg = &aux[3]; - while (*aux++); - continue; - } - if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') { - while (*aux++); - continue; - } - if (aux[0] == 'N' && aux[1] == 'M') { - switch(aux[2]) { - case 'A': case 'C': case 'c': aux+=4; break; - case 'I': case 'i': case 'f': aux+=7; break; - default: - fprintf(stderr, "Unhandled type code for NM tag\n"); - return NULL; - } - continue; - } - - cr->ntags++; - - i32 = (aux[0]<<16) | (aux[1]<<8) | aux[2]; - kh_put(s_i2i, c->tags_used, i32, &r); - if (-1 == r) - return NULL; - - if (s->nTN >= s->aTN) { - s->aTN = s->aTN ? s->aTN*2 : 1024; - if (!(s->TN = realloc(s->TN, s->aTN * sizeof(*s->TN)))) - return NULL; - } - s->TN[s->nTN++] = i32; - cram_stats_add(c->stats[DS_TN], i32); - - switch(aux[2]) { - case 'A': case 'C': case 'c': - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - *tmp++=*aux++; - break; - - case 'S': case 's': - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - *tmp++=*aux++; *tmp++=*aux++; - break; - - case 'I': case 'i': case 'f': - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - break; - - case 'd': - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - break; - - case 'Z': case 'H': - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - while ((*tmp++=*aux++)); - *tmp++ = '\t'; // stop byte - break; - - case 'B': { - int type = aux[3], blen; - uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) + - (((unsigned char *)aux)[5]<< 8) + - (((unsigned char *)aux)[6]<<16) + - (((unsigned char *)aux)[7]<<24)); - // skip TN field - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - - // We use BYTE_ARRAY_LEN with external length, so store that first - switch (type) { - case 'c': case 'C': - blen = count; - break; - case 's': case 'S': - blen = 2*count; - break; - case 'i': case 'I': case 'f': - blen = 4*count; - break; - default: - fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n", - type); - return NULL; - - } - - tmp += itf8_put(tmp, blen+5); - - *tmp++=*aux++; // sub-type & length - *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - - // The tag data itself - memcpy(tmp, aux, blen); tmp += blen; aux += blen; - - //cram_stats_add(c->aux_B_stats, blen); - break; - } - default: - fprintf(stderr, "Unknown aux type '%c'\n", aux[2]); - return NULL; - } - } - cram_stats_add(c->stats[DS_TC], cr->ntags); - - cr->aux = BLOCK_SIZE(s->aux_blk); - cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux); - BLOCK_SIZE(s->aux_blk) = (uc *)tmp - BLOCK_DATA(s->aux_blk); - assert(s->aux_blk->byte <= s->aux_blk->alloc); - - return rg; -} - -/* - * Encodes auxiliary data. Largely duplicated from above, but done so to - * keep it simple and avoid a myriad of version ifs. - * - * Returns the read-group parsed out of the BAM aux fields on success - * NULL on failure or no rg present (FIXME) - */ -static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, - cram_slice *s, cram_record *cr) { - char *aux, *orig, *tmp, *rg = NULL; - int aux_size = bam_get_l_aux(b); - cram_block *td_b = c->comp_hdr->TD_blk; - int TD_blk_size = BLOCK_SIZE(td_b), new; - char *key; - khint_t k; - - - /* Worst case is 1 nul char on every ??:Z: string, so +33% */ - BLOCK_GROW(s->aux_blk, aux_size*1.34+1); - tmp = (char *)BLOCK_END(s->aux_blk); - - - orig = aux = (char *)bam_aux(b); - - // Copy aux keys to td_b and aux values to s->aux_blk - while (aux - orig < aux_size && aux[0] != 0) { - uint32_t i32; - int r; - - // RG:Z - if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') { - rg = &aux[3]; - while (*aux++); - continue; - } - - // MD:Z - if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') { - if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP)) { - while (*aux++); - continue; - } - } - - // NM:i - if (aux[0] == 'N' && aux[1] == 'M') { - if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP)) { - switch(aux[2]) { - case 'A': case 'C': case 'c': aux+=4; break; - case 'S': case 's': aux+=5; break; - case 'I': case 'i': case 'f': aux+=7; break; - default: - fprintf(stderr, "Unhandled type code for NM tag\n"); - return NULL; - } - continue; - } - } - - BLOCK_APPEND(td_b, aux, 3); - - i32 = (aux[0]<<16) | (aux[1]<<8) | aux[2]; - kh_put(s_i2i, c->tags_used, i32, &r); - if (-1 == r) - return NULL; - - // BQ:Z - if (aux[0] == 'B' && aux[1] == 'Q' && aux[2] == 'Z') { - char *tmp; - if (!s->aux_BQ_blk) - if (!(s->aux_BQ_blk = cram_new_block(EXTERNAL, DS_aux_BQ))) - return NULL; - BLOCK_GROW(s->aux_BQ_blk, aux_size*1.34+1); - tmp = (char *)BLOCK_END(s->aux_BQ_blk); - aux += 3; - while ((*tmp++=*aux++)); - *tmp++ = '\t'; - BLOCK_SIZE(s->aux_BQ_blk) = (uc *)tmp - BLOCK_DATA(s->aux_BQ_blk); - continue; - } - - // BD:Z - if (aux[0] == 'B' && aux[1]=='D' && aux[2] == 'Z') { - char *tmp; - if (!s->aux_BD_blk) - if (!(s->aux_BD_blk = cram_new_block(EXTERNAL, DS_aux_BD))) - return NULL; - BLOCK_GROW(s->aux_BD_blk, aux_size*1.34+1); - tmp = (char *)BLOCK_END(s->aux_BD_blk); - aux += 3; - while ((*tmp++=*aux++)); - *tmp++ = '\t'; - BLOCK_SIZE(s->aux_BD_blk) = (uc *)tmp - BLOCK_DATA(s->aux_BD_blk); - continue; - } - - // BI:Z - if (aux[0] == 'B' && aux[1]=='I' && aux[2] == 'Z') { - char *tmp; - if (!s->aux_BI_blk) - if (!(s->aux_BI_blk = cram_new_block(EXTERNAL, DS_aux_BI))) - return NULL; - BLOCK_GROW(s->aux_BI_blk, aux_size*1.34+1); - tmp = (char *)BLOCK_END(s->aux_BI_blk); - aux += 3; - while ((*tmp++=*aux++)); - *tmp++ = '\t'; - BLOCK_SIZE(s->aux_BI_blk) = (uc *)tmp - BLOCK_DATA(s->aux_BI_blk); - continue; - } - - // OQ:Z: - if (aux[0] == 'O' && aux[1] == 'Q' && aux[2] == 'Z') { - char *tmp; - if (!s->aux_OQ_blk) - if (!(s->aux_OQ_blk = cram_new_block(EXTERNAL, DS_aux_OQ))) - return NULL; - BLOCK_GROW(s->aux_OQ_blk, aux_size*1.34+1); - tmp = (char *)BLOCK_END(s->aux_OQ_blk); - aux += 3; - while ((*tmp++=*aux++)); - *tmp++ = '\t'; - BLOCK_SIZE(s->aux_OQ_blk) = (uc *)tmp - BLOCK_DATA(s->aux_OQ_blk); - continue; - } - - // FZ:B or ZM:B - if ((aux[0] == 'F' && aux[1] == 'Z' && aux[2] == 'B') || - (aux[0] == 'Z' && aux[1] == 'M' && aux[2] == 'B')) { - int type = aux[3], blen; - uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) + - (((unsigned char *)aux)[5]<< 8) + - (((unsigned char *)aux)[6]<<16) + - (((unsigned char *)aux)[7]<<24)); - char *tmp; - if (!s->aux_FZ_blk) - if (!(s->aux_FZ_blk = cram_new_block(EXTERNAL, DS_aux_FZ))) - return NULL; - BLOCK_GROW(s->aux_FZ_blk, aux_size*1.34+1); - tmp = (char *)BLOCK_END(s->aux_FZ_blk); - - // skip TN field - aux+=3; - - // We use BYTE_ARRAY_LEN with external length, so store that first - switch (type) { - case 'c': case 'C': - blen = count; - break; - case 's': case 'S': - blen = 2*count; - break; - case 'i': case 'I': case 'f': - blen = 4*count; - break; - default: - fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n", - type); - return NULL; - - } - - blen += 5; // sub-type & length - tmp += itf8_put(tmp, blen); - - // The tag data itself - memcpy(tmp, aux, blen); tmp += blen; aux += blen; - - BLOCK_SIZE(s->aux_FZ_blk) = (uc *)tmp - BLOCK_DATA(s->aux_FZ_blk); - continue; - } - - // Other quality data - {Q2,E2,U2,CQ}:Z and similar - if (((aux[0] == 'Q' && aux[1] == '2') || - (aux[0] == 'U' && aux[1] == '2') || - (aux[0] == 'Q' && aux[1] == 'T') || - (aux[0] == 'C' && aux[1] == 'Q')) && aux[2] == 'Z') { - char *tmp; - if (!s->aux_oq_blk) - if (!(s->aux_oq_blk = cram_new_block(EXTERNAL, DS_aux_oq))) - return NULL; - BLOCK_GROW(s->aux_oq_blk, aux_size*1.34+1); - tmp = (char *)BLOCK_END(s->aux_oq_blk); - aux += 3; - while ((*tmp++=*aux++)); - *tmp++ = '\t'; - BLOCK_SIZE(s->aux_oq_blk) = (uc *)tmp - BLOCK_DATA(s->aux_oq_blk); - continue; - } - - // Other sequence data - {R2,E2,CS,BC,RT}:Z and similar - if (((aux[0] == 'R' && aux[1] == '2') || - (aux[0] == 'E' && aux[1] == '2') || - (aux[0] == 'C' && aux[1] == 'S') || - (aux[0] == 'B' && aux[1] == 'C') || - (aux[0] == 'R' && aux[1] == 'T')) && aux[2] == 'Z') { - char *tmp; - if (!s->aux_os_blk) - if (!(s->aux_os_blk = cram_new_block(EXTERNAL, DS_aux_os))) - return NULL; - BLOCK_GROW(s->aux_os_blk, aux_size*1.34+1); - tmp = (char *)BLOCK_END(s->aux_os_blk); - aux += 3; - while ((*tmp++=*aux++)); - *tmp++ = '\t'; - BLOCK_SIZE(s->aux_os_blk) = (uc *)tmp - BLOCK_DATA(s->aux_os_blk); - continue; - } - - - switch(aux[2]) { - case 'A': case 'C': case 'c': - aux+=3; - *tmp++=*aux++; - break; - - case 'S': case 's': - aux+=3; - *tmp++=*aux++; *tmp++=*aux++; - break; - - case 'I': case 'i': case 'f': - aux+=3; - *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - break; - - case 'd': - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - break; - - case 'Z': case 'H': - { - char *tmp; - if (!s->aux_oz_blk) - if (!(s->aux_oz_blk = cram_new_block(EXTERNAL, DS_aux_oz))) - return NULL; - BLOCK_GROW(s->aux_oz_blk, aux_size*1.34+1); - tmp = (char *)BLOCK_END(s->aux_oz_blk); - aux += 3; - while ((*tmp++=*aux++)); - *tmp++ = '\t'; - BLOCK_SIZE(s->aux_oz_blk) = (uc *)tmp - - BLOCK_DATA(s->aux_oz_blk); - } - break; - - case 'B': { - int type = aux[3], blen; - uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) + - (((unsigned char *)aux)[5]<< 8) + - (((unsigned char *)aux)[6]<<16) + - (((unsigned char *)aux)[7]<<24)); - // skip TN field - aux+=3; - - // We use BYTE_ARRAY_LEN with external length, so store that first - switch (type) { - case 'c': case 'C': - blen = count; - break; - case 's': case 'S': - blen = 2*count; - break; - case 'i': case 'I': case 'f': - blen = 4*count; - break; - default: - fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n", - type); - return NULL; - - } - - blen += 5; // sub-type & length - tmp += itf8_put(tmp, blen); - - // The tag data itself - memcpy(tmp, aux, blen); tmp += blen; aux += blen; - - //cram_stats_add(c->aux_B_stats, blen); - break; - } - default: - fprintf(stderr, "Unknown aux type '%c'\n", aux[2]); - return NULL; - } - } - - // FIXME: sort BLOCK_DATA(td_b) by char[3] triples - - // And and increment TD hash entry - BLOCK_APPEND_CHAR(td_b, 0); - - // Duplicate key as BLOCK_DATA() can be realloced to a new pointer. - key = string_ndup(c->comp_hdr->TD_keys, - (char *)BLOCK_DATA(td_b) + TD_blk_size, - BLOCK_SIZE(td_b) - TD_blk_size); - k = kh_put(m_s2i, c->comp_hdr->TD_hash, key, &new); - if (new < 0) { - return NULL; - } else if (new == 0) { - BLOCK_SIZE(td_b) = TD_blk_size; - } else { - kh_val(c->comp_hdr->TD_hash, k) = c->comp_hdr->nTL; - c->comp_hdr->nTL++; - } - - cr->TL = kh_val(c->comp_hdr->TD_hash, k); - cram_stats_add(c->stats[DS_TL], cr->TL); - - cr->aux = BLOCK_SIZE(s->aux_blk); - cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux); - BLOCK_SIZE(s->aux_blk) = (uc *)tmp - BLOCK_DATA(s->aux_blk); - assert(s->aux_blk->byte <= s->aux_blk->alloc); - - return rg; -} - - -/* - * Handles creation of a new container or new slice, flushing any - * existing containers when appropriate. - * - * Really this is next slice, which may or may not lead to a new container. - * - * Returns cram_container pointer on success - * NULL on failure. - */ -static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { - cram_container *c = fd->ctr; - cram_slice *s; - int i; - - /* First occurence */ - if (c->curr_ref == -2) - c->curr_ref = bam_ref(b); - - if (c->slice) { - s = c->slice; - if (c->multi_seq) { - s->hdr->ref_seq_id = -2; - s->hdr->ref_seq_start = 0; - s->hdr->ref_seq_span = 0; - } else { - s->hdr->ref_seq_id = c->curr_ref; - s->hdr->ref_seq_start = c->first_base; - s->hdr->ref_seq_span = c->last_base - c->first_base + 1; - } - s->hdr->num_records = c->curr_rec; - - if (c->curr_slice == 0) { - if (c->ref_seq_id != s->hdr->ref_seq_id) - c->ref_seq_id = s->hdr->ref_seq_id; - c->ref_seq_start = c->first_base; - } - - c->curr_slice++; - } - - /* Flush container */ - if (c->curr_slice == c->max_slice || - (bam_ref(b) != c->curr_ref && !c->multi_seq)) { - c->ref_seq_span = fd->last_base - c->ref_seq_start + 1; - if (fd->verbose) - fprintf(stderr, "Flush container %d/%d..%d\n", - c->ref_seq_id, c->ref_seq_start, - c->ref_seq_start + c->ref_seq_span -1); - - /* Encode slices */ - if (fd->pool) { - if (-1 == cram_flush_container_mt(fd, c)) - return NULL; - } else { - if (-1 == cram_flush_container(fd, c)) - return NULL; - - // Move to sep func, as we need cram_flush_container for - // the closing phase to flush the partial container. - for (i = 0; i < c->max_slice; i++) { - cram_free_slice(c->slices[i]); - c->slices[i] = NULL; - } - - c->slice = NULL; - c->curr_slice = 0; - - /* Easy approach for purposes of freeing stats */ - cram_free_container(c); - } - - c = fd->ctr = cram_new_container(fd->seqs_per_slice, - fd->slices_per_container); - if (!c) - return NULL; - c->record_counter = fd->record_counter; - c->curr_ref = bam_ref(b); - } - - c->last_pos = c->first_base = c->last_base = bam_pos(b)+1; - - /* New slice */ - c->slice = c->slices[c->curr_slice] = - cram_new_slice(MAPPED_SLICE, c->max_rec); - if (!c->slice) - return NULL; - - if (c->multi_seq) { - c->slice->hdr->ref_seq_id = -2; - c->slice->hdr->ref_seq_start = 0; - c->slice->last_apos = 1; - } else { - c->slice->hdr->ref_seq_id = bam_ref(b); - // wrong for unsorted data, will fix during encoding. - c->slice->hdr->ref_seq_start = bam_pos(b)+1; - c->slice->last_apos = bam_pos(b)+1; - } - - c->curr_rec = 0; - - return c; -} - -/* - * Converts a single bam record into a cram record. - * Possibly used within a thread. - * - * Returns 0 on success; - * -1 on failure - */ -static int process_one_read(cram_fd *fd, cram_container *c, - cram_slice *s, cram_record *cr, - bam_seq_t *b, int rnum) { - int i, fake_qual = -1; - char *cp, *rg; - char *ref, *seq, *qual; - - // FIXME: multi-ref containers - - ref = c->ref; - cr->flags = bam_flag(b); - cr->len = bam_seq_len(b); - - //fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg); - - // Fields to resolve later - //cr->mate_line; // index to another cram_record - //cr->mate_flags; // MF - //cr->ntags; // TC - cr->ntags = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags); - if (CRAM_MAJOR_VERS(fd->version) == 1) - rg = cram_encode_aux_1_0(fd, b, c, s, cr); - else - rg = cram_encode_aux(fd, b, c, s, cr); - - //cr->aux_size = b->blk_size - ((char *)bam_aux(b) - (char *)&bam_ref(b)); - //cr->aux = DSTRING_LEN(s->aux_ds); - //dstring_nappend(s->aux_ds, bam_aux(b), cr->aux_size); - - /* Read group, identified earlier */ - if (rg) { - SAM_RG *brg = sam_hdr_find_rg(fd->header, rg); - cr->rg = brg ? brg->id : -1; - } else if (CRAM_MAJOR_VERS(fd->version) == 1) { - SAM_RG *brg = sam_hdr_find_rg(fd->header, "UNKNOWN"); - assert(brg); - } else { - cr->rg = -1; - } - cram_stats_add(c->stats[DS_RG], cr->rg); - - - cr->ref_id = bam_ref(b); cram_stats_add(c->stats[DS_RI], cr->ref_id); - cram_stats_add(c->stats[DS_BF], fd->cram_flag_swap[cr->flags & 0xfff]); - - // Non reference based encoding means storing the bases verbatim as features, which in - // turn means every base also has a quality already stored. - if (!fd->no_ref || CRAM_MAJOR_VERS(fd->version) >= 3) - cr->cram_flags = CRAM_FLAG_PRESERVE_QUAL_SCORES; - else - cr->cram_flags = 0; - - if (cr->len <= 0 && CRAM_MAJOR_VERS(fd->version) >= 3) - cr->cram_flags |= CRAM_FLAG_NO_SEQ; - //cram_stats_add(c->stats[DS_CF], cr->cram_flags); - - c->num_bases += cr->len; - cr->apos = bam_pos(b)+1; - if (c->pos_sorted) { - if (cr->apos < s->last_apos) { - c->pos_sorted = 0; - } else { - cram_stats_add(c->stats[DS_AP], cr->apos - s->last_apos); - s->last_apos = cr->apos; - } - } else { - //cram_stats_add(c->stats[DS_AP], cr->apos); - } - c->max_apos += (cr->apos > c->max_apos) * (cr->apos - c->max_apos); - - cr->name = BLOCK_SIZE(s->name_blk); - cr->name_len = bam_name_len(b); - cram_stats_add(c->stats[DS_RN], cr->name_len); - - BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b)); - - - /* - * This seqs_ds is largely pointless and it could reuse the same memory - * over and over. - * s->base_blk is what we need for encoding. - */ - cr->seq = BLOCK_SIZE(s->seqs_blk); - cr->qual = BLOCK_SIZE(s->qual_blk); - BLOCK_GROW(s->seqs_blk, cr->len+1); - BLOCK_GROW(s->qual_blk, cr->len); - seq = cp = (char *)BLOCK_END(s->seqs_blk); - - *seq = 0; -#ifdef ALLOW_UAC - { - // Convert seq 2 bases at a time for speed. - static const uint16_t code2base[256] = { - 15677, 16701, 17213, 19773, 18237, 21053, 21309, 22077, - 21565, 22333, 22845, 18493, 19261, 17469, 16957, 20029, - 15681, 16705, 17217, 19777, 18241, 21057, 21313, 22081, - 21569, 22337, 22849, 18497, 19265, 17473, 16961, 20033, - 15683, 16707, 17219, 19779, 18243, 21059, 21315, 22083, - 21571, 22339, 22851, 18499, 19267, 17475, 16963, 20035, - 15693, 16717, 17229, 19789, 18253, 21069, 21325, 22093, - 21581, 22349, 22861, 18509, 19277, 17485, 16973, 20045, - 15687, 16711, 17223, 19783, 18247, 21063, 21319, 22087, - 21575, 22343, 22855, 18503, 19271, 17479, 16967, 20039, - 15698, 16722, 17234, 19794, 18258, 21074, 21330, 22098, - 21586, 22354, 22866, 18514, 19282, 17490, 16978, 20050, - 15699, 16723, 17235, 19795, 18259, 21075, 21331, 22099, - 21587, 22355, 22867, 18515, 19283, 17491, 16979, 20051, - 15702, 16726, 17238, 19798, 18262, 21078, 21334, 22102, - 21590, 22358, 22870, 18518, 19286, 17494, 16982, 20054, - 15700, 16724, 17236, 19796, 18260, 21076, 21332, 22100, - 21588, 22356, 22868, 18516, 19284, 17492, 16980, 20052, - 15703, 16727, 17239, 19799, 18263, 21079, 21335, 22103, - 21591, 22359, 22871, 18519, 19287, 17495, 16983, 20055, - 15705, 16729, 17241, 19801, 18265, 21081, 21337, 22105, - 21593, 22361, 22873, 18521, 19289, 17497, 16985, 20057, - 15688, 16712, 17224, 19784, 18248, 21064, 21320, 22088, - 21576, 22344, 22856, 18504, 19272, 17480, 16968, 20040, - 15691, 16715, 17227, 19787, 18251, 21067, 21323, 22091, - 21579, 22347, 22859, 18507, 19275, 17483, 16971, 20043, - 15684, 16708, 17220, 19780, 18244, 21060, 21316, 22084, - 21572, 22340, 22852, 18500, 19268, 17476, 16964, 20036, - 15682, 16706, 17218, 19778, 18242, 21058, 21314, 22082, - 21570, 22338, 22850, 18498, 19266, 17474, 16962, 20034, - 15694, 16718, 17230, 19790, 18254, 21070, 21326, 22094, - 21582, 22350, 22862, 18510, 19278, 17486, 16974, 20046 - }; - - int l2 = cr->len / 2; - unsigned char *from = (unsigned char *)bam_seq(b); - uint16_t *cpi = (uint16_t *)cp; - cp[0] = 0; - for (i = 0; i < l2; i++) - cpi[i] = le_int2(code2base[from[i]]); - if ((i *= 2) < cr->len) - cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)]; - } -#else - for (i = 0; i < cr->len; i++) - cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)]; -#endif - BLOCK_SIZE(s->seqs_blk) += cr->len; - - qual = cp = (char *)bam_qual(b); - - /* Copy and parse */ - if (!(cr->flags & BAM_FUNMAP)) { - int32_t *cig_to, *cig_from; - int apos = cr->apos-1, spos = 0; - - cr->cigar = s->ncigar; - cr->ncigar = bam_cigar_len(b); - while (cr->cigar + cr->ncigar >= s->cigar_alloc) { - s->cigar_alloc = s->cigar_alloc ? s->cigar_alloc*2 : 1024; - s->cigar = realloc(s->cigar, s->cigar_alloc * sizeof(*s->cigar)); - if (!s->cigar) - return -1; - } - - cig_to = (int32_t *)s->cigar; - cig_from = (int32_t *)bam_cigar(b); - - cr->feature = 0; - cr->nfeature = 0; - for (i = 0; i < cr->ncigar; i++) { - enum cigar_op cig_op = cig_from[i] & BAM_CIGAR_MASK; - int cig_len = cig_from[i] >> BAM_CIGAR_SHIFT; - cig_to[i] = cig_from[i]; - - /* Can also generate events from here for CRAM diffs */ - - switch (cig_op) { - int l; - - // Don't trust = and X ops to be correct. - case BAM_CMATCH: - case BAM_CBASE_MATCH: - case BAM_CBASE_MISMATCH: - //fprintf(stderr, "\nBAM_CMATCH\nR: %.*s\nS: %.*s\n", - // cig_len, &ref[apos], cig_len, &seq[spos]); - l = 0; - if (!fd->no_ref && cr->len) { - int end = cig_len+apos < c->ref_end - ? cig_len : c->ref_end - apos; - char *sp = &seq[spos]; - char *rp = &ref[apos]; - char *qp = &qual[spos]; - if (end > cr->len) { - fprintf(stderr, "CIGAR and query sequence are of " - "different length\n"); - return -1; - } - for (l = 0; l < end; l++) { - if (rp[l] != sp[l]) { - if (!sp[l]) - break; - if (0 && CRAM_MAJOR_VERS(fd->version) >= 3) { - // Disabled for the time being as it doesn't - // seem to gain us much. - int ol=l; - while (l 1) { - if (cram_add_bases(fd, c, s, cr, spos+ol, - l-ol, &seq[spos+ol])) - return -1; - l--; - } else { - l = ol; - if (cram_add_substitution(fd, c, s, cr, - spos+l, sp[l], - qp[l], rp[l])) - return -1; - } - } else { - if (cram_add_substitution(fd, c, s, cr, spos+l, - sp[l], qp[l], rp[l])) - return -1; - } - } - } - spos += l; - apos += l; - } - - if (l < cig_len && cr->len) { - if (fd->no_ref) { - if (CRAM_MAJOR_VERS(fd->version) == 3) { - if (cram_add_bases(fd, c, s, cr, spos, - cig_len-l, &seq[spos])) - return -1; - spos += cig_len-l; - } else { - for (; l < cig_len && seq[spos]; l++, spos++) { - if (cram_add_base(fd, c, s, cr, spos, - seq[spos], qual[spos])) - return -1; - } - } - } else { - /* off end of sequence or non-ref based output */ - for (; l < cig_len && seq[spos]; l++, spos++) { - if (cram_add_base(fd, c, s, cr, spos, - seq[spos], qual[spos])) - return -1; - } - } - apos += cig_len; - } else if (!cr->len) { - /* Seq "*" */ - apos += cig_len; - spos += cig_len; - } - break; - - case BAM_CDEL: - if (cram_add_deletion(c, s, cr, spos, cig_len, &seq[spos])) - return -1; - apos += cig_len; - break; - - case BAM_CREF_SKIP: - if (cram_add_skip(c, s, cr, spos, cig_len, &seq[spos])) - return -1; - apos += cig_len; - break; - - case BAM_CINS: - if (cram_add_insertion(c, s, cr, spos, cig_len, - cr->len ? &seq[spos] : NULL)) - return -1; - if (fd->no_ref && cr->len) { - for (l = 0; l < cig_len; l++, spos++) { - cram_add_quality(fd, c, s, cr, spos, qual[spos]); - } - } else { - spos += cig_len; - } - break; - - case BAM_CSOFT_CLIP: - if (cram_add_softclip(c, s, cr, spos, cig_len, - cr->len ? &seq[spos] : NULL, - fd->version)) - return -1; - if (fd->no_ref && - !(cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { - if (cr->len) { - for (l = 0; l < cig_len; l++, spos++) { - cram_add_quality(fd, c, s, cr, spos, qual[spos]); - } - } else { - for (l = 0; l < cig_len; l++, spos++) { - cram_add_quality(fd, c, s, cr, spos, -1); - } - } - } else { - spos += cig_len; - } - break; - - case BAM_CHARD_CLIP: - if (cram_add_hardclip(c, s, cr, spos, cig_len, &seq[spos])) - return -1; - break; - - case BAM_CPAD: - if (cram_add_pad(c, s, cr, spos, cig_len, &seq[spos])) - return -1; - break; - } - } - if (cr->len && spos != cr->len) { - fprintf(stderr, "CIGAR and query sequence are of different " - "length\n"); - return -1; - } - fake_qual = spos; - cr->aend = fd->no_ref ? apos : MIN(apos, c->ref_end); - cram_stats_add(c->stats[DS_FN], cr->nfeature); - } else { - // Unmapped - cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES; - cr->cigar = 0; - cr->ncigar = 0; - cr->nfeature = 0; - cr->aend = cr->apos; - for (i = 0; i < cr->len; i++) - cram_stats_add(c->stats[DS_BA], seq[i]); - fake_qual = 0; - } - - /* - * Append to the qual block now. We do this here as - * cram_add_substitution() can generate BA/QS events which need to - * be in the qual block before we append the rest of the data. - */ - if (cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES) { - /* Special case of seq "*" */ - if (cr->len == 0) { - cr->len = fake_qual; - BLOCK_GROW(s->qual_blk, cr->len); - cp = (char *)BLOCK_END(s->qual_blk); - memset(cp, 255, cr->len); - } else { - BLOCK_GROW(s->qual_blk, cr->len); - cp = (char *)BLOCK_END(s->qual_blk); - char *from = (char *)&bam_qual(b)[0]; - char *to = &cp[0]; - memcpy(to, from, cr->len); - //for (i = 0; i < cr->len; i++) cp[i] = from[i]; - } - BLOCK_SIZE(s->qual_blk) += cr->len; - } else { - if (cr->len == 0) - cr->len = fake_qual >= 0 ? fake_qual : cr->aend - cr->apos + 1; - } - - cram_stats_add(c->stats[DS_RL], cr->len); - - /* Now we know apos and aend both, update mate-pair information */ - { - int new; - khint_t k; - int sec = (cr->flags & BAM_FSECONDARY) ? 1 : 0; - - //fprintf(stderr, "Checking %"PRId64"/%.*s\t", rnum, - // cr->name_len, DSTRING_STR(s->name_ds)+cr->name); - if (cr->flags & BAM_FPAIRED) { - char *key = string_ndup(s->pair_keys, - (char *)BLOCK_DATA(s->name_blk)+cr->name, - cr->name_len); - if (!key) - return -1; - - k = kh_put(m_s2i, s->pair[sec], key, &new); - if (-1 == new) - return -1; - else if (new > 0) - kh_val(s->pair[sec], k) = rnum; - } else { - new = 1; - } - - if (new == 0) { - cram_record *p = &s->crecs[kh_val(s->pair[sec], k)]; - int aleft, aright, sign; - - aleft = MIN(cr->apos, p->apos); - aright = MAX(cr->aend, p->aend); - if (cr->apos < p->apos) { - sign = 1; - } else if (cr->apos > p->apos) { - sign = -1; - } else if (cr->flags & BAM_FREAD1) { - sign = 1; - } else { - sign = -1; - } - - //fprintf(stderr, "paired %"PRId64"\n", kh_val(s->pair[sec], k)); - - // This vs p: tlen, matepos, flags - if (bam_ins_size(b) != sign*(aright-aleft+1)) - goto detached; - - if (MAX(bam_mate_pos(b)+1, 0) != p->apos) - goto detached; - - if (((bam_flag(b) & BAM_FMUNMAP) != 0) != - ((p->flags & BAM_FUNMAP) != 0)) - goto detached; - - if (((bam_flag(b) & BAM_FMREVERSE) != 0) != - ((p->flags & BAM_FREVERSE) != 0)) - goto detached; - - - // p vs this: tlen, matepos, flags - if (p->tlen != -sign*(aright-aleft+1)) - goto detached; - - if (p->mate_pos != cr->apos) - goto detached; - - if (((p->flags & BAM_FMUNMAP) != 0) != - ((p->mate_flags & CRAM_M_UNMAP) != 0)) - goto detached; - - if (((p->flags & BAM_FMREVERSE) != 0) != - ((p->mate_flags & CRAM_M_REVERSE) != 0)) - goto detached; - - // Supplementary reads are just too ill defined - if ((cr->flags & BAM_FSUPPLEMENTARY) || - (p->flags & BAM_FSUPPLEMENTARY)) - goto detached; - - /* - * The fields below are unused when encoding this read as it is - * no longer detached. In theory they may get referred to when - * processing a 3rd or 4th read in this template?, so we set them - * here just to be sure. - * - * They do not need cram_stats_add() calls those as they are - * not emitted. - */ - cr->mate_pos = p->apos; - cr->tlen = sign*(aright-aleft+1); - cr->mate_flags = - ((p->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP + - ((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE; - - // Decrement statistics aggregated earlier - cram_stats_del(c->stats[DS_NP], p->mate_pos); - cram_stats_del(c->stats[DS_MF], p->mate_flags); - cram_stats_del(c->stats[DS_TS], p->tlen); - cram_stats_del(c->stats[DS_NS], p->mate_ref_id); - - /* Similarly we could correct the p-> values too, but these will no - * longer have any code that refers back to them as the new 'p' - * for this template is our current 'cr'. - */ - //p->mate_pos = cr->apos; - //p->mate_flags = - // ((cr->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP + - // ((cr->flags & BAM_FMREVERSE) == BAM_FMREVERSE)* CRAM_M_REVERSE; - //p->tlen = p->apos - cr->aend; - - // Clear detached from cr flags - cr->cram_flags &= ~CRAM_FLAG_DETACHED; - cram_stats_add(c->stats[DS_CF], cr->cram_flags); - - // Clear detached from p flags and set downstream - cram_stats_del(c->stats[DS_CF], p->cram_flags); - p->cram_flags &= ~CRAM_FLAG_DETACHED; - p->cram_flags |= CRAM_FLAG_MATE_DOWNSTREAM; - cram_stats_add(c->stats[DS_CF], p->cram_flags); - - p->mate_line = rnum - (kh_val(s->pair[sec], k) + 1); - cram_stats_add(c->stats[DS_NF], p->mate_line); - - kh_val(s->pair[sec], k) = rnum; - } else { - detached: - //fprintf(stderr, "unpaired\n"); - - /* Derive mate flags from this flag */ - cr->mate_flags = 0; - if (bam_flag(b) & BAM_FMUNMAP) - cr->mate_flags |= CRAM_M_UNMAP; - if (bam_flag(b) & BAM_FMREVERSE) - cr->mate_flags |= CRAM_M_REVERSE; - - cram_stats_add(c->stats[DS_MF], cr->mate_flags); - - cr->mate_pos = MAX(bam_mate_pos(b)+1, 0); - cram_stats_add(c->stats[DS_NP], cr->mate_pos); - - cr->tlen = bam_ins_size(b); - cram_stats_add(c->stats[DS_TS], cr->tlen); - - cr->cram_flags |= CRAM_FLAG_DETACHED; - cram_stats_add(c->stats[DS_CF], cr->cram_flags); - cram_stats_add(c->stats[DS_NS], bam_mate_ref(b)); - } - } - - cr->mqual = bam_map_qual(b); - cram_stats_add(c->stats[DS_MQ], cr->mqual); - - cr->mate_ref_id = bam_mate_ref(b); - - if (!(bam_flag(b) & BAM_FUNMAP)) { - if (c->first_base > cr->apos) - c->first_base = cr->apos; - - if (c->last_base < cr->aend) - c->last_base = cr->aend; - } - - return 0; -} - -/* - * Write iterator: put BAM format sequences into a CRAM file. - * We buffer up a containers worth of data at a time. - * - * Returns 0 on success - * -1 on failure - */ -int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { - cram_container *c; - - if (!fd->ctr) { - fd->ctr = cram_new_container(fd->seqs_per_slice, - fd->slices_per_container); - if (!fd->ctr) - return -1; - fd->ctr->record_counter = fd->record_counter; - } - c = fd->ctr; - - if (!c->slice || c->curr_rec == c->max_rec || - (bam_ref(b) != c->curr_ref && c->curr_ref >= -1)) { - int slice_rec, curr_rec, multi_seq = fd->multi_seq == 1; - int curr_ref = c->slice ? c->curr_ref : bam_ref(b); - - - /* - * Start packing slices when we routinely have under 1/4tr full. - * - * This option isn't available if we choose to embed references - * since we can only have one per slice. - */ - if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 && - fd->last_slice && fd->last_slice < c->max_rec/4+10 && - !fd->embed_ref) { - if (fd->verbose && !c->multi_seq) - fprintf(stderr, "Multi-ref enabled for this container\n"); - multi_seq = 1; - } - - slice_rec = c->slice_rec; - curr_rec = c->curr_rec; - - if (CRAM_MAJOR_VERS(fd->version) == 1 || - c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice) { - if (NULL == (c = cram_next_container(fd, b))) { - if (fd->ctr) { - // prevent cram_close attempting to flush - cram_free_container(fd->ctr); - fd->ctr = NULL; - } - return -1; - } - } - - /* - * Due to our processing order, some things we've already done we - * cannot easily undo. So when we first notice we should be packing - * multiple sequences per container we emit the small partial - * container as-is and then start a fresh one in a different mode. - */ - if (multi_seq) { - fd->multi_seq = 1; - c->multi_seq = 1; - c->pos_sorted = 0; // required atm for multi_seq slices - - if (!c->refs_used) { - pthread_mutex_lock(&fd->ref_lock); - c->refs_used = calloc(fd->refs->nref, sizeof(int)); - pthread_mutex_unlock(&fd->ref_lock); - if (!c->refs_used) - return -1; - } - } - - fd->last_slice = curr_rec - slice_rec; - c->slice_rec = c->curr_rec; - - // Have we seen this reference before? - if (bam_ref(b) >= 0 && bam_ref(b) != curr_ref && !fd->embed_ref && - !fd->unsorted && multi_seq) { - - if (!c->refs_used) { - pthread_mutex_lock(&fd->ref_lock); - c->refs_used = calloc(fd->refs->nref, sizeof(int)); - pthread_mutex_unlock(&fd->ref_lock); - if (!c->refs_used) - return -1; - } else if (c->refs_used && c->refs_used[bam_ref(b)]) { - pthread_mutex_lock(&fd->ref_lock); - fd->unsorted = 1; - pthread_mutex_unlock(&fd->ref_lock); - fd->multi_seq = 1; - } - } - - c->curr_ref = bam_ref(b); - if (c->refs_used && c->curr_ref >= 0) c->refs_used[c->curr_ref]++; - } - - if (!c->bams) { - /* First time through, allocate a set of bam pointers */ - pthread_mutex_lock(&fd->bam_list_lock); - if (fd->bl) { - spare_bams *spare = fd->bl; - c->bams = spare->bams; - fd->bl = spare->next; - free(spare); - } else { - c->bams = calloc(c->max_c_rec, sizeof(bam_seq_t *)); - if (!c->bams) - return -1; - } - pthread_mutex_unlock(&fd->bam_list_lock); - } - - /* Copy or alloc+copy the bam record, for later encoding */ - if (c->bams[c->curr_c_rec]) - bam_copy1(c->bams[c->curr_c_rec], b); - else - c->bams[c->curr_c_rec] = bam_dup(b); - - c->curr_rec++; - c->curr_c_rec++; - fd->record_counter++; - - return 0; -}