7df6e18265341f87a69fba808aa1f92f8ebca841 markd Wed Apr 15 13:39:42 2026 -0700 move copy of htslib diff --git src/htslib/vcfutils.c src/htslib/vcfutils.c deleted file mode 100644 index 141fe0e3fef..00000000000 --- src/htslib/vcfutils.c +++ /dev/null @@ -1,691 +0,0 @@ -/* vcfutils.c -- allele-related utility functions. - - Copyright (C) 2012-2015 Genome Research Ltd. - - Author: Petr Danecek - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. */ - -#include - -#include "htslib/vcfutils.h" -#include "htslib/kbitset.h" - -int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) -{ - int i; - for (i=0; in_allele; i++) ac[i]=0; - - // Use INFO/AC,AN field only when asked - if ( which&BCF_UN_INFO ) - { - bcf_unpack(line, BCF_UN_INFO); - int an_id = bcf_hdr_id2int(header, BCF_DT_ID, "AN"); - int ac_id = bcf_hdr_id2int(header, BCF_DT_ID, "AC"); - int i, an=-1, ac_len=0, ac_type=0; - uint8_t *ac_ptr=NULL; - if ( an_id>=0 && ac_id>=0 ) - { - for (i=0; in_info; i++) - { - bcf_info_t *z = &line->d.info[i]; - if ( z->key == an_id ) an = z->v1.i; - else if ( z->key == ac_id ) { ac_ptr = z->vptr; ac_len = z->len; ac_type = z->type; } - } - } - if ( an>=0 && ac_ptr ) - { - int nac = 0; - #define BRANCH_INT(type_t) { \ - type_t *p = (type_t *) ac_ptr; \ - for (i=0; iid[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; - } - #undef BRANCH_INT - if ( anid[BCF_DT_CTG][line->rid].key, line->pos+1); - exit(1); - } - ac[0] = an - nac; - return 1; - } - } - - // Split genotype fields only when asked - if ( which&BCF_UN_FMT ) - { - int i, gt_id = bcf_hdr_id2int(header,BCF_DT_ID,"GT"); - if ( gt_id<0 ) return 0; - bcf_unpack(line, BCF_UN_FMT); - bcf_fmt_t *fmt_gt = NULL; - for (i=0; i<(int)line->n_fmt; i++) - if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; } - if ( !fmt_gt ) return 0; - #define BRANCH_INT(type_t,vector_end) { \ - for (i=0; in_sample; i++) \ - { \ - type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \ - int ial; \ - for (ial=0; ialn; ial++) \ - { \ - if ( p[ial]==vector_end ) break; /* smaller ploidy */ \ - if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \ - if ( p[ial]>>1 > line->n_allele ) \ - { \ - fprintf(stderr,"[E::%s] Incorrect allele (\"%d\") in %s at %s:%d\n", __func__,(p[ial]>>1)-1, header->samples[i],header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \ - exit(1); \ - } \ - ac[(p[ial]>>1)-1]++; \ - } \ - } \ - } - switch (fmt_gt->type) { - case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; - case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; - default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; - } - #undef BRANCH_INT - return 1; - } - return 0; -} - -int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *_ial, int *_jal) -{ - int i, nals = 0, has_ref = 0, has_alt = 0, ial = 0, jal = 0; - #define BRANCH_INT(type_t,vector_end) { \ - type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \ - for (i=0; in; i++) \ - { \ - if ( p[i] == vector_end ) break; /* smaller ploidy */ \ - if ( bcf_gt_is_missing(p[i]) ) return GT_UNKN; /* missing allele */ \ - int tmp = p[i]>>1; \ - if ( tmp>1 ) \ - { \ - if ( !ial ) { ial = tmp; has_alt = 1; } \ - else if ( tmp!=ial ) \ - { \ - if ( tmptype) { - case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; - case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; - default: fprintf(stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break; - } - #undef BRANCH_INT - - if ( _ial ) *_ial = ial>0 ? ial-1 : ial; - if ( _jal ) *_jal = jal>0 ? jal-1 : jal; - if ( !nals ) return GT_UNKN; - if ( nals==1 ) - return has_ref ? GT_HAPL_R : GT_HAPL_A; - if ( !has_ref ) - return has_alt==1 ? GT_HOM_AA : GT_HET_AA; - if ( !has_alt ) - return GT_HOM_RR; - return GT_HET_RA; -} - -int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line) -{ - int i; - bcf_fmt_t *gt = bcf_get_fmt(header, line, "GT"); - if ( !gt ) return 0; - - int *ac = (int*) calloc(line->n_allele,sizeof(int)); - - // check if all alleles are populated - #define BRANCH(type_t,vector_end) { \ - for (i=0; in_sample; i++) \ - { \ - type_t *p = (type_t*) (gt->p + i*gt->size); \ - int ial; \ - for (ial=0; ialn; ial++) \ - { \ - if ( p[ial]==vector_end ) break; /* smaller ploidy */ \ - if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \ - if ( (p[ial]>>1)-1 >= line->n_allele ) { free(ac); return -1; } \ - ac[(p[ial]>>1)-1]++; \ - } \ - } \ - } - switch (gt->type) { - case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_vector_end); break; - case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break; - default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; - } - #undef BRANCH - - int nrm = 0; - kbitset_t *rm_set = kbs_init(line->n_allele); - for (i=1; in_allele; i++) - { - if ( !ac[i] ) { kbs_insert(rm_set, i); nrm++; } - } - free(ac); - - if ( nrm ) bcf_remove_allele_set(header, line, rm_set); - kbs_destroy(rm_set); - return nrm; -} - -void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask) -{ - int i; - kbitset_t *rm_set = kbs_init(line->n_allele); - for (i=1; in_allele; i++) - if ( rm_mask & 1<n_allele, sizeof(int)); - - // create map of indexes from old to new ALT numbering and modify ALT - kstring_t str = {0,0,0}; - kputs(line->d.allele[0], &str); - - int nrm = 0, i,j; // i: ori alleles, j: new alleles - for (i=1, j=1; in_allele; i++) - { - if ( kbs_exists(rm_set, i) ) - { - // remove this allele - line->d.allele[i] = NULL; - nrm++; - continue; - } - kputc(',', &str); - kputs(line->d.allele[i], &str); - map[i] = j; - j++; - } - if ( !nrm ) { free(map); free(str.s); return; } - - int nR_ori = line->n_allele; - int nR_new = line->n_allele-nrm; - assert(nR_new > 0); // should not be able to remove reference allele - int nA_ori = nR_ori-1; - int nA_new = nR_new-1; - - int nG_ori = nR_ori*(nR_ori + 1)/2; - int nG_new = nR_new*(nR_new + 1)/2; - - bcf_update_alleles_str(header, line, str.s); - - // remove from Number=G, Number=R and Number=A INFO fields. - uint8_t *dat = NULL; - int mdat = 0, ndat = 0, mdat_bytes = 0, nret; - for (i=0; in_info; i++) - { - bcf_info_t *info = &line->d.info[i]; - int vlen = bcf_hdr_id2length(header,BCF_HL_INFO,info->key); - - if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change - - int type = bcf_hdr_id2type(header,BCF_HL_INFO,info->key); - if ( type==BCF_HT_FLAG ) continue; - int size = 1; - if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4; - - mdat = mdat_bytes / size; - nret = bcf_get_info_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void**)&dat, &mdat, type); - mdat_bytes = mdat * size; - if ( nret<0 ) - { - fprintf(stderr,"[%s:%d %s] Could not access INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, - bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); - exit(1); - } - if ( type==BCF_HT_STR ) - { - str.l = 0; - char *ss = (char*) dat, *se = (char*) dat; - if ( vlen==BCF_VL_A || vlen==BCF_VL_R ) - { - int nexp, inc = 0; - if ( vlen==BCF_VL_A ) - { - nexp = nA_ori; - inc = 1; - } - else - nexp = nR_ori; - for (j=0; jkey), (void*)str.s, str.l, type); - if ( nret<0 ) - { - fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, - bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); - exit(1); - } - continue; - } - - if ( vlen==BCF_VL_A || vlen==BCF_VL_R ) - { - int inc = 0, ntop; - if ( vlen==BCF_VL_A ) - { - assert( nret==nA_ori ); - ntop = nA_ori; - ndat = nA_new; - inc = 1; - } - else - { - assert( nret==nR_ori ); - ntop = nR_ori; - ndat = nR_new; - } - int k = 0; - - #define BRANCH(type_t,is_vector_end) \ - { \ - type_t *ptr = (type_t*) dat; \ - int size = sizeof(type_t); \ - for (j=0; jkey), (void*)dat, ndat, type); - if ( nret<0 ) - { - fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, - bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); - exit(1); - } - } - - // Update GT fields, the allele indexes might have changed - for (i=1; in_allele; i++) if ( map[i]!=i ) break; - if ( in_allele ) - { - mdat = mdat_bytes / 4; // sizeof(int32_t) - nret = bcf_get_genotypes(header,line,(void**)&dat,&mdat); - mdat_bytes = mdat * 4; - if ( nret>0 ) - { - nret /= line->n_sample; - int32_t *ptr = (int32_t*) dat; - for (i=0; in_sample; i++) - { - for (j=0; j=0 ); - ptr[j] = (map[al]+1)<<1 | (ptr[j]&1); - } - ptr += nret; - } - bcf_update_genotypes(header, line, (void*)dat, nret*line->n_sample); - } - } - - // Remove from Number=G, Number=R and Number=A FORMAT fields. - // Assuming haploid or diploid GTs - for (i=0; in_fmt; i++) - { - bcf_fmt_t *fmt = &line->d.fmt[i]; - int vlen = bcf_hdr_id2length(header,BCF_HL_FMT,fmt->id); - - if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change - - int type = bcf_hdr_id2type(header,BCF_HL_FMT,fmt->id); - if ( type==BCF_HT_FLAG ) continue; - - int size = 1; - if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4; - - mdat = mdat_bytes / size; - nret = bcf_get_format_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void**)&dat, &mdat, type); - mdat_bytes = mdat * size; - if ( nret<0 ) - { - fprintf(stderr,"[%s:%d %s] Could not access FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, - bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); - exit(1); - } - - if ( type==BCF_HT_STR ) - { - int size = nret/line->n_sample; // number of bytes per sample - str.l = 0; - if ( vlen==BCF_VL_A || vlen==BCF_VL_R ) - { - int nexp, inc = 0; - if ( vlen==BCF_VL_A ) - { - nexp = nA_ori; - inc = 1; - } - else - nexp = nR_ori; - for (j=0; jn_sample; j++) - { - char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss; - int k_src = 0, k_dst = 0, l = str.l; - for (k_src=0; k_src=se || !*ptr) break; - while ( ptrn_sample; j++) - { - char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss; - int k_src = 0, k_dst = 0, l = str.l; - int nexp = 0; // diploid or haploid? - while ( ptr=se || !*ptr ) break; - while ( ptr=se || !*ptr ) break; - } - } - else // haploid - { - for (k_src=0; k_src=se || !*ptr ) break; - while ( ptrid), (void*)str.s, str.l, type); - if ( nret<0 ) - { - fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, - bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); - exit(1); - } - continue; - } - - int nori = nret / line->n_sample; - if ( vlen==BCF_VL_A || vlen==BCF_VL_R || (vlen==BCF_VL_G && nori==nR_ori) ) // Number=A, R or haploid Number=G - { - int inc = 0, nnew; - if ( vlen==BCF_VL_A ) - { - assert( nori==nA_ori ); // todo: will fail if all values are missing - ndat = nA_new*line->n_sample; - nnew = nA_new; - inc = 1; - } - else - { - assert( nori==nR_ori ); // todo: will fail if all values are missing - ndat = nR_new*line->n_sample; - nnew = nR_new; - } - - #define BRANCH(type_t,is_vector_end) \ - { \ - for (j=0; jn_sample; j++) \ - { \ - type_t *ptr_src = ((type_t*)dat) + j*nori; \ - type_t *ptr_dst = ((type_t*)dat) + j*nnew; \ - int size = sizeof(type_t); \ - int k_src, k_dst = 0; \ - for (k_src=0; k_srcn_sample; - - #define BRANCH(type_t,is_vector_end) \ - { \ - for (j=0; jn_sample; j++) \ - { \ - type_t *ptr_src = ((type_t*)dat) + j*nori; \ - type_t *ptr_dst = ((type_t*)dat) + j*nG_new; \ - int size = sizeof(type_t); \ - int ia, ib, k_dst = 0, k_src; \ - int nset = 0; /* haploid or diploid? */ \ - for (k_src=0; k_srcid), (void*)dat, ndat, type); - if ( nret<0 ) - { - fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, - bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); - exit(1); - } - } - free(dat); - free(str.s); - free(map); -} -