7df6e18265341f87a69fba808aa1f92f8ebca841 markd Wed Apr 15 13:39:42 2026 -0700 move copy of htslib diff --git src/htslib/tabix.c src/htslib/tabix.c deleted file mode 100644 index 6859068b03b..00000000000 --- src/htslib/tabix.c +++ /dev/null @@ -1,538 +0,0 @@ -/* tabix.c -- Generic indexer for TAB-delimited genome position files. - - Copyright (C) 2009-2011 Broad Institute. - Copyright (C) 2010-2012, 2014, 2015 Genome Research Ltd. - - Author: Heng Li - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include "htslib/tbx.h" -#include "htslib/sam.h" -#include "htslib/vcf.h" -#include "htslib/kseq.h" -#include "htslib/bgzf.h" -#include "htslib/hts.h" -#include "htslib/regidx.h" - -typedef struct -{ - char *regions_fname, *targets_fname; - int print_header, header_only; -} -args_t; - -static void error(const char *format, ...) -{ - va_list ap; - va_start(ap, format); - vfprintf(stderr, format, ap); - va_end(ap); - exit(EXIT_FAILURE); -} - -#define IS_GFF (1<<0) -#define IS_BED (1<<1) -#define IS_SAM (1<<2) -#define IS_VCF (1<<3) -#define IS_BCF (1<<4) -#define IS_BAM (1<<5) -#define IS_CRAM (1<<6) -#define IS_TXT (IS_GFF|IS_BED|IS_SAM|IS_VCF) - -int file_type(const char *fname) -{ - int l = strlen(fname); - int strcasecmp(const char *s1, const char *s2); - if (l>=7 && strcasecmp(fname+l-7, ".gff.gz") == 0) return IS_GFF; - else if (l>=7 && strcasecmp(fname+l-7, ".bed.gz") == 0) return IS_BED; - else if (l>=7 && strcasecmp(fname+l-7, ".sam.gz") == 0) return IS_SAM; - else if (l>=7 && strcasecmp(fname+l-7, ".vcf.gz") == 0) return IS_VCF; - else if (l>=4 && strcasecmp(fname+l-4, ".bcf") == 0) return IS_BCF; - else if (l>=4 && strcasecmp(fname+l-4, ".bam") == 0) return IS_BAM; - else if (l>=4 && strcasecmp(fname+l-5, ".cram") == 0) return IS_CRAM; - - htsFile *fp = hts_open(fname,"r"); - enum htsExactFormat format = fp->format.format; - hts_close(fp); - if ( format == bcf ) return IS_BCF; - if ( format == bam ) return IS_BAM; - if ( format == cram ) return IS_CRAM; - if ( format == vcf ) return IS_VCF; - - return 0; -} - -static char **parse_regions(char *regions_fname, char **argv, int argc, int *nregs) -{ - kstring_t str = {0,0,0}; - int iseq = 0, ireg = 0; - char **regs = NULL; - *nregs = argc; - - if ( regions_fname ) - { - // improve me: this is a too heavy machinery for parsing regions... - - regidx_t *idx = regidx_init(regions_fname, NULL, NULL, 0, NULL); - if ( !idx ) error("Could not read %s\n", regions_fname); - - (*nregs) += regidx_nregs(idx); - regs = (char**) malloc(sizeof(char*)*(*nregs)); - - int nseq; - char **seqs = regidx_seq_names(idx, &nseq); - for (iseq=0; iseqformat; - - regidx_t *reg_idx = NULL; - if ( args->targets_fname ) - { - reg_idx = regidx_init(args->targets_fname, NULL, NULL, 0, NULL); - if ( !reg_idx ) error("Could not read %s\n", args->targets_fname); - } - - if ( format == bcf ) - { - htsFile *out = hts_open("-","w"); - if ( !out ) error("Could not open stdout\n", fname); - hts_idx_t *idx = bcf_index_load(fname); - if ( !idx ) error("Could not load .csi index of %s\n", fname); - bcf_hdr_t *hdr = bcf_hdr_read(fp); - if ( !hdr ) error("Could not read the header: %s\n", fname); - if ( args->print_header ) - bcf_hdr_write(out,hdr); - if ( !args->header_only ) - { - bcf1_t *rec = bcf_init(); - for (i=0; i=0 ) - { - if ( reg_idx && !regidx_overlap(reg_idx, bcf_seqname(hdr,rec),rec->pos,rec->pos+rec->rlen-1, NULL) ) continue; - bcf_write(out,hdr,rec); - } - tbx_itr_destroy(itr); - } - bcf_destroy(rec); - } - if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n"); - bcf_hdr_destroy(hdr); - hts_idx_destroy(idx); - } - else if ( format==vcf || format==sam || format==unknown_format ) - { - tbx_t *tbx = tbx_index_load(fname); - if ( !tbx ) error("Could not load .tbi/.csi index of %s\n", fname); - kstring_t str = {0,0,0}; - if ( args->print_header ) - { - while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 ) - { - if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break; - puts(str.s); - } - } - if ( !args->header_only ) - { - int nseq; - const char **seq = NULL; - if ( reg_idx ) seq = tbx_seqnames(tbx, &nseq); - for (i=0; i= 0) - { - if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end, NULL) ) continue; - puts(str.s); - } - tbx_itr_destroy(itr); - } - free(seq); - } - free(str.s); - tbx_destroy(tbx); - } - else if ( format==bam ) - error("Please use \"samtools view\" for querying BAM files.\n"); - - if ( reg_idx ) regidx_destroy(reg_idx); - if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname); - - for (i=0; iblock_length ) return -1; - - char *buffer = fp->uncompressed_block; - int skip_until = 0; - - // Skip the header: find out the position of the data block - if ( buffer[0]==conf->meta_char ) - { - skip_until = 1; - while (1) - { - if ( buffer[skip_until]=='\n' ) - { - skip_until++; - if ( skip_until>=fp->block_length ) - { - if ( bgzf_read_block(fp) != 0 || !fp->block_length ) error("FIXME: No body in the file: %s\n", fname); - skip_until = 0; - } - // The header has finished - if ( buffer[skip_until]!=conf->meta_char ) break; - } - skip_until++; - if ( skip_until>=fp->block_length ) - { - if (bgzf_read_block(fp) != 0 || !fp->block_length) error("FIXME: No body in the file: %s\n", fname); - skip_until = 0; - } - } - } - - // Output the new header - FILE *hdr = fopen(header,"r"); - if ( !hdr ) error("%s: %s", header,strerror(errno)); - const size_t page_size = 32768; - char *buf = malloc(page_size); - BGZF *bgzf_out = bgzf_open("-", "w"); - ssize_t nread; - while ( (nread=fread(buf,1,page_size-1,hdr))>0 ) - { - if ( nreaderrcode); - } - if ( fclose(hdr) ) error("close failed: %s\n", header); - - // Output all remainig data read with the header block - if ( fp->block_length - skip_until > 0 ) - { - if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0) error("Error: %d\n",fp->errcode); - } - if (bgzf_flush(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode); - - while (1) - { - nread = bgzf_raw_read(fp, buf, page_size); - if ( nread<=0 ) break; - - int count = bgzf_raw_write(bgzf_out, buf, nread); - if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread); - } - if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode); - if (bgzf_close(fp) < 0) error("Error: %d\n",fp->errcode); - free(buf); - } - else - error("todo: reheader BCF, BAM\n"); // BCF is difficult, records contain pointers to the header. - return 0; -} - -static int usage(void) -{ - fprintf(stderr, "\n"); - fprintf(stderr, "Version: %s\n", hts_version()); - fprintf(stderr, "Usage: tabix [OPTIONS] [FILE] [REGION [...]]\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "Indexing Options:\n"); - fprintf(stderr, " -0, --zero-based coordinates are zero-based\n"); - fprintf(stderr, " -b, --begin INT column number for region start [4]\n"); - fprintf(stderr, " -c, --comment CHAR skip comment lines starting with CHAR [null]\n"); - fprintf(stderr, " -C, --csi generate CSI index for VCF (default is TBI)\n"); - fprintf(stderr, " -e, --end INT column number for region end (if no end, set INT to -b) [5]\n"); - fprintf(stderr, " -f, --force overwrite existing index without asking\n"); - fprintf(stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n"); - fprintf(stderr, " -p, --preset STR gff, bed, sam, vcf\n"); - fprintf(stderr, " -s, --sequence INT column number for sequence names (suppressed by -p) [1]\n"); - fprintf(stderr, " -S, --skip-lines INT skip first INT lines [0]\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "Querying and other options:\n"); - fprintf(stderr, " -h, --print-header print also the header lines\n"); - fprintf(stderr, " -H, --only-header print only the header lines\n"); - fprintf(stderr, " -l, --list-chroms list chromosome names\n"); - fprintf(stderr, " -r, --reheader FILE replace the header with the content of FILE\n"); - fprintf(stderr, " -R, --regions FILE restrict to regions listed in the file\n"); - fprintf(stderr, " -T, --targets FILE similar to -R but streams rather than index-jumps\n"); - fprintf(stderr, "\n"); - return 1; -} - -int main(int argc, char *argv[]) -{ - int c, min_shift = 0, is_force = 0, list_chroms = 0, do_csi = 0; - tbx_conf_t conf = tbx_conf_gff, *conf_ptr = NULL; - char *reheader = NULL; - args_t args; - memset(&args,0,sizeof(args_t)); - - static struct option loptions[] = - { - {"help",0,0,'h'}, - {"regions",1,0,'R'}, - {"targets",1,0,'T'}, - {"csi",0,0,'C'}, - {"zero-based",0,0,'0'}, - {"print-header",0,0,'h'}, - {"only-header",0,0,'H'}, - {"begin",1,0,'b'}, - {"comment",1,0,'c'}, - {"end",1,0,'e'}, - {"force",0,0,'f'}, - {"preset",1,0,'p'}, - {"sequence",1,0,'s'}, - {"skip-lines",1,0,'S'}, - {"list-chroms",0,0,'l'}, - {"reheader",1,0,'r'}, - {0,0,0,0} - }; - - char *tmp; - while ((c = getopt_long(argc, argv, "hH?0b:c:e:fm:p:s:S:lr:CR:T:", loptions,NULL)) >= 0) - { - switch (c) - { - case 'R': args.regions_fname = optarg; break; - case 'T': args.targets_fname = optarg; break; - case 'C': do_csi = 1; break; - case 'r': reheader = optarg; break; - case 'h': args.print_header = 1; break; - case 'H': args.print_header = 1; args.header_only = 1; break; - case 'l': list_chroms = 1; break; - case '0': conf.preset |= TBX_UCSC; break; - case 'b': - conf.bc = strtol(optarg,&tmp,10); - if ( *tmp ) error("Could not parse argument: -b %s\n", optarg); - break; - case 'e': - conf.ec = strtol(optarg,&tmp,10); - if ( *tmp ) error("Could not parse argument: -e %s\n", optarg); - break; - case 'c': conf.meta_char = *optarg; break; - case 'f': is_force = 1; break; - case 'm': - min_shift = strtol(optarg,&tmp,10); - if ( *tmp ) error("Could not parse argument: -m %s\n", optarg); - break; - case 'p': - if (strcmp(optarg, "gff") == 0) conf_ptr = &tbx_conf_gff; - else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed; - else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam; - else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf; - else if (strcmp(optarg, "bcf") == 0) ; // bcf is autodetected, preset is not needed - else if (strcmp(optarg, "bam") == 0) ; // same as bcf - else error("The preset string not recognised: '%s'\n", optarg); - break; - case 's': - conf.sc = strtol(optarg,&tmp,10); - if ( *tmp ) error("Could not parse argument: -s %s\n", optarg); - break; - case 'S': - conf.line_skip = strtol(optarg,&tmp,10); - if ( *tmp ) error("Could not parse argument: -S %s\n", optarg); - break; - default: return usage(); - } - } - - if ( optind==argc ) return usage(); - - if ( list_chroms ) - return query_chroms(argv[optind]); - - if ( argc > optind+1 || args.header_only || args.regions_fname || args.targets_fname ) - { - int nregs = 0; - char **regs = NULL; - if ( !args.header_only ) - regs = parse_regions(args.regions_fname, argv+optind+1, argc-optind-1, &nregs); - return query_regions(&args, argv[optind], regs, nregs); - } - - char *fname = argv[optind]; - int ftype = file_type(fname); - if ( !conf_ptr ) // no preset given - { - if ( ftype==IS_GFF ) conf_ptr = &tbx_conf_gff; - else if ( ftype==IS_BED ) conf_ptr = &tbx_conf_bed; - else if ( ftype==IS_SAM ) conf_ptr = &tbx_conf_sam; - else if ( ftype==IS_VCF ) - { - conf_ptr = &tbx_conf_vcf; - if ( !min_shift && do_csi ) min_shift = 14; - } - else if ( ftype==IS_BCF ) - { - if ( !min_shift ) min_shift = 14; - } - else if ( ftype==IS_BAM ) - { - if ( !min_shift ) min_shift = 14; - } - } - if ( do_csi ) - { - if ( !min_shift ) min_shift = 14; - min_shift *= do_csi; // positive for CSIv2, negative for CSIv1 - } - if ( min_shift!=0 && !do_csi ) do_csi = 1; - - if ( reheader ) - return reheader_file(fname, reheader, ftype, conf_ptr); - - if ( conf_ptr ) - conf = *conf_ptr; - - char *suffix = ".tbi"; - if ( do_csi ) suffix = ".csi"; - else if ( ftype==IS_BAM ) suffix = ".bai"; - else if ( ftype==IS_CRAM ) suffix = ".crai"; - - char *idx_fname = calloc(strlen(fname) + 5, 1); - strcat(strcpy(idx_fname, fname), suffix); - - struct stat stat_tbi, stat_file; - if ( !is_force && stat(idx_fname, &stat_tbi)==0 ) - { - // Before complaining about existing index, check if the VCF file isn't - // newer. This is a common source of errors, people tend not to notice - // that tabix failed - stat(fname, &stat_file); - if ( stat_file.st_mtime <= stat_tbi.st_mtime ) - error("[tabix] the index file exists. Please use '-f' to overwrite.\n"); - } - free(idx_fname); - - if ( ftype==IS_CRAM ) - { - if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname); - return 0; - } - else if ( do_csi ) - { - if ( ftype==IS_BCF ) - { - if ( bcf_index_build(fname, min_shift)!=0 ) error("bcf_index_build failed: %s\n", fname); - return 0; - } - if ( ftype==IS_BAM ) - { - if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname); - return 0; - } - if ( tbx_index_build(fname, min_shift, &conf)!=0 ) error("tbx_index_build failed: %s\n", fname); - return 0; - } - else // TBI index - { - if ( tbx_index_build(fname, min_shift, &conf) ) error("tbx_index_build failed: %s\n", fname); - return 0; - } - return 0; -}