b29f1fbf93c9df5aa65fed050d9f8132dc6a2c73
braney
  Tue Nov 24 14:20:10 2015 -0800
support htslib for sam/bam/cram/tabix support  #14717

diff --git src/lib/bamFile.c src/lib/bamFile.c
index 1e5f58f..c050b22 100644
--- src/lib/bamFile.c
+++ src/lib/bamFile.c
@@ -2,37 +2,47 @@
 
 /* Copyright (C) 2014 The Regents of the University of California 
  * See README in this or parent directory for licensing information. */
 
 #include "common.h"
 #include "portable.h"
 #include "bamFile.h"
 #ifdef USE_BAM
 #include "htmshell.h"
 #include "udc.h"
 
 #ifdef KNETFILE_HOOKS
 // If KNETFILE_HOOKS is used (as recommended!), then we can simply call bam_index_load
 // without worrying about the samtools lib creating local cache files in cgi-bin:
 
+#ifdef USE_HTS
+static bam_index_t *bamOpenIdx(samfile_t *sam, char *fileOrUrl)
+/* If fileOrUrl has a valid accompanying .bai file, parse and return the index;
+ * otherwise return NULL. */
+{
+bam_index_t *idx = sam_index_load(sam, fileOrUrl);
+return idx;
+}
+#else
 static bam_index_t *bamOpenIdx(char *fileOrUrl)
 /* If fileOrUrl has a valid accompanying .bai file, parse and return the index;
  * otherwise return NULL. */
 {
 bam_index_t *idx = bam_index_load(fileOrUrl);
 return idx;
 }
+#endif
 
 #else// no KNETFILE_HOOKS
 // Oh well.  The unmodified samtools lib downloads .bai files into the current
 // working directory, which is cgi-bin -- not good.  So we need to temporarily
 // change to a trash directory, let samtools download there, then pop back to
 // cgi-bin.
 
 static char *getSamDir()
 /* Return the name of a trash dir for samtools to run in (it creates files in current dir)
  * and make sure the directory exists. */
 {
 static char *samDir = NULL;
 char *dirName = "samtools";
 if (samDir == NULL)
     {
@@ -68,76 +78,88 @@
 /* Free unless already NULL. */
 {
 if (pIdx != NULL && *pIdx != NULL)
     {
     free(*pIdx); // Not freeMem, freez etc -- sam just uses malloc/calloc.
     *pIdx = NULL;
     }
 }
 
 boolean bamFileExists(char *fileOrUrl)
 /* Return TRUE if we can successfully open the bam file and its index file.
  * NOTE: this doesn't give enough diagnostics */
 {
 char *bamFileName = fileOrUrl;
 samfile_t *fh = samopen(bamFileName, "rb", NULL);
+#ifdef USE_HTS
+if (fh != NULL)
+    {
+    bam_index_t *idx = bamOpenIdx(fh, bamFileName);
+#else
 // Check both fh and fh->header; non-NULL fh can have NULL header if header doesn't parse!
 if (fh != NULL && fh->header != NULL)
     {
     bam_index_t *idx = bamOpenIdx(bamFileName);
+#endif
     samclose(fh);
     if (idx == NULL)
 	{
 	warn("bamFileExists: failed to read index corresponding to %s", bamFileName);
 	return FALSE;
 	}
     bamCloseIdx(&idx);
     bamClose(&fh);
     return TRUE;
     }
 return FALSE;
 }
 
 samfile_t *bamOpen(char *fileOrUrl, char **retBamFileName)
 /* Return an open bam file or errAbort (should be named bamMustOpen).
  * Return parameter if NON-null will return the file name (vestigial; long ago
  * there was a plan to use udcFuse filenames instead of URLs) */
 {
 char *bamFileName = fileOrUrl;
 if (retBamFileName != NULL)
     *retBamFileName = bamFileName;
 
 #ifdef BAM_VERSION
 // suppress too verbose messages in samtools >= 0.1.18; see redmine #6491
 // This variable didn't exist in older versions of samtools (where BAM_VERSION wasn't defined).
 bam_verbose = 1;
 #endif
 
 samfile_t *fh = samopen(bamFileName, "rb", NULL);
 // Check both fh and fh->header; non-NULL fh can have NULL header if header doesn't parse!
+#ifdef USE_HTS
+if (fh == NULL)
+#else
 if (fh == NULL || fh->header == NULL)
+#endif
     {
     boolean usingUrl = (strstr(fileOrUrl, "tp://") || strstr(fileOrUrl, "https://"));
     struct dyString *urlWarning = dyStringNew(0);
     if (usingUrl && fh == NULL)
 	{
 	dyStringAppend(urlWarning,
 		       ". If you are able to access the URL with your web browser, "
 		       "please try reloading this page.");
 	}
+#ifndef USE_HTS
     else if (fh != NULL && fh->header == NULL)
 	dyStringAppend(urlWarning, ": parser error while reading the file header.");
+#endif
     errAbort("Failed to open %s%s", fileOrUrl, urlWarning->string);
     }
 return fh;
 }
 
 samfile_t *bamMustOpenLocal(char *fileName, char *mode, void *extraHeader)
 /* Open up sam or bam file or die trying.  The mode parameter is 
  *    "r" - open SAM to read
  *    "rb" - open BAM to read
  *    "w" - open SAM to write
  *    "wb" - open BAM to write
  * The extraHeader is generally NULL in the read case, and the write case
  * contains a pointer to a bam_header_t with information about the header.
  * The implementation is just a wrapper around samopen from the samtools library
  * that aborts with error message if there's a problem with the open. */
@@ -151,74 +173,108 @@
 void bamClose(samfile_t **pSamFile)
 /* Close down a samfile_t */
 {
 if (pSamFile != NULL)
     {
     samclose(*pSamFile);
     *pSamFile = NULL;
     }
 }
 
 void bamFileAndIndexMustExist(char *fileOrUrl)
 /* Open both a bam file and its accompanying index or errAbort; this is what it
  * takes for diagnostic info to propagate up through errCatches in calling code. */
 {
 samfile_t *bamF = bamOpen(fileOrUrl, NULL);
+#ifdef USE_HTS
+bam_index_t *idx = bamOpenIdx(bamF, fileOrUrl);
+#else
 bam_index_t *idx = bamOpenIdx(fileOrUrl);
+#endif
 if (idx == NULL)
     errAbort("failed to read index file (.bai) corresponding to %s", fileOrUrl);
 bamCloseIdx(&idx);
 bamClose(&bamF);
 }
 
+#ifdef USE_HTS
+void bamFetchAlreadyOpen(samfile_t *samfile, bam_hdr_t *header,  bam_index_t *idx, char *bamFileName, 
+#else
 void bamFetchAlreadyOpen(samfile_t *samfile, bam_index_t *idx, char *bamFileName, 
+#endif
 			 char *position, bam_fetch_f callbackFunc, void *callbackData)
 /* With the open bam file, return items the same way with the callbacks as with bamFetch() */
 /* except in this case use an already-open bam file and index (use bam_index_load and free() for */
 /* the index). It seems a little strange to pass the filename in with the open bam, but */
 /* it's just used to report errors. */
 {
+#ifdef USE_HTS
+bam1_t *b;
+AllocVar(b);
+hts_itr_t *iter = sam_itr_querys(idx, header, position);
+if (iter == NULL && startsWith("chr", position))
+    iter = sam_itr_querys(idx, header, position + strlen("chr"));
+
+if (iter == NULL)
+    return;
+int result;
+while ((result = sam_itr_next(samfile, iter, b)) >= 0) 
+    callbackFunc(b, callbackData, header);
+
+#else
 int chromId, start, end;
 int ret = bam_parse_region(samfile->header, position, &chromId, &start, &end);
 if (ret != 0 && startsWith("chr", position))
     ret = bam_parse_region(samfile->header, position+strlen("chr"), &chromId, &start, &end);
 if (ret != 0)
     // If the bam file does not cover the current chromosome, OK
     return;
 ret = bam_fetch(samfile->x.bam, idx, chromId, start, end, callbackData, callbackFunc);
 if (ret != 0)
     warn("bam_fetch(%s, %s (chromId=%d) failed (%d)", bamFileName, position, chromId, ret);
+#endif
 }
 
 void bamFetch(char *fileOrUrl, char *position, bam_fetch_f callbackFunc, void *callbackData,
 		 samfile_t **pSamFile)
 /* Open the .bam file, fetch items in the seq:start-end position range,
  * and call callbackFunc on each bam item retrieved from the file plus callbackData.
  * This handles BAM files with "chr"-less sequence names, e.g. from Ensembl. 
  * The pSamFile parameter is optional.  If non-NULL it will be filled in, just for
  * the benefit of the callback function, with the open samFile.  */
 {
 char *bamFileName = NULL;
 samfile_t *fh = bamOpen(fileOrUrl, &bamFileName);
+#ifdef USE_HTS
+bam_hdr_t *header = sam_hdr_read(fh);
+if (pSamFile != NULL)
+    *pSamFile = fh;
+bam_index_t *idx = bamOpenIdx(fh, bamFileName);
+#else
 if (pSamFile != NULL)
     *pSamFile = fh;
 bam_index_t *idx = bamOpenIdx(bamFileName);
+#endif
 if (idx == NULL)
     warn("bam_index_load(%s) failed.", bamFileName);
 else
     {
+#ifdef USE_HTS
+    bamFetchAlreadyOpen(fh, header, idx, bamFileName, position, callbackFunc, callbackData);
+#else
     bamFetchAlreadyOpen(fh, idx, bamFileName, position, callbackFunc, callbackData);
+#endif
     bamCloseIdx(&idx);
     }
 bamClose(&fh);
 }
 
 boolean bamIsRc(const bam1_t *bam)
 /* Return TRUE if alignment is on - strand. */
 {
 const bam1_core_t *core = &bam->core;
 return (core->flag & BAM_FREVERSE);
 }
 
 void bamGetSoftClipping(const bam1_t *bam, int *retLow, int *retHigh, int *retClippedQLen)
 /* If retLow is non-NULL, set it to the number of "soft-clipped" (skipped) bases at
  * the beginning of the query sequence and quality; likewise for retHigh at end.
@@ -540,30 +596,31 @@
     else if (type == 'c') { dyStringPrintf(dy, "%d", *s); ++s; }
     else if (type == 'S') { dyStringPrintf(dy, "%u", *(uint16_t*)s); s += 2; }
     else if (type == 's') { dyStringPrintf(dy, "%d", *(int16_t*)s); s += 2; }
     else if (type == 'I') { dyStringPrintf(dy, "%u", *(uint32_t*)s); s += 4; }
     else if (type == 'i') { dyStringPrintf(dy, "%d", *(int32_t*)s); s += 4; }
     else if (type == 'f') { dyStringPrintf(dy, "%g", *(float*)s); s += 4; }
     else if (type == 'd') { dyStringPrintf(dy, "%lg", *(double*)s); s += 8; }
     else if (type == 'Z' || type == 'H')
 	{
 	dyStringAppend(dy, (char *)s);
 	s += strlen((char *)s) + 1;
 	}
     }
 }
 
+#ifndef USE_HTS
 struct bamChromInfo *bamChromList(samfile_t *fh)
 {
 /* Return list of chromosomes from bam header. We make no attempty to normalize chromosome names to UCSC format,
    so list may contain things like "1" for "chr1", "I" for "chrI", "MT" for "chrM" etc. */
 int i;
 struct bamChromInfo *list = NULL;
 bam_header_t *bamHeader = fh->header;
 if(bamHeader == NULL)
     return NULL;
 for(i = 0; i < bamHeader->n_targets; i++)
     {
     struct bamChromInfo *info = NULL;
     AllocVar(info);
     info->name = cloneString(bamHeader->target_name[i]);
     info->size = bamHeader->target_len[i];
@@ -600,30 +657,31 @@
 	}
     fprintf(f, "%s\t%d\t%d\t.\t0\t%c\n", chrom, start, end, strand);
     }
 if (err < 0 && err != -1)
     errnoAbort("samread err %d", err);
 samclose(sf);
 }
 
 void samToBed(char *samIn, char *bedOut)
 /* samToBed - Convert SAM file to a pretty simple minded bed file.. */
 {
 FILE *f = mustOpen(bedOut, "w");
 samToOpenBed(samIn, f);
 carefulClose(&f);
 }
+#endif
 
 #else
 // If we're not compiling with samtools, make stub routines so compile won't fail:
 
 boolean bamFileExists(char *bamFileName)
 /* Return TRUE if we can successfully open the bam file and its index file. */
 {
 warn(COMPILE_WITH_SAMTOOLS, "bamFileExists");
 return FALSE;
 }
 
 void bamFileAndIndexMustExist(char *fileOrUrl)
 /* Open both a bam file and its accompanying index or errAbort; this is what it
  * takes for diagnostic info to propagate up through errCatches in calling code. */
 {