1ff2e18ba5e50dd2d1becd3cd6a25343578c10b8
markd
  Tue Apr 21 14:09:04 2026 -0700
rework of htslib/UDC integration to use htslib hfile driver mechanism

diff --git src/lib/bamFile.c src/lib/bamFile.c
index 9f4f9ab22a5..7da5e819dff 100644
--- src/lib/bamFile.c
+++ src/lib/bamFile.c
@@ -1,86 +1,151 @@
 /* bamFile -- interface to binary alignment format files using Heng Li's samtools lib. */
 
 /* Copyright (C) 2014 The Regents of the University of California 
  * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 
 #include "common.h"
 #include "portable.h"
 #include "bamFile.h"
 #include "htmshell.h"
-#include "cram/cram_samtools.h"
-#include "cram/sam_header.h"
-#include "cram/cram_structs.h"
 #include "htslib/cram.h"
 #include "udc.h"
 #include "psl.h"
 
-// If KNETFILE_HOOKS is used (as recommended!), then we can simply call bam_index_load
-// without worrying about the samtools lib creating local cache files in cgi-bin:
+#include <limits.h>
 
-static bam_index_t *bamOpenIndexGivenUrl(samfile_t *sam, char *fileOrUrl, char *baiFileOrUrl)
-/* If fileOrUrl has a valid accompanying .bai file, parse and return the index;
- * otherwise return NULL. baiFileOrUrl can be NULL. 
- * The difference to bamOpenIndex is that the URL/filename of the bai file can be specified. */
+void cramCheckRefs(samfile_t *fh, char *refUrl, char *cacheDir)
+/* Check if all CRAM reference sequences are locally available.  If any are
+ * missing, create pending-download request files and errAbort with a message
+ * telling the user to refresh.  Uses the new cram_check_required_refs() API
+ * to pre-check before reading rather than discovering failures mid-decode. */
 {
-if (sam->format.format == cram) 
+if (fh->format.format != cram || cacheDir == NULL)
+    return;
+
+/* Set REF_CACHE so htslib's ref resolution finds our cache dir */
+setenv("REF_CACHE", cacheDir, 1);
+
+cram_fd *cfd = fh->fp.cram;
+cram_ref_check_t *chk = cram_check_required_refs(cfd, NULL, cacheDir);
+if (chk == NULL || chk->n_missing == 0)
     {
-    if (baiFileOrUrl)
-       return sam_index_load2(sam, fileOrUrl, baiFileOrUrl);
+    cram_ref_check_free(chk);
+    return;
+    }
+
+/* Use first missing ref for the error message (matches old behavior) */
+char *md5String = chk->refs[0].md5;
+
+char errorFile[4096];
+char pendingFile[4096];
+safef(errorFile, sizeof errorFile, "%s/error/%s", cacheDir, md5String);
+
+FILE *downFile;
+if ((downFile = fopen(errorFile, "r")) != NULL)
+    {
+    char errorBuf[4096];
+    mustGetLine(downFile, errorBuf, sizeof errorBuf);
+    fclose(downFile);
+    cram_ref_check_free(chk);
+    errAbort("cannot find reference %s.  Error: %s", md5String, errorBuf);
+    }
+
+/* Create pending download requests for all missing refs */
+safef(pendingFile, sizeof pendingFile, "%s/pending/", cacheDir);
+makeDirsOnPath(pendingFile);
+
+for (int i = 0; i < chk->n_missing; i++)
+    {
+    char *md5 = chk->refs[i].md5;
+    safef(pendingFile, sizeof pendingFile, "%s/pending/%s", cacheDir, md5);
+    downFile = fopen(pendingFile, "w");
+    if (downFile == NULL)
+        {
+        cram_ref_check_free(chk);
+        errAbort("cannot find reference %s.  Cannot create file %s.", md5, pendingFile);
+        }
+    if (refUrl != NULL)
+        fprintf(downFile, refUrl, md5);
+    else
+        fprintf(downFile, "http://www.ebi.ac.uk:80/ena/cram/md5/%s", md5);
+    fprintf(downFile, "\n");
+    fclose(downFile);
+    }
+
+char server[4096];
+if (refUrl != NULL)
+    safef(server, sizeof server, refUrl, md5String);
 else
-       return sam_index_load(sam, fileOrUrl);
+    safef(server, sizeof server, "http://www.ebi.ac.uk:80/ena/cram/md5/%s", md5String);
+
+cram_ref_check_free(chk);
+errAbort("Downloading reference from %s. "
+         "Please refresh screen to check download status.", server);
 }
 
-// assume that index is a .bai file 
+// If KNETFILE_HOOKS is used (as recommended!), then we can simply call bam_index_load
+// without worrying about the samtools lib creating local cache files in cgi-bin:
+
+static bam_index_t *tryIndexCandidates(samfile_t *sam, char *fileOrUrl, const char *ext)
+/* Try loading index by appending ext, then by replacing the suffix with ext.
+ * Passes explicit index name to sam_index_load2 so htslib does not attempt
+ * to download a remote index into the (non-writable) CWD. */
+{
 char indexName[4096];
 bam_index_t *ret = NULL;
-if (baiFileOrUrl==NULL)
-    {
-    // first try tacking .bai on the end of the bam file name
-    safef(indexName, sizeof indexName, "%s.bai", fileOrUrl);
-    if ((ret =  sam_index_load2(sam, fileOrUrl, indexName)) == NULL)
-        {
-        // since the open didn't work, try replacing suffix (if any) with .bai
-        safef(indexName, sizeof indexName - sizeof(".bai"), "%s", fileOrUrl);
+
+safef(indexName, sizeof indexName, "%s%s", fileOrUrl, ext);
+if ((ret = sam_index_load2(sam, fileOrUrl, indexName)) != NULL)
+    return ret;
+
+safef(indexName, sizeof indexName, "%s", fileOrUrl);
 char *lastDot = strrchr(indexName, '.');
-        if (lastDot)
+if (lastDot && (lastDot - indexName) + strlen(ext) < sizeof indexName)
     {
-            strcpy(lastDot, ".bai");
+    strcpy(lastDot, ext);
     ret = sam_index_load2(sam, fileOrUrl, indexName);
     }
+return ret;
 }
-    }
-else
-    ret = sam_index_load2(sam, fileOrUrl, baiFileOrUrl);
 
-return ret;
+static bam_index_t *bamOpenIndexGivenUrl(samfile_t *sam, char *fileOrUrl, char *baiFileOrUrl)
+/* If fileOrUrl has a valid accompanying .bai/.crai file, parse and return the index;
+ * otherwise return NULL. baiFileOrUrl can be NULL.
+ * The difference to bamOpenIndex is that the URL/filename of the index file can be specified. */
+{
+if (baiFileOrUrl != NULL)
+    return sam_index_load2(sam, fileOrUrl, baiFileOrUrl);
+
+const char *ext = (sam->format.format == cram) ? ".crai" : ".bai";
+return tryIndexCandidates(sam, fileOrUrl, ext);
 }
 
 
 static void bamCloseIdx(bam_index_t **pIdx)
 /* Free unless already NULL. */
 {
 if (pIdx != NULL && *pIdx != NULL)
     {
     free(*pIdx); // Not freeMem, freez etc -- sam just uses malloc/calloc.
     *pIdx = NULL;
     }
 }
 
 static bam_index_t *bamOpenIdx(samfile_t *sam, char *fileOrUrl)
-/* If fileOrUrl has a valid accompanying .bai file, parse and return the index;
- * otherwise return NULL. baiFileOrUrl can be NULL. */
+/* If fileOrUrl has a valid accompanying .bai/.crai file, parse and return the index;
+ * otherwise return NULL. */
 {
 return bamOpenIndexGivenUrl(sam, fileOrUrl, NULL);
 }
 
 boolean bamFileExists(char *fileOrUrl)
 /* Return TRUE if we can successfully open the bam file and its index file.
  * NOTE: this doesn't give enough diagnostics */
 {
 char *bamFileName = fileOrUrl;
 samfile_t *fh = samopen(bamFileName, "rb", NULL);
 if (fh != NULL)
     {
     bam_index_t *idx = bamOpenIdx(fh, bamFileName);
     samclose(fh);
     if (idx == NULL)
@@ -149,31 +214,34 @@
 {
 if (pSamFile != NULL)
     {
     samclose(*pSamFile);
     *pSamFile = NULL;
     }
 }
 
 void bamFileAndIndexMustExist(char *fileOrUrl, char *baiFileOrUrl)
 /* Open both a bam file and its accompanying index or errAbort; this is what it
  * takes for diagnostic info to propagate up through errCatches in calling code. */
 {
 samfile_t *bamF = bamOpen(fileOrUrl, NULL);
 bam_index_t *idx = bamOpenIndexGivenUrl(bamF, fileOrUrl, baiFileOrUrl);
 if (idx == NULL)
-    errAbort("failed to read index file (.bai) corresponding to %s", fileOrUrl);
+    {
+    const char *ext = (bamF->format.format == cram) ? ".crai" : ".bai";
+    errAbort("failed to read index file (%s) corresponding to %s", ext, fileOrUrl);
+    }
 bamCloseIdx(&idx);
 bamClose(&bamF);
 }
 
 void bamFetchAlreadyOpen(samfile_t *samfile, bam_hdr_t *header,  bam_index_t *idx, char *bamFileName, 
 			 char *position, bam_fetch_f callbackFunc, void *callbackData)
 /* With the open bam file, return items the same way with the callbacks as with bamFetch() */
 /* except in this case use an already-open bam file and index (use bam_index_load and free() for */
 /* the index). It seems a little strange to pass the filename in with the open bam, but */
 /* it's just used to report errors. */
 {
 bam1_t *b;
 AllocVar(b);
 hts_itr_t *iter = sam_itr_querys(idx, header, position);
 if (iter == NULL && startsWith("chr", position))
@@ -186,124 +254,79 @@
     {
     char *colon = strrchr(position, ':');
     char customPos[512];
     if (colon)
         safef(customPos, sizeof customPos, "%s%s", header->target_name[0], colon);
     else
         safecpy(customPos, sizeof customPos, header->target_name[0]);
     iter = sam_itr_querys(idx, header, customPos);
     }
 
 if (iter == NULL)
     return;
 int result;
 while ((result = sam_itr_next(samfile, iter, b)) >= 0)
     callbackFunc(b, callbackData, header);
-
-// if we're reading a CRAM file and the MD5 string has been set
-// we know there was an error finding the reference and we need
-// to request that it be loaded.
-if (samfile->format.format == cram) 
-    {
-    char *md5String =  cram_get_Md5(samfile);
-    if (!isEmpty(md5String))
-        {
-        char server[4096];
-        char pendingFile[4096];
-        char errorFile[4096];
-
-        char *refPath = cram_get_ref_url(samfile);
-        char *cacheDir = cram_get_cache_dir(samfile);
-
-        sprintf(server, refPath, md5String);
-        sprintf(pendingFile, "%s/pending/",cacheDir);
-        sprintf(errorFile, "%s/error/%s",cacheDir,md5String);
-        makeDirsOnPath(pendingFile);
-        sprintf(pendingFile, "%s/pending/%s",cacheDir,md5String);
-        FILE *downFile;
-        if ((downFile = fopen(errorFile, "r")) != NULL)
-            {
-            char errorBuf[4096];
-            mustGetLine(downFile, errorBuf, sizeof errorBuf);
-            errAbort("cannot find reference %s.  Error: %s\n", md5String,errorBuf);
-            }
-        else
-            {
-            if ((downFile = fopen(pendingFile, "w")) == NULL)
-                errAbort("cannot find reference %s.  Cannot create file %s.",md5String, pendingFile);  
-            fprintf(downFile,  "%s\n", server);
-            fclose(downFile);
-            errAbort("Cannot find reference %s.  Downloading from %s. Please refresh screen to check download status.",md5String, server);
-            }
-        }
-    }
 }
 
 void bamAndIndexFetchPlus(char *fileOrUrl, char *baiFileOrUrl, char *position, bam_fetch_f callbackFunc, void *callbackData,
 		 samfile_t **pSamFile, char *refUrl, char *cacheDir)
-/* Open the .bam file with the .bai index specified by baiFileOrUrl.
- * baiFileOrUrl can be NULL and defaults to <fileOrUrl>.bai.
+/* Open the BAM/CRAM file with the index specified by baiFileOrUrl.
+ * baiFileOrUrl can be NULL and defaults to <fileOrUrl>.bai (BAM) or <fileOrUrl>.crai (CRAM).
  * Fetch items in the seq:start-end position range,
  * and call callbackFunc on each bam item retrieved from the file plus callbackData.
  * This handles BAM files with "chr"-less sequence names, e.g. from Ensembl. 
  * The pSamFile parameter is optional.  If non-NULL it will be filled in, just for
  * the benefit of the callback function, with the open samFile.  */
 {
 char *bamFileName = NULL;
 samfile_t *fh = bamOpen(fileOrUrl, &bamFileName);
 if (fh->format.format == cram)
     {
     if (cacheDir == NULL)
         errAbort("CRAM cache dir hg.conf variable (cramRef) must exist for CRAM support");
-    cram_set_cache_url(fh, cacheDir, refUrl);  
+    setenv("REF_CACHE", cacheDir, 1);
     }
 bam_hdr_t *header = sam_hdr_read(fh);
 if (pSamFile != NULL)
     *pSamFile = fh;
+cramCheckRefs(fh, refUrl, cacheDir);
 bam_index_t *idx = bamOpenIndexGivenUrl(fh, bamFileName, baiFileOrUrl);
 if (idx == NULL)
     warn("bam_index_load(%s) failed.", bamFileName);
 else
     {
     bamFetchAlreadyOpen(fh, header, idx, bamFileName, position, callbackFunc, callbackData);
     bamCloseIdx(&idx);
     }
 bamClose(&fh);
 }
 
 static int bamGetTargetCount(char *fileOrUrl)
 /* Return the number of target sequences in a bam or cram file. */
 {
-int tCount = 0;
 htsFile *htsF = hts_open(fileOrUrl, "r");
-if (htsF->format.format == crai)
-    {
-    SAM_hdr *cramHdr = cram_fd_get_header(htsF->fp.cram);
-    tCount = cramHdr->nref;
-    }
-else
-    {
-    bam_hdr_t *bamHdr = bam_hdr_read(htsF->fp.bgzf);
-    tCount = bamHdr->n_targets;
-    }
+bam_hdr_t *header = sam_hdr_read(htsF);
+int tCount = sam_hdr_nref(header);
+bam_hdr_destroy(header);
 hts_close(htsF);
 return tCount;
 }
 
 long long bamFileItemCount(char *fileOrUrl, char *baiFileOrUrl)
 /* Return the total number of mapped items across all sequences in fileOrUrl, using index file.
- * If baiFileOrUrl is NULL, the index file is assumed to be fileOrUrl.bai.
+ * If baiFileOrUrl is NULL, the index file is assumed to be fileOrUrl.bai (BAM) or fileOrUrl.crai (CRAM).
  * NOTE: not all bam index files include mapped item counts, so this may return 0 even for large
  * bam.  As of May 2019, our copy of hts_idx_get_stat does not support cram indexes
  * (perhaps they never include counts?), so this always returns 0 for cram. */
 {
 long long itemCount = 0;
 hts_idx_t *idx = NULL;
 if (isNotEmpty(baiFileOrUrl))
     idx = hts_idx_load2(fileOrUrl, baiFileOrUrl);
 else
     {
     int format = endsWith(fileOrUrl, ".cram") ? HTS_FMT_CRAI : HTS_FMT_BAI;
     idx = hts_idx_load(fileOrUrl, format);
     }
 if (idx == NULL)
     warn("bamFileItemCount: hts_idx_load(%s) failed.", baiFileOrUrl ? baiFileOrUrl : fileOrUrl);