839d2067c1871e5ffa53842f1b9c04806cf14f97
jcasper
  Thu Jul 25 11:51:10 2024 -0700
The new straw library should recognize when it's doing a bad data retrieval.
Also added a rudimentary way to retrieve available normalization options, refs #33225

diff --git src/hg/lib/straw/new_straw.cpp src/hg/lib/straw/new_straw.cpp
index c10a798..c7428df 100644
--- src/hg/lib/straw/new_straw.cpp
+++ src/hg/lib/straw/new_straw.cpp
@@ -31,30 +31,36 @@
 #include <utility>
 #include <vector>
 #include <streambuf>
 //#include <curl/curl.h>
 #include <iterator>
 #include <algorithm>
 #include "zlib.h"
 //#include "straw.h"
 #include "new_straw.h"
 extern "C" {
 #include "../../inc/fakeCurl.h"
 }
 
 using namespace std;
 
+
+// Added UCSC: Dirty hack, but it's hard to quickly extract a list of available normalization options.
+// This gets populated by readFooter and readFooterURL, so you have to make a quick dummy data request
+// with straw() to set it up (e.g. to the first listed chromosome for a 1 bp window).
+set<string> globalNormOptions;
+
 /*
   Straw: fast C++ implementation of dump. Not as fully featured as the
   Java version. Reads the .hic file, finds the appropriate matrix and slice
   of data, and outputs as text in sparse upper triangular format.
 
   Currently only supporting matrices.
 
   Usage: straw [observed/oe/expected] <NONE/VC/VC_SQRT/KR> <hicFile(s)> <chr1>[:x1:x2] <chr2>[:y1:y2] <BP/FRAG> <binsize>
  */
 
 // callback for libcurl. data written to this buffer
 static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) {
     size_t realsize = size * nmemb;
     struct MemoryStruct *mem;
     mem = (struct MemoryStruct *) userp;
@@ -70,34 +76,38 @@
     mem->size += realsize;
     mem->memory[mem->size] = 0;
 
     return realsize;
 }
 
 // get a buffer that can be used as an input stream from the URL
 char *getData(CURL *curl, int64_t position, int64_t chunksize) {
     std::ostringstream oss;
     struct MemoryStruct chunk{};
     chunk.memory = static_cast<char *>(malloc(1));
     chunk.size = 0;    /* no data at this point */
     oss << position << "-" << position + chunksize;
     curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *) &chunk);
     curl_easy_setopt(curl, CURLOPT_RANGE, oss.str().c_str());
+    // Setting FAILONERROR because otherwise this library has a bad habit of assuming data exists
+    // and parsing what it gets back, regardless of whether it has run off the end of the file.
+    curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1);
     CURLcode res = curl_easy_perform(curl);
     if (res != CURLE_OK) {
-        fprintf(stderr, "curl_easy_perform() failed: %s\n",
-                curl_easy_strerror(res));
+        throw strawException("curl_easy_perform() failed: " + string(curl_easy_strerror(res)));
+        //fprintf(stderr, "curl_easy_perform() failed: %s\n",
+        //        curl_easy_strerror(res));
     }
 
     return chunk.memory;
 }
 
 bool readMagicString(istream &fin) {
     string str;
     getline(fin, str, '\0');
     return str[0] == 'H' && str[1] == 'I' && str[2] == 'C';
 }
 
 char readCharFromFile(istream &fin) {
     char tempChar;
     fin.read(&tempChar, sizeof(char));
     return tempChar;
@@ -135,30 +145,31 @@
 
 void convertGenomeToBinPos(const int64_t origRegionIndices[4], int64_t regionIndices[4], int32_t resolution) {
     for (uint16_t q = 0; q < 4; q++) {
         // used to find the blocks we need to access
         regionIndices[q] = origRegionIndices[q] / resolution;
     }
 }
 
 static CURL *initCURL(const char *url) {
     CURL *curl = curl_easy_init();
     if (curl) {
         curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
         curl_easy_setopt(curl, CURLOPT_URL, url);
         curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
         curl_easy_setopt(curl, CURLOPT_USERAGENT, "straw");
+        curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1);
     } else {
         throw strawException("Unable to initialize curl");
         //cerr << "Unable to initialize curl " << endl;
         //exit(2);
     }
     return curl;
 }
 
 class HiCFileStream {
 public:
     string prefix = "http"; // HTTP code
     ifstream fin;
     CURL *curl;
     bool isHttp = false;
 
@@ -508,30 +519,34 @@
     if (!found) {
         cerr << "Remote file doesn't have the given chr_chr map " << key << endl;
         return false;
     }
 
     if ((matrixType == "observed" && norm == "NONE") ||
         ((matrixType == "oe" || matrixType == "expected") && norm == "NONE" && c1 != c2))
         return true; // no need to read norm vector index
 
     // read in and ignore expected value maps; don't store; reading these to
     // get to norm vector index
     buffer = getData(curl, currentPointer, 100);
     memstream fin3(buffer, 100);
 
     int32_t nExpectedValues = readInt32FromFile(fin3);
+    if (fin3.bad() || fin3.fail())
+    {
+        throw strawException("Unable to read expected values from file - it might be truncated early");
+    }
     currentPointer += 4;
     delete buffer;
     for (int i = 0; i < nExpectedValues; i++) {
 
         buffer = getData(curl, currentPointer, 1000);
         memstream fin4(buffer, 1000);
 
         string unit0;
         currentPointer += readStringFromURL(fin4, unit0);
 
         int32_t binSize = readInt32FromFile(fin4);
         currentPointer += 4;
 
         int64_t nValues;
         if (version > 8) {
@@ -557,30 +572,34 @@
 
         currentPointer += readThroughNormalizationFactorsURL(curl, currentPointer, version, store, expectedValues, c1, nNormalizationFactors);
     }
 
     if (c1 == c2 && (matrixType == "oe" || matrixType == "expected") && norm == "NONE") {
         if (expectedValues.empty()) {
             cerr << "Remote file did not contain expected values vectors at " << resolution << " " << unit << endl;
             return false;
         }
         return true;
     }
 
     buffer = getData(curl, currentPointer, 100);
     memstream fin6(buffer, 100);
     nExpectedValues = readInt32FromFile(fin6);
+    if (fin6.bad() || fin6.fail())
+    {
+        throw strawException("Unable to read normalized expected values from file - it might be truncated early");
+    }
     currentPointer += 4;
     delete buffer;
     for (int i = 0; i < nExpectedValues; i++) {
         buffer = getData(curl, currentPointer, 1000);
         memstream fin7(buffer, 1000);
 
         string nType, unit0;
         currentPointer += readStringFromURL(fin7, nType);
         currentPointer += readStringFromURL(fin7, unit0);
 
         int32_t binSize = readInt32FromFile(fin7);
         currentPointer += 4;
 
         int64_t nValues;
         if (version > 8) {
@@ -604,43 +623,53 @@
         delete buffer;
 
         currentPointer += readThroughNormalizationFactorsURL(curl, currentPointer, version, store, expectedValues, c1, nNormalizationFactors);
     }
 
     if (c1 == c2 && (matrixType == "oe" || matrixType == "expected") && norm != "NONE") {
         if (expectedValues.empty()) {
             cerr << "Remote file did not contain normalized expected values vectors at " << resolution << " " << unit << endl;
             return false;
         }
     }
 
     buffer = getData(curl, currentPointer, 100);
     memstream fin9(buffer, 100);
     nEntries = readInt32FromFile(fin9);
+    if (fin9.bad() || fin9.fail())
+    {
+        throw strawException("Unable to read normalization vectors from file - it might be truncated early");
+    }
     currentPointer += 4;
     delete buffer;
 
     bool found1 = false;
     bool found2 = false;
     int32_t bufferSize2 = nEntries*60;
     buffer = getData(curl, currentPointer, bufferSize2);
     memstream fin10(buffer, bufferSize2);
 
+    // added UCSC
+    globalNormOptions.clear();
+
     for (int i = 0; i < nEntries; i++) {
         string normtype;
         currentPointer += readStringFromURL(fin10, normtype);
 
+        // added UCSC
+        globalNormOptions.insert(normtype);
+
         int32_t chrIdx = readInt32FromFile(fin10);
         currentPointer += 4;
         string unit1;
         currentPointer += readStringFromURL(fin10, unit1);
 
         int32_t resolution1 = readInt32FromFile(fin10);
         int64_t filePosition = readInt64FromFile(fin10);
         currentPointer += 12;
 
         int64_t sizeInBytes;
         if (version > 8) {
             sizeInBytes = readInt64FromFile(fin10);
             currentPointer += 8;
         } else {
             sizeInBytes = (int64_t) readInt32FromFile(fin10);
@@ -692,101 +721,120 @@
             found = true;
         }
     }
     if (!found) {
         cerr << "File doesn't have the given chr_chr map " << key << endl;
         return false;
     }
 
     if ((matrixType == "observed" && norm == "NONE") ||
         ((matrixType == "oe" || matrixType == "expected") && norm == "NONE" && c1 != c2))
         return true; // no need to read norm vector index
 
     // read in and ignore expected value maps; don't store; reading these to
     // get to norm vector index
     int32_t nExpectedValues = readInt32FromFile(fin);
+    if (fin.bad() || fin.fail())
+    {
+        throw strawException("Unable to read expected values from file - it might be truncated early");
+    }
     for (int i = 0; i < nExpectedValues; i++) {
         string unit0;
         getline(fin, unit0, '\0'); //unit
         int32_t binSize = readInt32FromFile(fin);
 
         int64_t nValues;
         if (version > 8) {
             nValues = readInt64FromFile(fin);
         } else {
             nValues = (int64_t) readInt32FromFile(fin);
         }
 
         bool store = c1 == c2 && (matrixType == "oe" || matrixType == "expected") && norm == "NONE" && unit0 == unit &&
                      binSize == resolution;
         readThroughExpectedVector(version, fin, expectedValues, nValues, store, resolution);
         readThroughNormalizationFactors(fin, version, store, expectedValues, c1);
     }
 
     if (c1 == c2 && (matrixType == "oe" || matrixType == "expected") && norm == "NONE") {
         if (expectedValues.empty()) {
             cerr << "File did not contain expected values vectors at " << resolution << " " << unit << endl;
             return false;
         }
         return true;
     }
 
     nExpectedValues = readInt32FromFile(fin);
+    if (fin.bad() || fin.fail())
+    {
+        throw strawException("Unable to read normalized expected values from file - it might be truncated early");
+    }
+
     for (int i = 0; i < nExpectedValues; i++) {
         string nType, unit0;
         getline(fin, nType, '\0'); //typeString
         getline(fin, unit0, '\0'); //unit
         int32_t binSize = readInt32FromFile(fin);
-
         int64_t nValues;
         if (version > 8) {
             nValues = readInt64FromFile(fin);
         } else {
             nValues = (int64_t) readInt32FromFile(fin);
         }
         bool store = c1 == c2 && (matrixType == "oe" || matrixType == "expected") && nType == norm && unit0 == unit &&
                      binSize == resolution;
         readThroughExpectedVector(version, fin, expectedValues, nValues, store, resolution);
         readThroughNormalizationFactors(fin, version, store, expectedValues, c1);
     }
 
     if (c1 == c2 && (matrixType == "oe" || matrixType == "expected") && norm != "NONE") {
         if (expectedValues.empty()) {
             cerr << "File did not contain normalized expected values vectors at " << resolution << " " << unit << endl;
             return false;
         }
     }
 
     // Index of normalization vectors
     nEntries = readInt32FromFile(fin);
     bool found1 = false;
     bool found2 = false;
+
+    // added UCSC
+    globalNormOptions.clear();
+    if (fin.bad() || fin.fail())
+    {
+        throw strawException("Unable to read normalization vectors from file - it might be truncated early");
+    }
+
     for (int i = 0; i < nEntries; i++) {
         string normtype;
         getline(fin, normtype, '\0'); //normalization type
         int32_t chrIdx = readInt32FromFile(fin);
         string unit1;
         getline(fin, unit1, '\0'); //unit
         int32_t resolution1 = readInt32FromFile(fin);
         int64_t filePosition = readInt64FromFile(fin);
         int64_t sizeInBytes;
         if (version > 8) {
             sizeInBytes = readInt64FromFile(fin);
         } else {
             sizeInBytes = (int64_t) readInt32FromFile(fin);
         }
 
+        // added UCSC
+        globalNormOptions.insert(normtype);
+
         if (chrIdx == c1 && normtype == norm && unit1 == unit && resolution1 == resolution) {
             c1NormEntry.position = filePosition;
             c1NormEntry.size = sizeInBytes;
             found1 = true;
         }
         if (chrIdx == c2 && normtype == norm && unit1 == unit && resolution1 == resolution) {
             c2NormEntry.position = filePosition;
             c2NormEntry.size = sizeInBytes;
             found2 = true;
         }
     }
     if (!found1 || !found2) {
         cerr << "File did not contain " << norm << " normalization vectors for one or both chromosomes at "
              << resolution << " " << unit << endl;
     }
@@ -1281,30 +1329,34 @@
         } else {
             // readMatrix will assign blockBinCount and blockColumnCount
             blockMap = readMatrix(stream2->fin, myFilePos, unit, resolution, sumCounts,
                                   blockBinCount,
                                   blockColumnCount);
         }
         stream2->close();
 
         if (!isIntra) {
             avgCount = (sumCounts / numBins1) / numBins2;   // <= trying to avoid overflows
         }
     }
 
     static vector<double> readNormalizationVectorFromFooter(indexEntry cNormEntry, int32_t &version,
                                                             const string &fileName) {
+        if (cNormEntry.size == 0) {
+            // no normalization entries in this file.  Bow out now and avoid an allocation error
+            throw strawException("Trying to retrieve other normalization options, but none exist");
+        }
         char *buffer = readCompressedBytesFromFile(fileName, cNormEntry);
         memstream bufferin(buffer, cNormEntry.size);
         vector<double> cNorm = readNormalizationVector(bufferin, version);
         delete buffer;
         return cNorm;
     }
 
     static bool isInRange(int32_t r, int32_t c, int32_t numRows, int32_t numCols) {
         return 0 <= r && r < numRows && 0 <= c && c < numCols;
     }
 
     set<int32_t> getBlockNumbers(int64_t *regionIndices) const {
         if (version > 8 && isIntra) {
             return getBlockNumbersForRegionFromBinPositionV9Intra(regionIndices, blockBinCount, blockColumnCount);
         } else {
@@ -1471,30 +1523,31 @@
             int32_t found2;
             found2 = static_cast<int32_t>(s.find('/'));
             //content-range: bytes 0-100000/891471462
             if ((size_t) found2 != string::npos) {
                 string total = s.substr(found2 + 1);
                 totalFileSize = stol(total);
             }
         }
 
         return numbytes;
     }
 
     static CURL *oneTimeInitCURL(const char *url) {
         CURL *curl = initCURL(url);
         curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, hdf);
+        curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1);
         return curl;
     }
 
     explicit HiCFile(const string &fileName) {
         this->fileName = fileName;
 
         // read header into buffer; 100K should be sufficient
         if (std::strncmp(fileName.c_str(), prefix.c_str(), prefix.size()) == 0) {
             CURL *curl;
             curl = oneTimeInitCURL(fileName.c_str());
             char *buffer = getData(curl, 0, 100000);
             memstream bufin(buffer, 100000);
             chromosomeMap = readHeader(bufin, master, genomeID, numChromosomes,
                                        version, nviPosition, nviLength, attributes);
             resolutions = readResolutionsFromHeader(bufin);
@@ -1680,15 +1733,23 @@
     }
     chromNames = myChromNames;
     chromSizes = myChromSizes;
 
     vector<int> myBpResolutions;
     myBpResolutions.reserve(hiCFile->resolutions.size());
     for(int32_t i = 0; i < hiCFile->resolutions.size(); i++){
         myBpResolutions.push_back(hiCFile->resolutions[i]);
     }
     bpResolutions = myBpResolutions;
 
     attributes = hiCFile->attributes;
 
     // Ignoring fragment resolutions for now, since they're never being used here
 }
+
+set<string> getNormOptions()
+/* Return the set of normalization options that have been encountered through footer parsing.
+ * The result will be empty unless at least one straw() request has been made.
+ */
+{
+    return globalNormOptions;
+}