032653c06fe43263c625613a538389fc2425976b
jcasper
  Tue Jul 16 17:16:51 2024 -0700
Updated hic straw library adapted for UCSC, with a udc-enabled curl substitute.  refs #33225

diff --git src/hg/lib/straw/new_straw.cpp src/hg/lib/straw/new_straw.cpp
index 37fef36..c10a798 100644
--- src/hg/lib/straw/new_straw.cpp
+++ src/hg/lib/straw/new_straw.cpp
@@ -19,35 +19,39 @@
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  THE SOFTWARE.
 */
 #include <cstring>
 #include <iostream>
 #include <fstream>
 #include <sstream>
 #include <map>
 #include <cmath>
 #include <set>
 #include <utility>
 #include <vector>
 #include <streambuf>
-#include <curl/curl.h>
+//#include <curl/curl.h>
 #include <iterator>
 #include <algorithm>
 #include "zlib.h"
-#include "straw.h"
+//#include "straw.h"
+#include "new_straw.h"
+extern "C" {
+#include "../../inc/fakeCurl.h"
+}
 
 using namespace std;
 
 /*
   Straw: fast C++ implementation of dump. Not as fully featured as the
   Java version. Reads the .hic file, finds the appropriate matrix and slice
   of data, and outputs as text in sparse upper triangular format.
 
   Currently only supporting matrices.
 
   Usage: straw [observed/oe/expected] <NONE/VC/VC_SQRT/KR> <hicFile(s)> <chr1>[:x1:x2] <chr2>[:y1:y2] <BP/FRAG> <binsize>
  */
 
 // callback for libcurl. data written to this buffer
 static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) {
@@ -132,56 +136,59 @@
 void convertGenomeToBinPos(const int64_t origRegionIndices[4], int64_t regionIndices[4], int32_t resolution) {
     for (uint16_t q = 0; q < 4; q++) {
         // used to find the blocks we need to access
         regionIndices[q] = origRegionIndices[q] / resolution;
     }
 }
 
 static CURL *initCURL(const char *url) {
     CURL *curl = curl_easy_init();
     if (curl) {
         curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
         curl_easy_setopt(curl, CURLOPT_URL, url);
         curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
         curl_easy_setopt(curl, CURLOPT_USERAGENT, "straw");
     } else {
-        cerr << "Unable to initialize curl " << endl;
-        exit(2);
+        throw strawException("Unable to initialize curl");
+        //cerr << "Unable to initialize curl " << endl;
+        //exit(2);
     }
     return curl;
 }
 
 class HiCFileStream {
 public:
     string prefix = "http"; // HTTP code
     ifstream fin;
     CURL *curl;
     bool isHttp = false;
 
     explicit HiCFileStream(const string &fileName) {
         if (std::strncmp(fileName.c_str(), prefix.c_str(), prefix.size()) == 0) {
             isHttp = true;
             curl = initCURL(fileName.c_str());
             if (!curl) {
-                cerr << "URL " << fileName << " cannot be opened for reading" << endl;
-                exit(3);
+                throw strawException("URL " + fileName + " cannot be opened for reading");
+                //cerr << "URL " << fileName << " cannot be opened for reading" << endl;
+                //exit(3);
             }
         } else {
             fin.open(fileName, fstream::in | fstream::binary);
             if (!fin) {
-                cerr << "File " << fileName << " cannot be opened for reading" << endl;
-                exit(4);
+                throw strawException("File " + fileName + " cannot be opened for reading");
+                //cerr << "File " << fileName << " cannot be opened for reading" << endl;
+                //exit(4);
             }
         }
     }
 
     void close() {
         if (isHttp) {
             curl_easy_cleanup(curl);
         } else {
             fin.close();
         }
     }
 
     char *readCompressedBytes(indexEntry idx) {
         if (isHttp) {
             return getData(curl, idx.position, idx.size);
@@ -193,59 +200,67 @@
         }
     }
 };
 
 char *readCompressedBytesFromFile(const string &fileName, indexEntry idx) {
     HiCFileStream *stream = new HiCFileStream(fileName);
     char *compressedBytes = stream->readCompressedBytes(idx);
     stream->close();
     delete stream;
     return compressedBytes;
 }
 
 // reads the header, storing the positions of the normalization vectors and returning the masterIndexPosition pointer
 map<string, chromosome> readHeader(istream &fin, int64_t &masterIndexPosition, string &genomeID,
                                    int32_t &numChromosomes, int32_t &version, int64_t &nviPosition,
-                                   int64_t &nviLength) {
+                                   int64_t &nviLength, vector<string> &attributes) {
     map<string, chromosome> chromosomeMap;
     if (!readMagicString(fin)) {
-        cerr << "Hi-C magic string is missing, does not appear to be a hic file" << endl;
+        throw strawException("Hi-C magic string is missing, does not appear to be a hic file");
+        // Not sure why this function was trying to continue after this failure (the next step is
+        // reading resolutions from the file, which clearly isn't going to work).
+        //cerr << "Hi-C magic string is missing, does not appear to be a hic file" << endl;
         masterIndexPosition = -1;
         return chromosomeMap;
     }
 
     version = readInt32FromFile(fin);
     if (version < 6) {
-        cerr << "Version " << version << " no longer supported" << endl;
+        throw strawException("Version " + to_string(version) + " no longer supported");
+        // Not sure why this function was trying to continue after this failure (the next step is
+        // reading resolutions from the file, which clearly isn't going to work).
+        //cerr << "Version " << version << " no longer supported" << endl;
         masterIndexPosition = -1;
         return chromosomeMap;
     }
     masterIndexPosition = readInt64FromFile(fin);
     getline(fin, genomeID, '\0');
 
     if (version > 8) {
         nviPosition = readInt64FromFile(fin);
         nviLength = readInt64FromFile(fin);
     }
 
     int32_t nattributes = readInt32FromFile(fin);
 
-    // reading and ignoring attribute-value dictionary
+    // reading attribute-value dictionary
     for (int i = 0; i < nattributes; i++) {
         string key, value;
         getline(fin, key, '\0');
         getline(fin, value, '\0');
+        attributes.insert(attributes.end(), key);
+        attributes.insert(attributes.end(), value);
     }
 
     numChromosomes = readInt32FromFile(fin);
     // chromosome map for finding matrixType
     for (int i = 0; i < numChromosomes; i++) {
         string name;
         int64_t length;
         getline(fin, name, '\0');
         if (version > 8) {
             length = readInt64FromFile(fin);
         } else {
             length = (int64_t) readInt32FromFile(fin);
         }
 
         chromosome chr;
@@ -1429,30 +1444,31 @@
 };
 
 class HiCFile {
 public:
     string prefix = "http"; // HTTP code
     int64_t master = 0LL;
     map<string, chromosome> chromosomeMap;
     string genomeID;
     int32_t numChromosomes = 0;
     int32_t version = 0;
     int64_t nviPosition = 0LL;
     int64_t nviLength = 0LL;
     vector<int32_t> resolutions;
     static int64_t totalFileSize;
     string fileName;
+    vector<string> attributes;
 
     static size_t hdf(char *b, size_t size, size_t nitems, void *userdata) {
         size_t numbytes = size * nitems;
         b[numbytes + 1] = '\0';
         string s(b);
         int32_t found;
         found = static_cast<int32_t>(s.find("content-range"));
         if ((size_t) found == string::npos) {
             found = static_cast<int32_t>(s.find("Content-Range"));
         }
         if ((size_t) found != string::npos) {
             int32_t found2;
             found2 = static_cast<int32_t>(s.find('/'));
             //content-range: bytes 0-100000/891471462
             if ((size_t) found2 != string::npos) {
@@ -1468,43 +1484,44 @@
         CURL *curl = initCURL(url);
         curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, hdf);
         return curl;
     }
 
     explicit HiCFile(const string &fileName) {
         this->fileName = fileName;
 
         // read header into buffer; 100K should be sufficient
         if (std::strncmp(fileName.c_str(), prefix.c_str(), prefix.size()) == 0) {
             CURL *curl;
             curl = oneTimeInitCURL(fileName.c_str());
             char *buffer = getData(curl, 0, 100000);
             memstream bufin(buffer, 100000);
             chromosomeMap = readHeader(bufin, master, genomeID, numChromosomes,
-                                       version, nviPosition, nviLength);
+                                       version, nviPosition, nviLength, attributes);
             resolutions = readResolutionsFromHeader(bufin);
             curl_easy_cleanup(curl);
             delete buffer;
         } else {
             ifstream fin;
             fin.open(fileName, fstream::in | fstream::binary);
             if (!fin) {
-                cerr << "File " << fileName << " cannot be opened for reading" << endl;
-                exit(6);
+                throw strawException("File " + fileName + " cannot be opened for reading");
+                //cerr << "File " << fileName << " cannot be opened for reading" << endl;
+                //exit(6);
             }
             chromosomeMap = readHeader(fin, master, genomeID, numChromosomes,
-                                       version, nviPosition, nviLength);
+                                       version, nviPosition, nviLength, attributes);
             resolutions = readResolutionsFromHeader(fin);
             fin.close();
         }
     }
 
     string getGenomeID() const {
         return genomeID;
     }
 
     vector<int32_t> getResolutions() const {
         return resolutions;
     }
 
     vector<chromosome> getChromosomes() {
         chromosome chromosomes[chromosomeMap.size()];
@@ -1527,32 +1544,33 @@
                                        const string &norm, const string &unit, int32_t resolution) {
         chromosome chrom1 = chromosomeMap[chr1];
         chromosome chrom2 = chromosomeMap[chr2];
         return new MatrixZoomData(chrom1, chrom2, (matrixType), (norm), (unit),
                                   resolution, version, master, totalFileSize, fileName);
     }
 };
 
 int64_t HiCFile::totalFileSize = 0LL;
 
 void parsePositions(const string &chrLoc, string &chrom, int64_t &pos1, int64_t &pos2, map<string, chromosome> map) {
     string x, y;
     stringstream ss(chrLoc);
     getline(ss, chrom, ':');
     if (map.count(chrom) == 0) {
-        cerr << "chromosome " << chrom << " not found in the file." << endl;
-        exit(7);
+        throw strawException("chromosome " + chrom + " not found in the file.");
+        //cerr << "chromosome " << chrom << " not found in the file." << endl;
+        //exit(7);
     }
 
     if (getline(ss, x, ':') && getline(ss, y, ':')) {
         pos1 = stol(x);
         pos2 = stol(y);
     } else {
         pos1 = 0LL;
         pos2 = map[chrom].length;
     }
 }
 
 vector<contactRecord> straw(const string &matrixType, const string &norm, const string &fileName, const string &chr1loc,
                             const string &chr2loc, const string &unit, int32_t binsize) {
     if (!(unit == "BP" || unit == "FRAG")) {
         cerr << "Norm specified incorrectly, must be one of <BP/FRAG>" << endl;
@@ -1629,15 +1647,48 @@
     return totalNumRecords;
 }
 
 int64_t getNumRecordsForChromosomes(const string &fileName, int32_t binsize, bool interOnly) {
     HiCFile *hiCFile = new HiCFile(fileName);
     vector<chromosome> chromosomes = hiCFile->getChromosomes();
     for(int32_t i = 0; i < chromosomes.size(); i++){
         if(chromosomes[i].index <= 0) continue;
         MatrixZoomData *mzd = hiCFile->getMatrixZoomData(chromosomes[i].name, chromosomes[i].name, "observed", "NONE", "BP", binsize);
         int64_t totalNumRecords = mzd->getNumberOfTotalRecords();
         cout << chromosomes[i].name << " " << totalNumRecords << " ";
         cout << totalNumRecords*12/1000/1000/1000 << " GB" << endl;
     }
     return 0;
 }
+
+void getHeaderFields(const string &filename, string &genome, vector<string> &chromNames, vector<int> &chromSizes,
+        vector<int> &bpResolutions, vector<int> &fragResolutions, vector<string> &attributes)
+/* Fill in the provided fields with information from the header of the hic file in the supplied filename.
+ * fragResolutions is left empty for now, as we're not making use of it. */
+{
+    HiCFile *hiCFile = new HiCFile(filename);
+
+    genome.assign(hiCFile->getGenomeID());
+
+    vector<chromosome> chromList = hiCFile->getChromosomes();
+    vector<string> myChromNames;
+    myChromNames.reserve(chromList.size());
+    vector<int> myChromSizes;
+    myChromSizes.reserve(chromList.size());
+    for(int32_t i = 0; i < chromList.size(); i++){
+        myChromNames.push_back(chromList[i].name);
+        myChromSizes.push_back(chromList[i].length);
+    }
+    chromNames = myChromNames;
+    chromSizes = myChromSizes;
+
+    vector<int> myBpResolutions;
+    myBpResolutions.reserve(hiCFile->resolutions.size());
+    for(int32_t i = 0; i < hiCFile->resolutions.size(); i++){
+        myBpResolutions.push_back(hiCFile->resolutions[i]);
+    }
+    bpResolutions = myBpResolutions;
+
+    attributes = hiCFile->attributes;
+
+    // Ignoring fragment resolutions for now, since they're never being used here
+}