60aca91bcce6d4fa555e6c7c91d8ff8aa9e7bd2b
jcasper
  Fri Jun 11 15:17:21 2021 -0700
Updating hic support for files with large headers (over 100kb) and improving
multi-region performance, refs #18842, #27593

diff --git src/hg/lib/straw/cStraw.cpp src/hg/lib/straw/cStraw.cpp
new file mode 100644
index 0000000..b284723
--- /dev/null
+++ src/hg/lib/straw/cStraw.cpp
@@ -0,0 +1,169 @@
+
+#include <cstring>
+#include <iostream>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <map>
+#include <set>
+#include <vector>
+#include <streambuf>
+#include "zlib.h"
+#include "straw.h"
+using namespace std;
+
+// Supplementary functions for invoking Straw in C
+
+
+void getHeaderFields(Straw *hicFile, string &genome, vector<string> &chromNames, vector<int> &chromSizes, vector<int> &bpResolutions, vector<int> &fragResolutions, vector<string> &attributes)
+/* Retrieve .hic header fields from the supplied filename and return them in the supplied variables. */
+{
+  hicFile->loadHeader();
+  genome = hicFile->genome;
+  chromNames = hicFile->chrNames;
+  chromSizes = hicFile->chrSizes;
+  bpResolutions = hicFile->bpResolutions;
+  fragResolutions = hicFile->fragResolutions;
+  map<std::string,std::string>::iterator loop;
+  for (loop=hicFile->attributes.begin(); loop!=hicFile->attributes.end(); loop++) {
+    attributes.insert(attributes.end(), loop->first);
+    attributes.insert(attributes.end(), loop->second);
+  }
+}
+
+
+extern "C" Straw *cStrawOpen(char *fname) {
+    return new Straw(fname);
+}
+
+extern "C" void cStrawClose(Straw **hicFile) {
+    delete *hicFile;
+    *hicFile = NULL;
+}
+
+extern "C" char *cStraw (Straw *hicFile, char *norm, int binsize, char *chr1loc, char *chr2loc, char *unit, int **xActual, int **yActual, double **counts, int *numRecords)
+/* Wrapper function to retrieve a data chunk from a .hic file, for use by C libraries.
+ * norm is one of NONE/VC/VC_SQRT/KR.
+ * binsize is one of the supported bin sizes as determined by cStrawHeader.
+ * chr1loc and chr2loc are the two positions to retrieve interaction data for, specified as chr:start-end.
+ * unit is one of BP/FRAG.
+ * Values are returned in newly allocated arrays in xActual, yActual, and counts, with the number of
+ * returned records in numRecords.
+ * The function returns NULL unless an error was encountered, in which case the return value points
+ * to a character string explaining the error. */
+{
+    string thisnorm(norm);
+    string thischr1loc(chr1loc);
+    string thischr2loc(chr2loc);
+    string thisunit(unit);
+    vector<int> thisx;
+    vector<int> thisy;
+    vector<float> thiscounts;
+    try {
+      hicFile->straw(thisnorm, binsize, thischr1loc, thischr2loc, unit, thisx, thisy, thiscounts);
+    } catch (strawException& err) {
+      char *errMsg = (char*) calloc((size_t) strlen(err.what())+1, sizeof(char));
+      strcpy(errMsg, err.what());
+      return errMsg;
+    }
+    *numRecords = thisx.size();
+    *xActual = (int*) calloc((size_t) *numRecords, sizeof(int));
+    *yActual = (int*) calloc((size_t) *numRecords, sizeof(int));
+    *counts = (double*) calloc((size_t) *numRecords, sizeof(double));
+    for (int i=0; i<*numRecords; i++)
+        {
+        (*xActual)[i] = thisx[i];
+        (*yActual)[i] = thisy[i];
+        (*counts)[i] = (double) thiscounts[i];
+        }
+    return NULL;
+}
+
+
+extern "C" char *cStrawHeader (Straw *hicFile, char **genome, char ***chromNames, int **chromSizes, int *nChroms, char ***bpResolutions, int *nBpRes, char ***fragResolutions, int *nFragRes, char ***attributes, int *nAttributes)
+/* Wrapper function to retrieve header fields from a .hic file, for use by C libraries.
+ * This retrieves the assembly name, list of chromosome names, list of available binsize resolutions,
+ * and list of available fragment resolutions in the specific .hic file.
+ * The function returns NULL unless an error was encountered, in which case the return value points
+ * to a character string explaining the error. */
+{
+  string genomeString;
+  vector<string> chromNameVector;
+  vector<int> chromSizeVector;
+  vector<int> bpResolutionVector;
+  vector<int> fragResolutionVector;
+  vector<string> attributeVector;
+  try {
+    getHeaderFields(hicFile, genomeString, chromNameVector, chromSizeVector, bpResolutionVector, fragResolutionVector, attributeVector);
+  } catch (strawException& err) {
+    char *errMsg = (char*) calloc((size_t) strlen(err.what())+1, sizeof(char));
+    strcpy(errMsg, err.what());
+    return errMsg;
+  }
+  if (genome != NULL)
+  {
+    *genome = (char*) malloc((genomeString.length()+1)*sizeof(char));
+    strcpy(*genome, genomeString.c_str());
+  }
+  if (nChroms != NULL)
+  {
+    *nChroms = chromNameVector.size();
+  }
+  if (chromNames != NULL)
+  {
+    *chromNames = (char**) calloc((size_t) chromNameVector.size(), sizeof(char*));
+    for (int i=0; i<chromNameVector.size(); i++)
+    {
+      (*chromNames)[i] = (char*) malloc((chromNameVector[i].length()+1)*sizeof(char));
+      strcpy((*chromNames)[i], chromNameVector[i].c_str());
+    }
+  }
+  if (chromSizes != NULL)
+  {
+    *chromSizes = (int*) calloc((size_t) chromSizeVector.size(), sizeof(int));
+    for (int i=0; i<chromSizeVector.size(); i++)
+    {
+      (*chromSizes)[i] = chromSizeVector[i];
+    }
+  }
+  if (nBpRes != NULL)
+  {
+    *nBpRes = bpResolutionVector.size();
+  }
+  if (bpResolutions != NULL)
+  {
+    *bpResolutions = (char**) calloc((size_t) bpResolutionVector.size(), sizeof(char*));
+    for (int i=0; i<bpResolutionVector.size(); i++)
+    {
+      (*bpResolutions)[i] = (char*) malloc((to_string(bpResolutionVector[i]).length()+1)*sizeof(char));
+      strcpy((*bpResolutions)[i], to_string(bpResolutionVector[i]).c_str());
+    }
+  }
+  if (nFragRes != NULL)
+  {
+    *nFragRes = fragResolutionVector.size();
+  }
+  if (fragResolutions != NULL)
+  {
+    *fragResolutions = (char**) calloc((size_t) fragResolutionVector.size(), sizeof(char*));
+    for (int i=0; i<fragResolutionVector.size(); i++)
+    {
+      (*fragResolutions)[i] = (char*) malloc((to_string(fragResolutionVector[i]).length()+1)*sizeof(char));
+      strcpy((*fragResolutions)[i], to_string(fragResolutionVector[i]).c_str());
+    }
+  }
+  if (nAttributes != NULL)
+  {
+    *nAttributes = attributeVector.size();
+  }
+  if (attributes != NULL)
+  {
+    *attributes = (char**) calloc((size_t) attributeVector.size(), sizeof(char*));
+    for (int i=0; i<attributeVector.size(); i++)
+    {
+      (*attributes)[i] = (char*) malloc((attributeVector[i].length()+1)*sizeof(char));
+      strcpy((*attributes)[i], attributeVector[i].c_str());
+    }
+  }
+  return NULL;
+}