src/hg/lib/straw/straw.cpp bf996d33a7459741eea5a172a6cd891abca8e35e

bf996d33a7459741eea5a172a6cd891abca8e35e
jcasper
  Thu Jul 25 11:55:13 2024 -0700
Staging the replacement of the old straw library with the new.
Hopefully this results in an easier-to-read git commit history, since the library had significant changes.
refs #33225

diff --git src/hg/lib/straw/straw.cpp src/hg/lib/straw/straw.cpp
deleted file mode 100644
index 0abbfbd..0000000
--- src/hg/lib/straw/straw.cpp
+++ /dev/null
@@ -1,986 +0,0 @@
-/*
-  The straw library (straw.cpp, straw.h) is used with permission of the Aiden
-  Lab at the Broad Institute.  It is included below, along with modifications
-  to work in the environment of the UCSC Genome Browser, under the following
-  license:
-
-  The MIT License (MIT)
- 
-  Copyright (c) 2011-2016 Broad Institute, Aiden Lab
- 
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- 
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
-*/
-
-extern "C" {
-#include "common.h"
-#include "udc.h"
-#undef min
-#undef max
-}
-
-#include <cstring>
-#include <iostream>
-#include <fstream>
-#include <iostream>
-#include <sstream>
-#include <map>
-#include <set>
-#include <vector>
-#include <streambuf>
-//#include <curl/curl.h>
-#include "zlib.h"
-#include "straw.h"
-//#include "udcWrapper.h"
-using namespace std;
-
-/*
-  Straw: fast C++ implementation of dump. Not as fully featured as the
-  Java version. Reads the .hic file, finds the appropriate matrix and slice
-  of data, and outputs as text in sparse upper triangular format.
-
-  Currently only supporting matrices.
-
-  Usage: straw <NONE/VC/VC_SQRT/KR> <hicFile(s)> <chr1>[:x1:x2] <chr2>[:y1:y2] <BP/FRAG> <binsize> 
- */
-
-
-// this is for creating a stream from a byte array for ease of use
-struct membuf : std::streambuf
-{
-  membuf(char* begin, char* end) {
-    this->setg(begin, begin, end);
-  }
-};
-
-// get a buffer that can be used as an input stream from the URL
-char* getHttpData(char *url, long position, long chunksize, long *fileSize = NULL) {
-  char *buffer = new (nothrow) char[chunksize];
-  if (buffer == NULL) {
-    throw strawException("Out of memory, unable to allocate Hi-C data buffer");
-  }
-  struct udcFile *udc = udcFileMayOpen(url, NULL);
-  if (udc == NULL)
-    {
-    throw strawException("Error: unable to open udcFile for resource " + string(url));
-    }
-  if (fileSize)
-    *fileSize = udcFileSize(url);
-  udcSeek(udc, position);
-  udcRead(udc, (void*)buffer, chunksize);
-  udcFileClose(&udc);
-  return buffer;
-}
-
-
-/* hicBuffer is a start toward providing a level of abstraction for a .hic data reader,
- * independent of whether the file is local or remote.  Remote files are handled with
- * an initial request for remote data, which is transparently extended if the caller
- * attempts to read off the end of the initial read buffer.  The previous approach
- * assumed a header was never larger than 100kB; this failed for .hic files
- * with high (e.g. 23k) chromosome counts.
- */
-class hicBuffer {
-  private:
-  string stashedPath;
-  long offset;
-  long extent;
-  bool isHttp;
-  istream *buf;
-  char *httpBuffer;
-  membuf *sbuf;
-  long bufSize;
-  long fullFileSize;
-
-  void extendHttpBuffer() {
-    // deletes the existing buffer and allocates a replacement containing the next bytes
-    // from the file
-    delete buf;
-    delete sbuf;
-    delete httpBuffer;
-    offset += extent;
-    if (offset + extent > fullFileSize)
-      extent = fullFileSize - offset;
-    httpBuffer = getHttpData((char*)(stashedPath.c_str()), offset, extent, NULL);    
-    sbuf = new membuf(httpBuffer, httpBuffer + extent); 
-    buf = new istream(sbuf);
-  }
-
-  public:
-  hicBuffer(const string& path, long start, long size = 100000)
-  {
-    // HTTP code
-    stashedPath = path;
-    string prefix="http";
-    isHttp = false;
-    httpBuffer = NULL;
-    offset = start;
-    extent = size;
-
-    if (std::strncmp(path.c_str(), prefix.c_str(), prefix.size()) == 0) {
-      isHttp = true;
-      char *url = (char*) path.c_str();
-      httpBuffer = getHttpData(url, start, size, &fullFileSize);    
-      sbuf = new membuf(httpBuffer, httpBuffer + size); 
-      buf = new istream(sbuf);  
-    }
-    else {
-      ifstream* fin = new ifstream;
-      fin->open(path, fstream::in);
-      if (fin->fail()) {
-        cerr << "File " << path << " cannot be opened for reading" << endl;
-        return;
-      }
-      fin->seekg(start, ios::beg);
-      buf = fin;
-    }
-
-  }
-
-  ~hicBuffer() {
-      delete buf;
-      if (isHttp) {
-        delete sbuf;
-        delete httpBuffer;
-      }
-  }
-
-  void getline(string& str, char delim) {
-    std::getline(*buf, str, delim);
-    if(buf->eof() && isHttp && (offset + extent < fullFileSize)) {
-      this->extendHttpBuffer();
-      string str2;
-      std::getline(*buf, str2, delim);
-      str.append(str2);
-    }
-    if (buf->fail())
-      throw strawException("getline failure when reading file");
-  }
-
-  void getline(string& str) {
-    std::getline(*buf, str);
-    if (buf->eof() && isHttp && (offset + extent < fullFileSize)) {
-      this->extendHttpBuffer();
-      string str2;
-      std::getline(*buf, str2);
-      str.append(str2);
-    }
-    if (buf->fail())
-      throw strawException("getline failure when reading file");
-  }
-
-  void read(char *s, streamsize n) {
-    buf->read(s, n);
-    if(buf->eof() && isHttp && (offset + extent < fullFileSize)) {
-      streamsize bytesRead = buf->gcount();
-      this->extendHttpBuffer();
-      char s2[n];
-      buf->read(s2, n-bytesRead);
-      if (!buf->fail())
-        strncpy(s+bytesRead, s2, n-bytesRead);
-    }
-    if (buf->fail())
-      throw strawException("read() failure when accessing file");
-  }
-
-  long fileSize() {
-    return fullFileSize;
-  }
-};
-
-// returns whether or not this is valid HiC file
-bool Straw::readMagicString() {
-  string str;
-  if (fileName == "") {
-    throw strawException("Must open a hic file before checking the magic string");
-  }
-  hicBuffer header(fileName, 0, 16);
-  header.getline(str, '\0' );
-  return str[0]=='H' && str[1]=='I' && str[2]=='C';
-}
-
-// reads the header, storing the various values in the object
-void Straw::loadHeader() {
-  if (master != 0)
-    // header was already loaded for this file
-    return;
-
-  hicBuffer header(fileName, 4, 100000);
-  total_bytes = header.fileSize();
-
-  header.read((char*)&(version), sizeof(int));
-  if (version < 6) {
-    throw strawException("Version " + to_string(version) + " no longer supported");
-  }
-  header.read((char*)&master, sizeof(long));
-  header.getline(genome, '\0' );
-  int nattributes;
-  header.read((char*)&nattributes, sizeof(int));
-
-  // reading attribute-value dictionary
-  for (int i=0; i<nattributes; i++) {
-    string key, value;
-    header.getline(key, '\0');
-    header.getline(value, '\0');
-    attributes[key] = value;
-  }
-
-  header.read((char*)&nChrs, sizeof(int));
-  for (int i=0; i<nChrs; i++) {
-    string name;
-    int length;
-    header.getline(name, '\0');
-    header.read((char*)&length, sizeof(int));
-    chrNames.insert(chrNames.end(), name);
-    chrSizes.insert(chrSizes.end(), length);
-  }
-
-  int nBpResolutions;
-  header.read((char*)&nBpResolutions, sizeof(int));
-  for (int i=0; i<nBpResolutions; i++) {
-    int res;
-    header.read((char*)&res, sizeof(int));
-    bpResolutions.insert(bpResolutions.end(), res);
-  }
-
-  int nFragResolutions;
-  header.read((char*)&nFragResolutions, sizeof(int));
-  for (int i=0; i<nFragResolutions; i++) {
-    int res;
-    header.read((char*)&res, sizeof(int));
-    fragResolutions.insert(fragResolutions.end(), res);
-  }
-}
-
-void Straw::getChrInfo(string chr1, string chr2, int &c1pos1, int &c1pos2, int &c2pos1, int &c2pos2, int &chr1ind, int &chr2ind){
-// Retrieve index for the given chromosomes. If position limits weren't specified,
-// then set them to the full extent of each chromosome.
-  bool found1 = false, found2 = false;
-  int i;
-  for(i=0; i<chrNames.size(); i++) {
-    if (chrNames[i] == chr1) {
-      found1=true;
-      chr1ind = i;
-      if (c1pos1 == -100) {
-	c1pos1 = 0;
-	c1pos2 = chrSizes[i];
-      }
-    }
-    if (chrNames[i] == chr2) {
-      found2=true;
-      chr2ind = i;
-      if (c2pos1 == -100) {
-	c2pos1 = 0;
-	c2pos2 = chrSizes[i];
-      }
-    }
-  }
-  if (!found1 || !found2) {
-    if (chr1==chr2)
-        throw strawException("Chromosome " + chr1 + " wasn't found in the file. Check that the chromosome name matches the genome.");
-    else
-        throw strawException("Chromosomes " + chr1 + " and/or " + chr2 + " weren't found in the file. Check that the chromosome names match the genome.");
-  }
-}
-
-// reads the footer from the master pointer location. takes in the chromosomes,
-// norm, unit (BP or FRAG) and resolution or binsize, and sets the file 
-// position of the matrix and the normalization vectors for those chromosomes 
-// at the given normalization and resolution
-void Straw::readFooter(istream& fin, long master, int c1, int c2, string norm, string unit, int resolution, long &myFilePos, indexEntry &c1NormEntry, indexEntry &c2NormEntry) {
-  stringstream ss;
-  ss << c1 << "_" << c2;
-  string chrKey = ss.str();
-
-  normVector vec1(c1, norm, unit, resolution);
-  normVector vec2(c2, norm, unit, resolution);
-
-  if (haveReadFooter) {
-    map<string,long>::iterator it = chrchrMap.find(chrKey);
-    if (it == chrchrMap.end())
-      throw strawException("File doesn't have the given chr_chr map");
-    myFilePos = it->second;
-
-    if (norm=="NONE") return; // no need to read norm vector index
-
-    map<normVector, pair<long,int>>::iterator it1 = normVectors.find(vec1);
-    map<normVector, pair<long,int>>::iterator it2 = normVectors.find(vec2);
-    if ((it1 == normVectors.end()) || (it2 == normVectors.end())) {
-      throw strawException("File did not contain " + norm + " normalization vectors for one or both chromosomes at "
-            + to_string(resolution) + " " + unit);
-    }
-    c1NormEntry.position = it1->second.first;
-    c1NormEntry.size = it1->second.second;
-    c2NormEntry.position = it2->second.first;
-    c2NormEntry.size = it2->second.second;
-    return;
-  }
-
-  int nBytes;
-  fin.read((char*)&nBytes, sizeof(int));
-
-  bool found = false;
-  int nEntries;
-  fin.read((char*)&nEntries, sizeof(int));
-  for (int i=0; i<nEntries; i++) {
-    string str;
-    getline(fin, str, '\0');
-    long fpos;
-    fin.read((char*)&fpos, sizeof(long));
-    int sizeinbytes;
-    fin.read((char*)&sizeinbytes, sizeof(int));
-    chrchrMap[str] = fpos;
-    if (str == chrKey) {
-      myFilePos = fpos;
-      found=true;
-    }
-  }
- 
-  // read in and ignore expected value maps; don't store; reading these to 
-  // get to norm vector index
-  int nExpectedValues;
-  fin.read((char*)&nExpectedValues, sizeof(int));
-  for (int i=0; i<nExpectedValues; i++) {
-    string str;
-    getline(fin, str, '\0'); //unit
-    int binSize;
-    fin.read((char*)&binSize, sizeof(int));
-
-    int nValues;
-    fin.read((char*)&nValues, sizeof(int));
-    for (int j=0; j<nValues; j++) {
-      double v;
-      fin.read((char*)&v, sizeof(double));
-    }
-
-    int nNormalizationFactors;
-    fin.read((char*)&nNormalizationFactors, sizeof(int));
-    for (int j=0; j<nNormalizationFactors; j++) {
-      int chrIdx;
-      fin.read((char*)&chrIdx, sizeof(int));
-      double v;
-      fin.read((char*)&v, sizeof(double));
-    }
-  }
-  fin.read((char*)&nExpectedValues, sizeof(int));
-  for (int i=0; i<nExpectedValues; i++) {
-    string str;
-    getline(fin, str, '\0'); //typeString
-    getline(fin, str, '\0'); //unit
-    int binSize;
-    fin.read((char*)&binSize, sizeof(int));
-
-    int nValues;
-    fin.read((char*)&nValues, sizeof(int));
-    for (int j=0; j<nValues; j++) {
-      double v;
-      fin.read((char*)&v, sizeof(double));
-    }
-    int nNormalizationFactors;
-    fin.read((char*)&nNormalizationFactors, sizeof(int));
-    for (int j=0; j<nNormalizationFactors; j++) {
-      int chrIdx;
-      fin.read((char*)&chrIdx, sizeof(int));
-      double v;
-      fin.read((char*)&v, sizeof(double));
-    }
-  }
-  // Index of normalization vectors
-  fin.read((char*)&nEntries, sizeof(int));
-  bool found1 = false;
-  bool found2 = false;
-  for (int i = 0; i < nEntries; i++) {
-    string normtype;
-    getline(fin, normtype, '\0'); //normalization type
-    int chrIdx;
-    fin.read((char*)&chrIdx, sizeof(int));
-    string unit1;
-    getline(fin, unit1, '\0'); //unit
-    int resolution1;
-    fin.read((char*)&resolution1, sizeof(int));
-    long filePosition;
-    fin.read((char*)&filePosition, sizeof(long));
-    int sizeInBytes;
-    fin.read((char*)&sizeInBytes, sizeof(int));
-    normVector newVec(chrIdx, normtype, unit1, resolution1);
-    normVectors[newVec] = pair<long,int>(filePosition, sizeInBytes);
-    if (chrIdx == c1 && normtype == norm && unit1 == unit && resolution1 == resolution) {
-      c1NormEntry.position=filePosition;
-      c1NormEntry.size=sizeInBytes;
-      found1 = true;
-    }
-    if (chrIdx == c2 && normtype == norm && unit1 == unit && resolution1 == resolution) {
-      c2NormEntry.position=filePosition;
-      c2NormEntry.size=sizeInBytes;
-      found2 = true;
-    }
-  }
-  haveReadFooter = true;
-
-  if (!found) {
-    throw strawException("File doesn't have the given chr_chr map");
-  }
-
-  if (norm=="NONE") return; // no norm vectors for NONE
-
-  if (!found1 || !found2) {
-    throw strawException("File did not contain " + norm + " normalization vectors for one or both chromosomes at "
-            + to_string(resolution) + " " + unit);
-  }
-}
-
-// reads the raw binned contact matrix at specified resolution, setting the block bin count and block column count 
-bool Straw::readMatrixZoomData(istream& fin, string myunit, int mybinsize, int &myBlockBinCount, int &myBlockColumnCount) {
-  string unit;
-  getline(fin, unit, '\0' ); // unit
-  int tmp;
-  fin.read((char*)&tmp, sizeof(int)); // Old "zoom" index -- not used
-  float tmp2;
-  fin.read((char*)&tmp2, sizeof(float)); // sumCounts
-  fin.read((char*)&tmp2, sizeof(float)); // occupiedCellCount
-  fin.read((char*)&tmp2, sizeof(float)); // stdDev
-  fin.read((char*)&tmp2, sizeof(float)); // percent95
-  int binSize;
-  fin.read((char*)&binSize, sizeof(int));
-  int blockBinCount;
-  fin.read((char*)&blockBinCount, sizeof(int));
-  int blockColumnCount;
-  fin.read((char*)&blockColumnCount, sizeof(int));
-  
-  bool storeBlockData = false;
-  if (myunit==unit && mybinsize==binSize) {
-    myBlockBinCount = blockBinCount;
-    myBlockColumnCount = blockColumnCount;
-    storeBlockData = true;
-  }
-  
-  int nBlocks;
-  fin.read((char*)&nBlocks, sizeof(int));
-
-  for (int b = 0; b < nBlocks; b++) {
-    int blockNumber;
-    fin.read((char*)&blockNumber, sizeof(int));
-    long filePosition;
-    fin.read((char*)&filePosition, sizeof(long));
-    int blockSizeInBytes;
-    fin.read((char*)&blockSizeInBytes, sizeof(int));
-    indexEntry entry;
-    entry.size = blockSizeInBytes;
-    entry.position = filePosition;
-    if (storeBlockData) blockMap[blockNumber] = entry;
-  }
-  return storeBlockData;
-}
-
-// reads the raw binned contact matrix at specified resolution, setting the block bin count and block column count 
-bool Straw::readMatrixZoomDataHttp(char *url, long &myFilePosition, string myunit, int mybinsize, int &myBlockBinCount, int &myBlockColumnCount) {
-  char* buffer;
-  int header_size = 5*sizeof(int)+4*sizeof(float);
-  char* first;
-  first = getHttpData(url, myFilePosition, 1);
-  if (first[0]=='B') {
-    header_size+=3;
-  }
-  else if (first[0]=='F') {
-    header_size+=5;
-  }
-  else {
-    throw strawException("Unit not understood - must be one of <BP/FRAG>");
-  }
-  buffer = getHttpData(url, myFilePosition, header_size);
-  membuf sbuf(buffer, buffer + header_size);
-  istream fin(&sbuf);
-
-  string unit;
-  getline(fin, unit, '\0' ); // unit
-  int tmp;
-  fin.read((char*)&tmp, sizeof(int)); // Old "zoom" index -- not used
-  float tmp2;
-  fin.read((char*)&tmp2, sizeof(float)); // sumCounts
-  fin.read((char*)&tmp2, sizeof(float)); // occupiedCellCount
-  fin.read((char*)&tmp2, sizeof(float)); // stdDev
-  fin.read((char*)&tmp2, sizeof(float)); // percent95
-  int binSize;
-  fin.read((char*)&binSize, sizeof(int));
-  int blockBinCount;
-  fin.read((char*)&blockBinCount, sizeof(int));
-  int blockColumnCount;
-  fin.read((char*)&blockColumnCount, sizeof(int));
-
-  bool storeBlockData = false;
-  if (myunit==unit && mybinsize==binSize) {
-    myBlockBinCount = blockBinCount;
-    myBlockColumnCount = blockColumnCount;
-    storeBlockData = true;
-  }
-  
-  int nBlocks;
-  fin.read((char*)&nBlocks, sizeof(int));
-
-  if (storeBlockData) {
-    buffer = getHttpData(url, myFilePosition+header_size, nBlocks*(sizeof(int)+sizeof(long)+sizeof(int)));
-    membuf sbuf2(buffer, buffer + nBlocks*(sizeof(int)+sizeof(long)+sizeof(int)));
-    istream fin2(&sbuf2);
-    for (int b = 0; b < nBlocks; b++) {
-      int blockNumber;
-      fin2.read((char*)&blockNumber, sizeof(int));
-      long filePosition;
-      fin2.read((char*)&filePosition, sizeof(long));
-      int blockSizeInBytes;
-      fin2.read((char*)&blockSizeInBytes, sizeof(int));
-      indexEntry entry;
-      entry.size = blockSizeInBytes;
-      entry.position = filePosition;
-      blockMap[blockNumber] = entry;
-    }
-  }
-  else {
-    myFilePosition = myFilePosition+header_size+(nBlocks*(sizeof(int)+sizeof(long)+sizeof(int)));
-  }
-  delete buffer;
-  return storeBlockData;
-}
-
-// goes to the specified file pointer in http and finds the raw contact matrix at specified resolution, calling readMatrixZoomData.
-// sets blockbincount and blockcolumncount
-void Straw::readMatrixHttp(char *url, long myFilePosition, string unit, int resolution, int &myBlockBinCount, int &myBlockColumnCount) {
-  char * buffer;
-  int size = sizeof(int)*3;
-  buffer = getHttpData(url, myFilePosition, size);
-  membuf sbuf(buffer, buffer + size);
-  istream bufin(&sbuf);
-
-  int c1,c2;
-  bufin.read((char*)&c1, sizeof(int)); //chr1
-  bufin.read((char*)&c2, sizeof(int)); //chr2
-  int nRes;
-  bufin.read((char*)&nRes, sizeof(int));
-  int i=0;
-  bool found=false;
-  myFilePosition=myFilePosition+size;
-  delete buffer;
-
-  while (i<nRes && !found) {
-    // myFilePosition gets updated within call
-    found = this->readMatrixZoomDataHttp(url, myFilePosition, unit, resolution, myBlockBinCount, myBlockColumnCount);
-    i++;
-  }
-  if (!found) {
-    throw strawException("Error finding block data");
-  }
-}
-
-// goes to the specified file pointer and finds the raw contact matrix at specified resolution, calling readMatrixZoomData.
-// sets blockbincount and blockcolumncount
-void Straw::readMatrix(istream& fin, long myFilePosition, string unit, int resolution, int &myBlockBinCount, int &myBlockColumnCount) {
-  fin.seekg(myFilePosition, ios::beg);
-  int c1,c2;
-  fin.read((char*)&c1, sizeof(int)); //chr1
-  fin.read((char*)&c2, sizeof(int)); //chr2
-  int nRes;
-  fin.read((char*)&nRes, sizeof(int));
-  int i=0;
-  bool found=false;
-  while (i<nRes && !found) {
-    found = this->readMatrixZoomData(fin, unit, resolution, myBlockBinCount, myBlockColumnCount);
-    i++;
-  }
-  if (!found) {
-    throw strawException("Error finding block data\nWas looking for " + unit + " at resolution " + to_string(resolution));
-  }
-}
-
-// gets the blocks that need to be read for this slice of the data.  needs blockbincount, blockcolumncount, and whether
-// or not this is intrachromosomal.
-set<int> Straw::getBlockNumbersForRegionFromBinPosition(int* regionIndices, int blockBinCount, int blockColumnCount, bool intra) {
-   int col1 = regionIndices[0] / blockBinCount;
-   int col2 = (regionIndices[1] + 1) / blockBinCount;
-   int row1 = regionIndices[2] / blockBinCount;
-   int row2 = (regionIndices[3] + 1) / blockBinCount;
-   
-   set<int> blocksSet;
-   // first check the upper triangular matrix
-   for (int r = row1; r <= row2; r++) {
-     for (int c = col1; c <= col2; c++) {
-       int blockNumber = r * blockColumnCount + c;
-       blocksSet.insert(blockNumber);
-     }
-   }
-   // check region part that overlaps with lower left triangle
-   // but only if intrachromosomal
-   if (intra) {
-     for (int r = col1; r <= col2; r++) {
-       for (int c = row1; c <= row2; c++) {
-	 int blockNumber = r * blockColumnCount + c;
-	 blocksSet.insert(blockNumber);
-       }
-     }
-   }
-
-   return blocksSet;
-}
-
-// this is the meat of reading the data.  takes in the block number and returns the set of contact records corresponding to
-// that block.  the block data is compressed and must be decompressed using the zlib library functions
-vector<contactRecord> Straw::readBlock(istream& fin, char *url, bool isHttp, int blockNumber) {
-  indexEntry idx = blockMap[blockNumber];
-  if (idx.size == 0) {
-    vector<contactRecord> v;
-    return v;
-  }
-  char* compressedBytes = new char[idx.size];
-  char* uncompressedBytes = new char[idx.size*10]; //biggest seen so far is 3
-
-  if (isHttp) {
-    compressedBytes = getHttpData(url, idx.position, idx.size);    
-  }
-  else {
-    fin.seekg(idx.position, ios::beg);
-    fin.read(compressedBytes, idx.size);
-  }
-  // Decompress the block
-  // zlib struct
-  z_stream infstream;
-  infstream.zalloc = Z_NULL;
-  infstream.zfree = Z_NULL;
-  infstream.opaque = Z_NULL;
-  infstream.avail_in = (uInt)(idx.size); // size of input
-  infstream.next_in = (Bytef *)compressedBytes; // input char array
-  infstream.avail_out = (uInt)idx.size*10; // size of output
-  infstream.next_out = (Bytef *)uncompressedBytes; // output char array
-  // the actual decompression work.
-  inflateInit(&infstream);
-  inflate(&infstream, Z_NO_FLUSH);
-  inflateEnd(&infstream);
-  int uncompressedSize=infstream.total_out;
-
-  // create stream from buffer for ease of use
-  membuf sbuf(uncompressedBytes, uncompressedBytes + uncompressedSize);
-  istream bufferin(&sbuf);
-  int nRecords;
-  bufferin.read((char*)&nRecords, sizeof(int));
-  vector<contactRecord> v(nRecords);
-  // different versions have different specific formats
-  if (this->version < 7) {
-    for (int i = 0; i < nRecords; i++) {
-      int binX, binY;
-      bufferin.read((char*)&binX, sizeof(int));
-      bufferin.read((char*)&binY, sizeof(int));
-      float counts;
-      bufferin.read((char*)&counts, sizeof(float));
-      contactRecord record;
-      record.binX = binX;
-      record.binY = binY;
-      record.counts = counts;
-      v[i] = record;
-    }
-  } 
-  else {
-    int binXOffset, binYOffset;
-    bufferin.read((char*)&binXOffset, sizeof(int));
-    bufferin.read((char*)&binYOffset, sizeof(int));
-    char useShort;
-    bufferin.read((char*)&useShort, sizeof(char));
-    char type;
-    bufferin.read((char*)&type, sizeof(char));
-    int index=0;
-    if (type == 1) {
-      // List-of-rows representation
-      short rowCount;
-      bufferin.read((char*)&rowCount, sizeof(short));
-      for (int i = 0; i < rowCount; i++) {
-	short y;
-	bufferin.read((char*)&y, sizeof(short));
-	int binY = y + binYOffset;
-	short colCount;
-	bufferin.read((char*)&colCount, sizeof(short));
-	for (int j = 0; j < colCount; j++) {
-	  short x;
-	  bufferin.read((char*)&x, sizeof(short));
-	  int binX = binXOffset + x;
-	  float counts;
-	  if (useShort == 0) { // yes this is opposite of usual
-	    short c;
-	    bufferin.read((char*)&c, sizeof(short));
-	    counts = c;
-	  } 
-	  else {
-	    bufferin.read((char*)&counts, sizeof(float));
-	  }
-	  contactRecord record;
-	  record.binX = binX;
-	  record.binY = binY;
-	  record.counts = counts;
-	  v[index]=record;
-	  index++;
-	}
-      }
-    }
-    else if (type == 2) { // have yet to find test file where this is true, possibly entirely deprecated
-      int nPts;
-      bufferin.read((char*)&nPts, sizeof(int));
-      short w;
-      bufferin.read((char*)&w, sizeof(short));
-
-      for (int i = 0; i < nPts; i++) {
-	//int idx = (p.y - binOffset2) * w + (p.x - binOffset1);
-	int row = i / w;
-	int col = i - row * w;
-	int bin1 = binXOffset + col;
-	int bin2 = binYOffset + row;
-
-	float counts;
-	if (useShort == 0) { // yes this is opposite of the usual
-	  short c;
-	  bufferin.read((char*)&c, sizeof(short));
-	  if (c != -32768) {
-	    contactRecord record;
-	    record.binX = bin1;
-	    record.binY = bin2;
-	    record.counts = c;
-	    v[index]=record;
-	    index++;
-	  }
-	} 
-	else {
-	  bufferin.read((char*)&counts, sizeof(float));
-	  if (counts != 0x7fc00000) { // not sure this works
-	    //	  if (!Float.isNaN(counts)) {
-	    contactRecord record;
-	    record.binX = bin1;
-	    record.binY = bin2;
-	    record.counts = counts;
-	    v[index]=record;
-	    index++;
-	  }
-	}
-      }
-    }
-  }
-  delete[] compressedBytes;
-  delete[] uncompressedBytes; // don't forget to delete your heap arrays in C++!
-  return v;
-}
-
-// reads the normalization vector from the file at the specified location
-vector<double> Straw::readNormalizationVector(istream& bufferin) {
-  int nValues;
-  bufferin.read((char*)&nValues, sizeof(int));
-  vector<double> values(nValues);
-  //  bool allNaN = true;
-
-  for (int i = 0; i < nValues; i++) {
-    double d;
-    bufferin.read((char*)&d, sizeof(double));
-    values[i] = d;
-    /* if (!Double.isNaN(values[i])) {
-      allNaN = false;
-      }*/
-  }
-  //  if (allNaN) return null;
-  return values;
-}
-
-int Straw::open(string fname) {
-  fileName = fname;
-  if (this->readMagicString()) {
-      return 1;
-  }
-  else {
-    fileName = "";
-    throw strawException("Hi-C magic string is missing, does not appear to be a hic file");
-  }
-  return 0;
-}
-
-int Straw::fileIsOpen() {
-    return fileName != "";
-}
-
-void Straw::close() {
-    master = 0;
-    fileName = "";
-    genome = "";
-    haveReadFooter = false;
-    chrNames.clear();
-    chrSizes.clear();
-    attributes.clear();
-    blockMap.clear();
-}
-
-void Straw::straw(string norm, int binsize, string chr1loc, string chr2loc, string unit, vector<int> &xActual, vector<int> &yActual, vector<float> &counts)
-{
-  int earlyexit=1;
-
-  if (!fileIsOpen()) {
-    throw strawException("straw error - no open .hic file");
-  }
-
-  if (!(norm=="NONE"||norm=="VC"||norm=="VC_SQRT"||norm=="KR")) {
-    throw strawException("Invalid Norm value - must be one of <NONE/VC/VC_SQRT/KR>");
-  }
-  if (!(unit=="BP"||unit=="FRAG")) {
-    throw strawException("Invalid Unit value - must be one of <BP/FRAG>");
-  }
-
-  // parse chromosome positions
-  stringstream ss(chr1loc);
-  string chr1, chr2, x, y;
-  int c1pos1=-100, c1pos2=-100, c2pos1=-100, c2pos2=-100;
-  getline(ss, chr1, ':');
-  if (getline(ss, x, ':') && getline(ss, y, ':')) {
-    c1pos1 = stoi(x);
-    c1pos2 = stoi(y);
-  }
-  stringstream ss1(chr2loc);
-  getline(ss1, chr2, ':');
-  if (getline(ss1, x, ':') && getline(ss1, y, ':')) {
-    c2pos1 = stoi(x);
-    c2pos2 = stoi(y);
-  }  
-  int chr1ind, chr2ind;
-
-  this->loadHeader();
-  this->getChrInfo(chr1, chr2, c1pos1, c1pos2, c2pos1, c2pos2, chr1ind, chr2ind);
-
-  // from header have size of chromosomes, set region to read
-  int c1=min(chr1ind,chr2ind);
-  int c2=max(chr1ind,chr2ind);
-  int origRegionIndices[4]; // as given by user
-  int regionIndices[4]; // used to find the blocks we need to access
-  // reverse order if necessary
-  if (chr1ind > chr2ind) {
-    origRegionIndices[0] = c2pos1;
-    origRegionIndices[1] = c2pos2;
-    origRegionIndices[2] = c1pos1;
-    origRegionIndices[3] = c1pos2;
-    regionIndices[0] = c2pos1 / binsize;
-    regionIndices[1] = c2pos2 / binsize;
-    regionIndices[2] = c1pos1 / binsize;
-    regionIndices[3] = c1pos2 / binsize;
-  }
-  else {
-    origRegionIndices[0] = c1pos1;
-    origRegionIndices[1] = c1pos2;
-    origRegionIndices[2] = c2pos1;
-    origRegionIndices[3] = c2pos2;
-    regionIndices[0] = c1pos1 / binsize;
-    regionIndices[1] = c1pos2 / binsize;
-    regionIndices[2] = c2pos1 / binsize;
-    regionIndices[3] = c2pos2 / binsize;
-  }
-
-  indexEntry c1NormEntry, c2NormEntry;
-  long myFilePos;
-
-
-  ifstream fin;
-  char *url;
-  bool isHttp = false;
-  string prefix="http";
-  if (std::strncmp(fileName.c_str(), prefix.c_str(), prefix.size()) == 0) {
-    isHttp = true;
-  }
-  if (isHttp) {
-    unsigned long bytes_to_read = total_bytes - master;
-    url = (char*) fileName.c_str();
-    char* buffer2;
-    buffer2 = getHttpData(url, master, bytes_to_read);    
-    membuf sbuf2(buffer2, buffer2 + bytes_to_read);
-    istream bufin2(&sbuf2);
-    this->readFooter(bufin2, master, c1, c2, norm, unit, binsize, myFilePos, c1NormEntry, c2NormEntry); 
-    delete buffer2;
-  }
-  else { 
-    fin.open(fileName, fstream::in);
-    fin.seekg(master, ios::beg);
-    this->readFooter(fin, master, c1, c2, norm, unit, binsize, myFilePos, c1NormEntry, c2NormEntry); 
-  }
-  // readFooter will assign the above variables
-
-  vector<double> c1Norm;
-  vector<double> c2Norm;
-
-  if (norm != "NONE") {
-    char* buffer3;
-    if (isHttp) {
-      buffer3 = getHttpData(url, c1NormEntry.position, c1NormEntry.size);
-    }
-    else {
-      buffer3 = new char[c1NormEntry.size];
-      fin.seekg(c1NormEntry.position, ios::beg);
-      fin.read(buffer3, c1NormEntry.size);
-    }
-    membuf sbuf3(buffer3, buffer3 + c1NormEntry.size);
-    istream bufferin(&sbuf3);
-    c1Norm = this->readNormalizationVector(bufferin);
-
-    char* buffer4;
-    if (isHttp) {
-      buffer4 = getHttpData(url, c2NormEntry.position, c2NormEntry.size);
-    }
-    else {
-      buffer4 = new char[c2NormEntry.size];
-      fin.seekg(c2NormEntry.position, ios::beg);
-      fin.read(buffer4, c2NormEntry.size);
-    }
-    membuf sbuf4(buffer4, buffer4 + c2NormEntry.size);
-    istream bufferin2(&sbuf4);
-    c2Norm = this->readNormalizationVector(bufferin2);
-    delete buffer3;
-    delete buffer4;
-  }
-
-  int blockBinCount, blockColumnCount;
-  if (isHttp) {
-    // readMatrix will assign blockBinCount and blockColumnCount
-    this->readMatrixHttp(url, myFilePos, unit, binsize, blockBinCount, blockColumnCount); 
-  }
-  else {
-    // readMatrix will assign blockBinCount and blockColumnCount
-    this->readMatrix(fin, myFilePos, unit, binsize, blockBinCount, blockColumnCount); 
-  }
-  set<int> blockNumbers = this->getBlockNumbersForRegionFromBinPosition(regionIndices, blockBinCount, blockColumnCount, c1==c2); 
-
-  // getBlockIndices
-  vector<contactRecord> records;
-  for (set<int>::iterator it=blockNumbers.begin(); it!=blockNumbers.end(); ++it) {
-    // get contacts in this block
-    records = this->readBlock(fin, url, isHttp, *it);
-    for (vector<contactRecord>::iterator it2=records.begin(); it2!=records.end(); ++it2) {
-      contactRecord rec = *it2;
-      
-      int x = rec.binX * binsize;
-      int y = rec.binY * binsize;
-      float c = rec.counts;
-      if (norm != "NONE") {
-	c = c / (c1Norm[rec.binX] * c2Norm[rec.binY]);
-      }
-
-      if ((x >= origRegionIndices[0] && x <= origRegionIndices[1] &&
-	   y >= origRegionIndices[2] && y <= origRegionIndices[3]) ||
-	  // or check regions that overlap with lower left
-	  ((c1==c2) && y >= origRegionIndices[0] && y <= origRegionIndices[1] && x >= origRegionIndices[2] && x <= origRegionIndices[3])) {
-	xActual.push_back(x);
-	yActual.push_back(y);
-	counts.push_back(c);
-	//printf("%d\t%d\t%.14g\n", x, y, c);
-      }
-    }
-  }
-}