ae1ef6aa226d4436d4dc9e2b9b28120d802ab8a5 jcasper Thu Jul 25 11:57:53 2024 -0700 Dropping new straw library files into place, refs #33225 diff --git src/hg/lib/straw/new_straw.cpp src/hg/lib/straw/new_straw.cpp deleted file mode 100644 index c7428df..0000000 --- src/hg/lib/straw/new_straw.cpp +++ /dev/null @@ -1,1755 +0,0 @@ -/* - The MIT License (MIT) - - Copyright (c) 2017-2021 Aiden Lab, Rice University, Baylor College of Medicine - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ -#include <cstring> -#include <iostream> -#include <fstream> -#include <sstream> -#include <map> -#include <cmath> -#include <set> -#include <utility> -#include <vector> -#include <streambuf> -//#include <curl/curl.h> -#include <iterator> -#include <algorithm> -#include "zlib.h" -//#include "straw.h" -#include "new_straw.h" -extern "C" { -#include "../../inc/fakeCurl.h" -} - -using namespace std; - - -// Added UCSC: Dirty hack, but it's hard to quickly extract a list of available normalization options. -// This gets populated by readFooter and readFooterURL, so you have to make a quick dummy data request -// with straw() to set it up (e.g. to the first listed chromosome for a 1 bp window). -set<string> globalNormOptions; - -/* - Straw: fast C++ implementation of dump. Not as fully featured as the - Java version. Reads the .hic file, finds the appropriate matrix and slice - of data, and outputs as text in sparse upper triangular format. - - Currently only supporting matrices. - - Usage: straw [observed/oe/expected] <NONE/VC/VC_SQRT/KR> <hicFile(s)> <chr1>[:x1:x2] <chr2>[:y1:y2] <BP/FRAG> <binsize> - */ - -// callback for libcurl. data written to this buffer -static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) { - size_t realsize = size * nmemb; - struct MemoryStruct *mem; - mem = (struct MemoryStruct *) userp; - - mem->memory = static_cast<char *>(realloc(mem->memory, mem->size + realsize + 1)); - if (mem->memory == nullptr) { - /* out of memory! */ - printf("not enough memory (realloc returned NULL)\n"); - return 0; - } - - std::memcpy(&(mem->memory[mem->size]), contents, realsize); - mem->size += realsize; - mem->memory[mem->size] = 0; - - return realsize; -} - -// get a buffer that can be used as an input stream from the URL -char *getData(CURL *curl, int64_t position, int64_t chunksize) { - std::ostringstream oss; - struct MemoryStruct chunk{}; - chunk.memory = static_cast<char *>(malloc(1)); - chunk.size = 0; /* no data at this point */ - oss << position << "-" << position + chunksize; - curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *) &chunk); - curl_easy_setopt(curl, CURLOPT_RANGE, oss.str().c_str()); - // Setting FAILONERROR because otherwise this library has a bad habit of assuming data exists - // and parsing what it gets back, regardless of whether it has run off the end of the file. - curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1); - CURLcode res = curl_easy_perform(curl); - if (res != CURLE_OK) { - throw strawException("curl_easy_perform() failed: " + string(curl_easy_strerror(res))); - //fprintf(stderr, "curl_easy_perform() failed: %s\n", - // curl_easy_strerror(res)); - } - - return chunk.memory; -} - -bool readMagicString(istream &fin) { - string str; - getline(fin, str, '\0'); - return str[0] == 'H' && str[1] == 'I' && str[2] == 'C'; -} - -char readCharFromFile(istream &fin) { - char tempChar; - fin.read(&tempChar, sizeof(char)); - return tempChar; -} - -int16_t readInt16FromFile(istream &fin) { - int16_t tempInt16; - fin.read((char *) &tempInt16, sizeof(int16_t)); - return tempInt16; -} - -int32_t readInt32FromFile(istream &fin) { - int32_t tempInt32; - fin.read((char *) &tempInt32, sizeof(int32_t)); - return tempInt32; -} - -int64_t readInt64FromFile(istream &fin) { - int64_t tempInt64; - fin.read((char *) &tempInt64, sizeof(int64_t)); - return tempInt64; -} - -float readFloatFromFile(istream &fin) { - float tempFloat; - fin.read((char *) &tempFloat, sizeof(float)); - return tempFloat; -} - -double readDoubleFromFile(istream &fin) { - double tempDouble; - fin.read((char *) &tempDouble, sizeof(double)); - return tempDouble; -} - -void convertGenomeToBinPos(const int64_t origRegionIndices[4], int64_t regionIndices[4], int32_t resolution) { - for (uint16_t q = 0; q < 4; q++) { - // used to find the blocks we need to access - regionIndices[q] = origRegionIndices[q] / resolution; - } -} - -static CURL *initCURL(const char *url) { - CURL *curl = curl_easy_init(); - if (curl) { - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback); - curl_easy_setopt(curl, CURLOPT_URL, url); - curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); - curl_easy_setopt(curl, CURLOPT_USERAGENT, "straw"); - curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1); - } else { - throw strawException("Unable to initialize curl"); - //cerr << "Unable to initialize curl " << endl; - //exit(2); - } - return curl; -} - -class HiCFileStream { -public: - string prefix = "http"; // HTTP code - ifstream fin; - CURL *curl; - bool isHttp = false; - - explicit HiCFileStream(const string &fileName) { - if (std::strncmp(fileName.c_str(), prefix.c_str(), prefix.size()) == 0) { - isHttp = true; - curl = initCURL(fileName.c_str()); - if (!curl) { - throw strawException("URL " + fileName + " cannot be opened for reading"); - //cerr << "URL " << fileName << " cannot be opened for reading" << endl; - //exit(3); - } - } else { - fin.open(fileName, fstream::in | fstream::binary); - if (!fin) { - throw strawException("File " + fileName + " cannot be opened for reading"); - //cerr << "File " << fileName << " cannot be opened for reading" << endl; - //exit(4); - } - } - } - - void close() { - if (isHttp) { - curl_easy_cleanup(curl); - } else { - fin.close(); - } - } - - char *readCompressedBytes(indexEntry idx) { - if (isHttp) { - return getData(curl, idx.position, idx.size); - } else { - char *buffer = new char[idx.size]; - fin.seekg(idx.position, ios::beg); - fin.read(buffer, idx.size); - return buffer; - } - } -}; - -char *readCompressedBytesFromFile(const string &fileName, indexEntry idx) { - HiCFileStream *stream = new HiCFileStream(fileName); - char *compressedBytes = stream->readCompressedBytes(idx); - stream->close(); - delete stream; - return compressedBytes; -} - -// reads the header, storing the positions of the normalization vectors and returning the masterIndexPosition pointer -map<string, chromosome> readHeader(istream &fin, int64_t &masterIndexPosition, string &genomeID, - int32_t &numChromosomes, int32_t &version, int64_t &nviPosition, - int64_t &nviLength, vector<string> &attributes) { - map<string, chromosome> chromosomeMap; - if (!readMagicString(fin)) { - throw strawException("Hi-C magic string is missing, does not appear to be a hic file"); - // Not sure why this function was trying to continue after this failure (the next step is - // reading resolutions from the file, which clearly isn't going to work). - //cerr << "Hi-C magic string is missing, does not appear to be a hic file" << endl; - masterIndexPosition = -1; - return chromosomeMap; - } - - version = readInt32FromFile(fin); - if (version < 6) { - throw strawException("Version " + to_string(version) + " no longer supported"); - // Not sure why this function was trying to continue after this failure (the next step is - // reading resolutions from the file, which clearly isn't going to work). - //cerr << "Version " << version << " no longer supported" << endl; - masterIndexPosition = -1; - return chromosomeMap; - } - masterIndexPosition = readInt64FromFile(fin); - getline(fin, genomeID, '\0'); - - if (version > 8) { - nviPosition = readInt64FromFile(fin); - nviLength = readInt64FromFile(fin); - } - - int32_t nattributes = readInt32FromFile(fin); - - // reading attribute-value dictionary - for (int i = 0; i < nattributes; i++) { - string key, value; - getline(fin, key, '\0'); - getline(fin, value, '\0'); - attributes.insert(attributes.end(), key); - attributes.insert(attributes.end(), value); - } - - numChromosomes = readInt32FromFile(fin); - // chromosome map for finding matrixType - for (int i = 0; i < numChromosomes; i++) { - string name; - int64_t length; - getline(fin, name, '\0'); - if (version > 8) { - length = readInt64FromFile(fin); - } else { - length = (int64_t) readInt32FromFile(fin); - } - - chromosome chr; - chr.index = i; - chr.name = name; - chr.length = length; - chromosomeMap[name] = chr; - } - return chromosomeMap; -} - -vector<int32_t> readResolutionsFromHeader(istream &fin) { - int numBpResolutions = readInt32FromFile(fin); - vector<int32_t> resolutions; - for (int i = 0; i < numBpResolutions; i++) { - int32_t res = readInt32FromFile(fin); - resolutions.push_back(res); - } - return resolutions; -} - -//https://www.techiedelight.com/get-slice-sub-vector-from-vector-cpp/ -vector<double> sliceVector(vector<double> &v, int64_t m, int64_t n) { - vector<double> vec; - copy(v.begin() + m, v.begin() + n + 1, back_inserter(vec)); - return vec; -} - -// assume always an odd number for length of vector; -// eve if even, this calculation should be close enough -double getMedian(vector<double> &v) { - size_t n = v.size() / 2; - nth_element(v.begin(), v.begin() + n, v.end()); - return v[n]; -} - -void rollingMedian(vector<double> &initialValues, vector<double> &finalResult, int32_t window) { - // window is actually a ~wing-span - if (window < 1) { - finalResult = initialValues; - return; - } - - /* - finalResult.push_back(initialValues[0]); - int64_t length = initialValues.size(); - for (int64_t index = 1; index < length; index++) { - int64_t initialIndex; - int64_t finalIndex; - if (index < window) { - initialIndex = 0; - finalIndex = 2 * index; - } else { - initialIndex = index - window; - finalIndex = index + window; - } - - if (finalIndex > length - 1) { - finalIndex = length - 1; - } - - vector<double> subVector = sliceVector(initialValues, initialIndex, finalIndex); - finalResult.push_back(getMedian(subVector)); - } - */ - finalResult = initialValues; -} - -void populateVectorWithFloats(istream &fin, vector<double> &vector, int64_t nValues) { - for (int j = 0; j < nValues; j++) { - double v = readFloatFromFile(fin); - vector.push_back(v); - } -} - -void populateVectorWithDoubles(istream &fin, vector<double> &vector, int64_t nValues) { - for (int j = 0; j < nValues; j++) { - double v = readDoubleFromFile(fin); - vector.push_back(v); - } -} - -int64_t readThroughExpectedVectorURL(CURL *curl, int64_t currentPointer, int32_t version, vector<double> &expectedValues, int64_t nValues, - bool store, int32_t resolution) { - if (store) { - int32_t bufferSize = nValues * sizeof(double) + 10000; - if (version > 8) { - bufferSize = nValues * sizeof(float) + 10000; - } - char *buffer = getData(curl, currentPointer, bufferSize); - memstream fin(buffer, bufferSize); - - vector<double> initialExpectedValues; - if (version > 8) { - populateVectorWithFloats(fin, initialExpectedValues, nValues); - } else { - populateVectorWithDoubles(fin, initialExpectedValues, nValues); - } - int32_t window = 5000000 / resolution; - rollingMedian(initialExpectedValues, expectedValues, window); - delete buffer; - } - - if (version > 8) { - return nValues * sizeof(float); - } else { - return nValues * sizeof(double); - } -} - -void readThroughExpectedVector(int32_t version, istream &fin, vector<double> &expectedValues, int64_t nValues, - bool store, int32_t resolution) { - if (store) { - vector<double> initialExpectedValues; - if (version > 8) { - populateVectorWithFloats(fin, initialExpectedValues, nValues); - } else { - populateVectorWithDoubles(fin, initialExpectedValues, nValues); - } - int32_t window = 5000000 / resolution; - rollingMedian(initialExpectedValues, expectedValues, window); - } else if (nValues > 0) { - if (version > 8) { - fin.seekg(nValues * sizeof(float), ios_base::cur); - } else { - fin.seekg(nValues * sizeof(double), ios_base::cur); - } - } -} - -int64_t readThroughNormalizationFactorsURL(CURL *curl, int64_t currentPointer, int32_t version, bool store, vector<double> &expectedValues, - int32_t c1, int32_t nNormalizationFactors) { - - if (store) { - int32_t bufferSize = nNormalizationFactors * (sizeof(int32_t) + sizeof(double)) + 10000; - if (version > 8) { - bufferSize = nNormalizationFactors * (sizeof(int32_t) + sizeof(float )) + 10000; - } - char *buffer = getData(curl, currentPointer, bufferSize); - memstream fin(buffer, bufferSize); - - for (int j = 0; j < nNormalizationFactors; j++) { - int32_t chrIdx = readInt32FromFile(fin); - double v; - if (version > 8) { - v = readFloatFromFile(fin); - } else { - v = readDoubleFromFile(fin); - } - if (chrIdx == c1) { - for (double &expectedValue : expectedValues) { - expectedValue = expectedValue / v; - } - } - } - delete buffer; - } - - if (version > 8) { - return nNormalizationFactors * (sizeof(int32_t) + sizeof(float)); - } else { - return nNormalizationFactors * (sizeof(int32_t) + sizeof(double)); - } -} - -void readThroughNormalizationFactors(istream &fin, int32_t version, bool store, vector<double> &expectedValues, - int32_t c1) { - int32_t nNormalizationFactors = readInt32FromFile(fin); - if (store) { - for (int j = 0; j < nNormalizationFactors; j++) { - int32_t chrIdx = readInt32FromFile(fin); - double v; - if (version > 8) { - v = readFloatFromFile(fin); - } else { - v = readDoubleFromFile(fin); - } - if (chrIdx == c1) { - for (double &expectedValue : expectedValues) { - expectedValue = expectedValue / v; - } - } - } - } else if (nNormalizationFactors > 0) { - if (version > 8) { - fin.seekg(nNormalizationFactors * (sizeof(int32_t) + sizeof(float)), ios_base::cur); - } else { - fin.seekg(nNormalizationFactors * (sizeof(int32_t) + sizeof(double)), ios_base::cur); - } - } -} - -int64_t readStringFromURL(istream &fin, string &basicString) { - getline(fin, basicString, '\0'); - return (basicString.length() + 1); -} - -// reads the footer from the master pointer location. takes in the chromosomes, -// norm, unit (BP or FRAG) and resolution or binsize, and sets the file -// position of the matrix and the normalization vectors for those chromosomes -// at the given normalization and resolution -bool readFooterURL(CURL *curl, int64_t master, int32_t version, int32_t c1, int32_t c2, const string &matrixType, - const string &norm, const string &unit, int32_t resolution, int64_t &myFilePos, - indexEntry &c1NormEntry, indexEntry &c2NormEntry, vector<double> &expectedValues) { - - int64_t currentPointer = master; - - char *buffer = getData(curl, currentPointer, 100); - memstream fin1(buffer, 100); - - if (version > 8) { - int64_t nBytes = readInt64FromFile(fin1); - currentPointer += 8; - } else { - int32_t nBytes = readInt32FromFile(fin1); - currentPointer += 4; - } - - stringstream ss; - ss << c1 << "_" << c2; - string key = ss.str(); - - int32_t nEntries = readInt32FromFile(fin1); - currentPointer += 4; - delete buffer; - - int32_t bufferSize0 = nEntries * 50; - buffer = getData(curl, currentPointer, bufferSize0); - memstream fin2(buffer, bufferSize0); - - bool found = false; - for (int i = 0; i < nEntries; i++) { - string keyStr; - currentPointer += readStringFromURL(fin2, keyStr); - - int64_t fpos = readInt64FromFile(fin2); - int32_t sizeinbytes = readInt32FromFile(fin2); - currentPointer += 12; - if (keyStr == key) { - myFilePos = fpos; - found = true; - } - } - delete buffer; - if (!found) { - cerr << "Remote file doesn't have the given chr_chr map " << key << endl; - return false; - } - - if ((matrixType == "observed" && norm == "NONE") || - ((matrixType == "oe" || matrixType == "expected") && norm == "NONE" && c1 != c2)) - return true; // no need to read norm vector index - - // read in and ignore expected value maps; don't store; reading these to - // get to norm vector index - buffer = getData(curl, currentPointer, 100); - memstream fin3(buffer, 100); - - int32_t nExpectedValues = readInt32FromFile(fin3); - if (fin3.bad() || fin3.fail()) - { - throw strawException("Unable to read expected values from file - it might be truncated early"); - } - currentPointer += 4; - delete buffer; - for (int i = 0; i < nExpectedValues; i++) { - - buffer = getData(curl, currentPointer, 1000); - memstream fin4(buffer, 1000); - - string unit0; - currentPointer += readStringFromURL(fin4, unit0); - - int32_t binSize = readInt32FromFile(fin4); - currentPointer += 4; - - int64_t nValues; - if (version > 8) { - nValues = readInt64FromFile(fin4); - currentPointer += 8; - } else { - nValues = (int64_t) readInt32FromFile(fin4); - currentPointer += 4; - } - - delete buffer; - - bool store = c1 == c2 && (matrixType == "oe" || matrixType == "expected") && norm == "NONE" && unit0 == unit && - binSize == resolution; - - currentPointer += readThroughExpectedVectorURL(curl, currentPointer, version, expectedValues, nValues, store, resolution); - - buffer = getData(curl, currentPointer, 100); - memstream fin5(buffer, 100); - int32_t nNormalizationFactors = readInt32FromFile(fin5); - currentPointer += 4; - delete buffer; - - currentPointer += readThroughNormalizationFactorsURL(curl, currentPointer, version, store, expectedValues, c1, nNormalizationFactors); - } - - if (c1 == c2 && (matrixType == "oe" || matrixType == "expected") && norm == "NONE") { - if (expectedValues.empty()) { - cerr << "Remote file did not contain expected values vectors at " << resolution << " " << unit << endl; - return false; - } - return true; - } - - buffer = getData(curl, currentPointer, 100); - memstream fin6(buffer, 100); - nExpectedValues = readInt32FromFile(fin6); - if (fin6.bad() || fin6.fail()) - { - throw strawException("Unable to read normalized expected values from file - it might be truncated early"); - } - currentPointer += 4; - delete buffer; - for (int i = 0; i < nExpectedValues; i++) { - buffer = getData(curl, currentPointer, 1000); - memstream fin7(buffer, 1000); - - string nType, unit0; - currentPointer += readStringFromURL(fin7, nType); - currentPointer += readStringFromURL(fin7, unit0); - - int32_t binSize = readInt32FromFile(fin7); - currentPointer += 4; - - int64_t nValues; - if (version > 8) { - nValues = readInt64FromFile(fin7); - currentPointer += 8; - } else { - nValues = (int64_t) readInt32FromFile(fin7); - currentPointer += 4; - } - bool store = c1 == c2 && (matrixType == "oe" || matrixType == "expected") && nType == norm && unit0 == unit && - binSize == resolution; - - delete buffer; - - currentPointer += readThroughExpectedVectorURL(curl, currentPointer, version, expectedValues, nValues, store, resolution); - - buffer = getData(curl, currentPointer, 100); - memstream fin8(buffer, 100); - int32_t nNormalizationFactors = readInt32FromFile(fin8); - currentPointer += 4; - delete buffer; - - currentPointer += readThroughNormalizationFactorsURL(curl, currentPointer, version, store, expectedValues, c1, nNormalizationFactors); - } - - if (c1 == c2 && (matrixType == "oe" || matrixType == "expected") && norm != "NONE") { - if (expectedValues.empty()) { - cerr << "Remote file did not contain normalized expected values vectors at " << resolution << " " << unit << endl; - return false; - } - } - - buffer = getData(curl, currentPointer, 100); - memstream fin9(buffer, 100); - nEntries = readInt32FromFile(fin9); - if (fin9.bad() || fin9.fail()) - { - throw strawException("Unable to read normalization vectors from file - it might be truncated early"); - } - currentPointer += 4; - delete buffer; - - bool found1 = false; - bool found2 = false; - int32_t bufferSize2 = nEntries*60; - buffer = getData(curl, currentPointer, bufferSize2); - memstream fin10(buffer, bufferSize2); - - // added UCSC - globalNormOptions.clear(); - - for (int i = 0; i < nEntries; i++) { - string normtype; - currentPointer += readStringFromURL(fin10, normtype); - - // added UCSC - globalNormOptions.insert(normtype); - - int32_t chrIdx = readInt32FromFile(fin10); - currentPointer += 4; - string unit1; - currentPointer += readStringFromURL(fin10, unit1); - - int32_t resolution1 = readInt32FromFile(fin10); - int64_t filePosition = readInt64FromFile(fin10); - currentPointer += 12; - - int64_t sizeInBytes; - if (version > 8) { - sizeInBytes = readInt64FromFile(fin10); - currentPointer += 8; - } else { - sizeInBytes = (int64_t) readInt32FromFile(fin10); - currentPointer += 4; - } - - if (chrIdx == c1 && normtype == norm && unit1 == unit && resolution1 == resolution) { - c1NormEntry.position = filePosition; - c1NormEntry.size = sizeInBytes; - found1 = true; - } - if (chrIdx == c2 && normtype == norm && unit1 == unit && resolution1 == resolution) { - c2NormEntry.position = filePosition; - c2NormEntry.size = sizeInBytes; - found2 = true; - } - } - delete buffer; - if (!found1 || !found2) { - cerr << "Remote file did not contain " << norm << " normalization vectors for one or both chromosomes at " - << resolution << " " << unit << endl; - } - return true; -} - -bool readFooter(istream &fin, int64_t master, int32_t version, int32_t c1, int32_t c2, const string &matrixType, - const string &norm, const string &unit, int32_t resolution, int64_t &myFilePos, - indexEntry &c1NormEntry, indexEntry &c2NormEntry, vector<double> &expectedValues) { - - if (version > 8) { - int64_t nBytes = readInt64FromFile(fin); - } else { - int32_t nBytes = readInt32FromFile(fin); - } - - stringstream ss; - ss << c1 << "_" << c2; - string key = ss.str(); - - int32_t nEntries = readInt32FromFile(fin); - bool found = false; - for (int i = 0; i < nEntries; i++) { - string keyStr; - getline(fin, keyStr, '\0'); - int64_t fpos = readInt64FromFile(fin); - int32_t sizeinbytes = readInt32FromFile(fin); - if (keyStr == key) { - myFilePos = fpos; - found = true; - } - } - if (!found) { - cerr << "File doesn't have the given chr_chr map " << key << endl; - return false; - } - - if ((matrixType == "observed" && norm == "NONE") || - ((matrixType == "oe" || matrixType == "expected") && norm == "NONE" && c1 != c2)) - return true; // no need to read norm vector index - - // read in and ignore expected value maps; don't store; reading these to - // get to norm vector index - int32_t nExpectedValues = readInt32FromFile(fin); - if (fin.bad() || fin.fail()) - { - throw strawException("Unable to read expected values from file - it might be truncated early"); - } - for (int i = 0; i < nExpectedValues; i++) { - string unit0; - getline(fin, unit0, '\0'); //unit - int32_t binSize = readInt32FromFile(fin); - - int64_t nValues; - if (version > 8) { - nValues = readInt64FromFile(fin); - } else { - nValues = (int64_t) readInt32FromFile(fin); - } - - bool store = c1 == c2 && (matrixType == "oe" || matrixType == "expected") && norm == "NONE" && unit0 == unit && - binSize == resolution; - readThroughExpectedVector(version, fin, expectedValues, nValues, store, resolution); - readThroughNormalizationFactors(fin, version, store, expectedValues, c1); - } - - if (c1 == c2 && (matrixType == "oe" || matrixType == "expected") && norm == "NONE") { - if (expectedValues.empty()) { - cerr << "File did not contain expected values vectors at " << resolution << " " << unit << endl; - return false; - } - return true; - } - - nExpectedValues = readInt32FromFile(fin); - if (fin.bad() || fin.fail()) - { - throw strawException("Unable to read normalized expected values from file - it might be truncated early"); - } - - for (int i = 0; i < nExpectedValues; i++) { - string nType, unit0; - getline(fin, nType, '\0'); //typeString - getline(fin, unit0, '\0'); //unit - int32_t binSize = readInt32FromFile(fin); - int64_t nValues; - if (version > 8) { - nValues = readInt64FromFile(fin); - } else { - nValues = (int64_t) readInt32FromFile(fin); - } - bool store = c1 == c2 && (matrixType == "oe" || matrixType == "expected") && nType == norm && unit0 == unit && - binSize == resolution; - readThroughExpectedVector(version, fin, expectedValues, nValues, store, resolution); - readThroughNormalizationFactors(fin, version, store, expectedValues, c1); - } - - if (c1 == c2 && (matrixType == "oe" || matrixType == "expected") && norm != "NONE") { - if (expectedValues.empty()) { - cerr << "File did not contain normalized expected values vectors at " << resolution << " " << unit << endl; - return false; - } - } - - // Index of normalization vectors - nEntries = readInt32FromFile(fin); - bool found1 = false; - bool found2 = false; - - // added UCSC - globalNormOptions.clear(); - if (fin.bad() || fin.fail()) - { - throw strawException("Unable to read normalization vectors from file - it might be truncated early"); - } - - for (int i = 0; i < nEntries; i++) { - string normtype; - getline(fin, normtype, '\0'); //normalization type - int32_t chrIdx = readInt32FromFile(fin); - string unit1; - getline(fin, unit1, '\0'); //unit - int32_t resolution1 = readInt32FromFile(fin); - int64_t filePosition = readInt64FromFile(fin); - int64_t sizeInBytes; - if (version > 8) { - sizeInBytes = readInt64FromFile(fin); - } else { - sizeInBytes = (int64_t) readInt32FromFile(fin); - } - - // added UCSC - globalNormOptions.insert(normtype); - - if (chrIdx == c1 && normtype == norm && unit1 == unit && resolution1 == resolution) { - c1NormEntry.position = filePosition; - c1NormEntry.size = sizeInBytes; - found1 = true; - } - if (chrIdx == c2 && normtype == norm && unit1 == unit && resolution1 == resolution) { - c2NormEntry.position = filePosition; - c2NormEntry.size = sizeInBytes; - found2 = true; - } - } - if (!found1 || !found2) { - cerr << "File did not contain " << norm << " normalization vectors for one or both chromosomes at " - << resolution << " " << unit << endl; - } - return true; -} - -indexEntry readIndexEntry(istream &fin) { - int64_t filePosition = readInt64FromFile(fin); - int32_t blockSizeInBytes = readInt32FromFile(fin); - indexEntry entry = indexEntry(); - entry.size = (int64_t) blockSizeInBytes; - entry.position = filePosition; - return entry; -} - -void setValuesForMZD(istream &fin, const string &myunit, float &mySumCounts, int32_t &mybinsize, - int32_t &myBlockBinCount, int32_t &myBlockColumnCount, bool &found) { - string unit; - getline(fin, unit, '\0'); // unit - readInt32FromFile(fin); // Old "zoom" index -- not used - float sumCounts = readFloatFromFile(fin); // sumCounts - readFloatFromFile(fin); // occupiedCellCount - readFloatFromFile(fin); // stdDev - readFloatFromFile(fin); // percent95 - int32_t binSize = readInt32FromFile(fin); - int32_t blockBinCount = readInt32FromFile(fin); - int32_t blockColumnCount = readInt32FromFile(fin); - found = false; - if (myunit == unit && mybinsize == binSize) { - mySumCounts = sumCounts; - myBlockBinCount = blockBinCount; - myBlockColumnCount = blockColumnCount; - found = true; - } -} - -void populateBlockMap(istream &fin, int32_t nBlocks, map<int32_t, indexEntry> &blockMap) { - for (int b = 0; b < nBlocks; b++) { - int32_t blockNumber = readInt32FromFile(fin); - blockMap[blockNumber] = readIndexEntry(fin); - } -} - -// reads the raw binned contact matrix at specified resolution, setting the block bin count and block column count -map<int32_t, indexEntry> readMatrixZoomData(istream &fin, const string &myunit, int32_t mybinsize, float &mySumCounts, - int32_t &myBlockBinCount, int32_t &myBlockColumnCount, bool &found) { - - map<int32_t, indexEntry> blockMap; - setValuesForMZD(fin, myunit, mySumCounts, mybinsize, myBlockBinCount, myBlockColumnCount, found); - - int32_t nBlocks = readInt32FromFile(fin); - if (found) { - populateBlockMap(fin, nBlocks, blockMap); - } else { - fin.seekg(nBlocks * (sizeof(int32_t) + sizeof(int64_t) + sizeof(int32_t)), ios_base::cur); - } - return blockMap; -} - -// reads the raw binned contact matrix at specified resolution, setting the block bin count and block column count -map<int32_t, indexEntry> readMatrixZoomDataHttp(CURL *curl, int64_t &myFilePosition, const string &myunit, - int32_t mybinsize, float &mySumCounts, int32_t &myBlockBinCount, - int32_t &myBlockColumnCount, bool &found) { - map<int32_t, indexEntry> blockMap; - int32_t header_size = 5 * sizeof(int32_t) + 4 * sizeof(float); - char *first = getData(curl, myFilePosition, 1); - if (first[0] == 'B') { - header_size += 3; - } else if (first[0] == 'F') { - header_size += 5; - } else { - cerr << "Unit not understood" << endl; - return blockMap; - } - delete first; - char *buffer = getData(curl, myFilePosition, header_size); - memstream fin(buffer, header_size); - setValuesForMZD(fin, myunit, mySumCounts, mybinsize, myBlockBinCount, myBlockColumnCount, found); - int32_t nBlocks = readInt32FromFile(fin); - delete buffer; - - if (found) { - int32_t chunkSize = nBlocks * (sizeof(int32_t) + sizeof(int64_t) + sizeof(int32_t)); - buffer = getData(curl, myFilePosition + header_size, chunkSize); - memstream fin2(buffer, chunkSize); - populateBlockMap(fin2, nBlocks, blockMap); - delete buffer; - } else { - myFilePosition = myFilePosition + header_size - + (nBlocks * (sizeof(int32_t) + sizeof(int64_t) + sizeof(int32_t))); - } - return blockMap; -} - -// goes to the specified file pointer in http and finds the raw contact matrixType at specified resolution, calling readMatrixZoomData. -// sets blockbincount and blockcolumncount -map<int32_t, indexEntry> readMatrixHttp(CURL *curl, int64_t myFilePosition, const string &unit, int32_t resolution, - float &mySumCounts, int32_t &myBlockBinCount, int32_t &myBlockColumnCount) { - int32_t size = sizeof(int32_t) * 3; - char *buffer = getData(curl, myFilePosition, size); - memstream bufin(buffer, size); - - int32_t c1 = readInt32FromFile(bufin); - int32_t c2 = readInt32FromFile(bufin); - int32_t nRes = readInt32FromFile(bufin); - int32_t i = 0; - bool found = false; - myFilePosition = myFilePosition + size; - delete buffer; - map<int32_t, indexEntry> blockMap; - - while (i < nRes && !found) { - // myFilePosition gets updated within call - blockMap = readMatrixZoomDataHttp(curl, myFilePosition, unit, resolution, mySumCounts, myBlockBinCount, - myBlockColumnCount, found); - i++; - } - if (!found) { - cerr << "Error finding block data" << endl; - } - return blockMap; -} - -// goes to the specified file pointer and finds the raw contact matrixType at specified resolution, calling readMatrixZoomData. -// sets blockbincount and blockcolumncount -map<int32_t, indexEntry> readMatrix(istream &fin, int64_t myFilePosition, const string &unit, int32_t resolution, - float &mySumCounts, int32_t &myBlockBinCount, int32_t &myBlockColumnCount) { - map<int32_t, indexEntry> blockMap; - - fin.seekg(myFilePosition, ios::beg); - int32_t c1 = readInt32FromFile(fin); - int32_t c2 = readInt32FromFile(fin); - int32_t nRes = readInt32FromFile(fin); - int32_t i = 0; - bool found = false; - while (i < nRes && !found) { - blockMap = readMatrixZoomData(fin, unit, resolution, mySumCounts, myBlockBinCount, myBlockColumnCount, found); - i++; - } - if (!found) { - cerr << "Error finding block data" << endl; - } - return blockMap; -} - -// gets the blocks that need to be read for this slice of the data. needs blockbincount, blockcolumncount, and whether -// or not this is intrachromosomal. -set<int32_t> getBlockNumbersForRegionFromBinPosition(const int64_t *regionIndices, int32_t blockBinCount, - int32_t blockColumnCount, bool intra) { - int32_t col1, col2, row1, row2; - col1 = static_cast<int32_t>(regionIndices[0] / blockBinCount); - col2 = static_cast<int32_t>((regionIndices[1] + 1) / blockBinCount); - row1 = static_cast<int32_t>(regionIndices[2] / blockBinCount); - row2 = static_cast<int32_t>((regionIndices[3] + 1) / blockBinCount); - - set<int32_t> blocksSet; - // first check the upper triangular matrixType - for (int r = row1; r <= row2; r++) { - for (int c = col1; c <= col2; c++) { - int32_t blockNumber = r * blockColumnCount + c; - blocksSet.insert(blockNumber); - } - } - // check region part that overlaps with lower left triangle but only if intrachromosomal - if (intra) { - for (int r = col1; r <= col2; r++) { - for (int c = row1; c <= row2; c++) { - int32_t blockNumber = r * blockColumnCount + c; - blocksSet.insert(blockNumber); - } - } - } - return blocksSet; -} - -set<int32_t> getBlockNumbersForRegionFromBinPositionV9Intra(int64_t *regionIndices, int32_t blockBinCount, - int32_t blockColumnCount) { - // regionIndices is binX1 binX2 binY1 binY2 - set<int32_t> blocksSet; - int32_t translatedLowerPAD, translatedHigherPAD, translatedNearerDepth, translatedFurtherDepth; - translatedLowerPAD = static_cast<int32_t>((regionIndices[0] + regionIndices[2]) / 2 / blockBinCount); - translatedHigherPAD = static_cast<int32_t>((regionIndices[1] + regionIndices[3]) / 2 / blockBinCount + 1); - translatedNearerDepth = static_cast<int32_t>(log2( - 1 + abs(regionIndices[0] - regionIndices[3]) / sqrt(2) / blockBinCount)); - translatedFurtherDepth = static_cast<int32_t>(log2( - 1 + abs(regionIndices[1] - regionIndices[2]) / sqrt(2) / blockBinCount)); - - // because code above assume above diagonal; but we could be below diagonal - int32_t nearerDepth = min(translatedNearerDepth, translatedFurtherDepth); - if ((regionIndices[0] > regionIndices[3] && regionIndices[1] < regionIndices[2]) || - (regionIndices[1] > regionIndices[2] && regionIndices[0] < regionIndices[3])) { - nearerDepth = 0; - } - int32_t furtherDepth = max(translatedNearerDepth, translatedFurtherDepth) + 1; // +1; integer divide rounds down - - for (int depth = nearerDepth; depth <= furtherDepth; depth++) { - for (int pad = translatedLowerPAD; pad <= translatedHigherPAD; pad++) { - int32_t blockNumber = depth * blockColumnCount + pad; - blocksSet.insert(blockNumber); - } - } - - return blocksSet; -} - -void appendRecord(vector<contactRecord> &vector, int32_t index, int32_t binX, int32_t binY, float counts) { - contactRecord record = contactRecord(); - record.binX = binX; - record.binY = binY; - record.counts = counts; - vector[index] = record; -} - -int32_t decompressBlock(indexEntry idx, char *compressedBytes, char *uncompressedBytes) { - z_stream infstream; - infstream.zalloc = Z_NULL; - infstream.zfree = Z_NULL; - infstream.opaque = Z_NULL; - infstream.avail_in = static_cast<uInt>(idx.size); // size of input - infstream.next_in = (Bytef *) compressedBytes; // input char array - infstream.avail_out = static_cast<uInt>(idx.size * 10); // size of output - infstream.next_out = (Bytef *) uncompressedBytes; // output char array - // the actual decompression work. - inflateInit(&infstream); - inflate(&infstream, Z_NO_FLUSH); - inflateEnd(&infstream); - int32_t uncompressedSize = static_cast<int32_t>(infstream.total_out); - return uncompressedSize; -} - -long getNumRecordsInBlock(const string &fileName, indexEntry idx, int32_t version){ - if (idx.size <= 0) { - return 0; - } - char *compressedBytes = readCompressedBytesFromFile(fileName, idx); - char *uncompressedBytes = new char[idx.size * 10]; //biggest seen so far is 3 - int32_t uncompressedSize = decompressBlock(idx, compressedBytes, uncompressedBytes); - - // create stream from buffer for ease of use - memstream bufferin(uncompressedBytes, uncompressedSize); - uint64_t nRecords; - nRecords = static_cast<uint64_t>(readInt32FromFile(bufferin)); - delete[] compressedBytes; - delete[] uncompressedBytes; // don't forget to delete your heap arrays in C++! - return nRecords; -} - -// this is the meat of reading the data. takes in the block number and returns the set of contact records corresponding to -// that block. the block data is compressed and must be decompressed using the zlib library functions -vector<contactRecord> readBlock(const string &fileName, indexEntry idx, int32_t version) { - if (idx.size <= 0) { - vector<contactRecord> v; - return v; - } - char *compressedBytes = readCompressedBytesFromFile(fileName, idx); - char *uncompressedBytes = new char[idx.size * 10]; //biggest seen so far is 3 - int32_t uncompressedSize = decompressBlock(idx, compressedBytes, uncompressedBytes); - - // create stream from buffer for ease of use - memstream bufferin(uncompressedBytes, uncompressedSize); - uint64_t nRecords; - nRecords = static_cast<uint64_t>(readInt32FromFile(bufferin)); - vector<contactRecord> v(nRecords); - // different versions have different specific formats - if (version < 7) { - for (uInt i = 0; i < nRecords; i++) { - int32_t binX = readInt32FromFile(bufferin); - int32_t binY = readInt32FromFile(bufferin); - float counts = readFloatFromFile(bufferin); - appendRecord(v, i, binX, binY, counts); - } - } else { - int32_t binXOffset = readInt32FromFile(bufferin); - int32_t binYOffset = readInt32FromFile(bufferin); - bool useShort = readCharFromFile(bufferin) == 0; // yes this is opposite of usual - - bool useShortBinX = true; - bool useShortBinY = true; - if (version > 8) { - useShortBinX = readCharFromFile(bufferin) == 0; - useShortBinY = readCharFromFile(bufferin) == 0; - } - - char type = readCharFromFile(bufferin); - int32_t index = 0; - if (type == 1) { - if (useShortBinX && useShortBinY) { - int16_t rowCount = readInt16FromFile(bufferin); - for (int i = 0; i < rowCount; i++) { - int32_t binY = binYOffset + readInt16FromFile(bufferin); - int16_t colCount = readInt16FromFile(bufferin); - for (int j = 0; j < colCount; j++) { - int32_t binX = binXOffset + readInt16FromFile(bufferin); - float counts; - if (useShort) { - counts = readInt16FromFile(bufferin); - } else { - counts = readFloatFromFile(bufferin); - } - appendRecord(v, index++, binX, binY, counts); - } - } - } else if (useShortBinX && !useShortBinY) { - int32_t rowCount = readInt32FromFile(bufferin); - for (int i = 0; i < rowCount; i++) { - int32_t binY = binYOffset + readInt32FromFile(bufferin); - int16_t colCount = readInt16FromFile(bufferin); - for (int j = 0; j < colCount; j++) { - int32_t binX = binXOffset + readInt16FromFile(bufferin); - float counts; - if (useShort) { - counts = readInt16FromFile(bufferin); - } else { - counts = readFloatFromFile(bufferin); - } - appendRecord(v, index++, binX, binY, counts); - } - } - } else if (!useShortBinX && useShortBinY) { - int16_t rowCount = readInt16FromFile(bufferin); - for (int i = 0; i < rowCount; i++) { - int32_t binY = binYOffset + readInt16FromFile(bufferin); - int32_t colCount = readInt32FromFile(bufferin); - for (int j = 0; j < colCount; j++) { - int32_t binX = binXOffset + readInt32FromFile(bufferin); - float counts; - if (useShort) { - counts = readInt16FromFile(bufferin); - } else { - counts = readFloatFromFile(bufferin); - } - appendRecord(v, index++, binX, binY, counts); - } - } - } else { - int32_t rowCount = readInt32FromFile(bufferin); - for (int i = 0; i < rowCount; i++) { - int32_t binY = binYOffset + readInt32FromFile(bufferin); - int32_t colCount = readInt32FromFile(bufferin); - for (int j = 0; j < colCount; j++) { - int32_t binX = binXOffset + readInt32FromFile(bufferin); - float counts; - if (useShort) { - counts = readInt16FromFile(bufferin); - } else { - counts = readFloatFromFile(bufferin); - } - appendRecord(v, index++, binX, binY, counts); - } - } - } - } else if (type == 2) { - int32_t nPts = readInt32FromFile(bufferin); - int16_t w = readInt16FromFile(bufferin); - - for (int i = 0; i < nPts; i++) { - //int32_t idx = (p.y - binOffset2) * w + (p.x - binOffset1); - int32_t row = i / w; - int32_t col = i - row * w; - int32_t bin1 = binXOffset + col; - int32_t bin2 = binYOffset + row; - - float counts; - if (useShort) { - int16_t c = readInt16FromFile(bufferin); - if (c != -32768) { - appendRecord(v, index++, bin1, bin2, c); - } - } else { - counts = readFloatFromFile(bufferin); - if (!isnan(counts)) { - appendRecord(v, index++, bin1, bin2, counts); - } - } - } - } - } - delete[] compressedBytes; - delete[] uncompressedBytes; // don't forget to delete your heap arrays in C++! - return v; -} - -// reads the normalization vector from the file at the specified location -vector<double> readNormalizationVector(istream &bufferin, int32_t version) { - int64_t nValues; - if (version > 8) { - nValues = readInt64FromFile(bufferin); - } else { - nValues = (int64_t) readInt32FromFile(bufferin); - } - - uint64_t numValues; - numValues = static_cast<uint64_t>(nValues); - vector<double> values(numValues); - - if (version > 8) { - for (int i = 0; i < nValues; i++) { - values[i] = (double) readFloatFromFile(bufferin); - } - } else { - for (int i = 0; i < nValues; i++) { - values[i] = readDoubleFromFile(bufferin); - } - } - - return values; -} - -class MatrixZoomData { -public: - bool isIntra; - string fileName; - int64_t myFilePos = 0LL; - vector<double> expectedValues; - bool foundFooter = false; - vector<double> c1Norm; - vector<double> c2Norm; - int32_t c1 = 0; - int32_t c2 = 0; - string matrixType; - string norm; - int32_t version = 0; - int32_t resolution = 0; - int32_t numBins1 = 0; - int32_t numBins2 = 0; - float sumCounts; - int32_t blockBinCount, blockColumnCount; - map<int32_t, indexEntry> blockMap; - double avgCount; - - MatrixZoomData(const chromosome &chrom1, const chromosome &chrom2, const string &matrixType, - const string &norm, const string &unit, int32_t resolution, - int32_t &version, int64_t &master, int64_t &totalFileSize, - const string &fileName) { - this->version = version; - this->fileName = fileName; - int32_t c01 = chrom1.index; - int32_t c02 = chrom2.index; - if (c01 <= c02) { // default is ok - this->c1 = c01; - this->c2 = c02; - this->numBins1 = static_cast<int32_t>(chrom1.length / resolution); - this->numBins2 = static_cast<int32_t>(chrom2.length / resolution); - } else { // flip - this->c1 = c02; - this->c2 = c01; - this->numBins1 = static_cast<int32_t>(chrom2.length / resolution); - this->numBins2 = static_cast<int32_t>(chrom1.length / resolution); - } - isIntra = c1 == c2; - - this->matrixType = matrixType; - this->norm = norm; - this->resolution = resolution; - - HiCFileStream *stream = new HiCFileStream(fileName); - indexEntry c1NormEntry{}, c2NormEntry{}; - - if (stream->isHttp) { - foundFooter = readFooterURL(stream->curl, master, version, c1, c2, matrixType, norm, unit, - resolution, - myFilePos, - c1NormEntry, c2NormEntry, expectedValues); - } else { - stream->fin.seekg(master, ios::beg); - foundFooter = readFooter(stream->fin, master, version, c1, c2, matrixType, norm, - unit, - resolution, myFilePos, - c1NormEntry, c2NormEntry, expectedValues); - } - - if (!foundFooter) { - return; - } - stream->close(); - - if (norm != "NONE") { - c1Norm = readNormalizationVectorFromFooter(c1NormEntry, version, fileName); - if (isIntra) { - c2Norm = c1Norm; - } else { - c2Norm = readNormalizationVectorFromFooter(c2NormEntry, version, fileName); - } - } - - HiCFileStream *stream2 = new HiCFileStream((fileName)); - if (stream2->isHttp) { - // readMatrix will assign blockBinCount and blockColumnCount - blockMap = readMatrixHttp(stream2->curl, myFilePos, unit, resolution, sumCounts, - blockBinCount, - blockColumnCount); - } else { - // readMatrix will assign blockBinCount and blockColumnCount - blockMap = readMatrix(stream2->fin, myFilePos, unit, resolution, sumCounts, - blockBinCount, - blockColumnCount); - } - stream2->close(); - - if (!isIntra) { - avgCount = (sumCounts / numBins1) / numBins2; // <= trying to avoid overflows - } - } - - static vector<double> readNormalizationVectorFromFooter(indexEntry cNormEntry, int32_t &version, - const string &fileName) { - if (cNormEntry.size == 0) { - // no normalization entries in this file. Bow out now and avoid an allocation error - throw strawException("Trying to retrieve other normalization options, but none exist"); - } - char *buffer = readCompressedBytesFromFile(fileName, cNormEntry); - memstream bufferin(buffer, cNormEntry.size); - vector<double> cNorm = readNormalizationVector(bufferin, version); - delete buffer; - return cNorm; - } - - static bool isInRange(int32_t r, int32_t c, int32_t numRows, int32_t numCols) { - return 0 <= r && r < numRows && 0 <= c && c < numCols; - } - - set<int32_t> getBlockNumbers(int64_t *regionIndices) const { - if (version > 8 && isIntra) { - return getBlockNumbersForRegionFromBinPositionV9Intra(regionIndices, blockBinCount, blockColumnCount); - } else { - return getBlockNumbersForRegionFromBinPosition(regionIndices, blockBinCount, blockColumnCount, isIntra); - } - } - - vector<double> getNormVector(int32_t index) { - if (index == c1) { - return c1Norm; - } else if (index == c2) { - return c2Norm; - } - cerr << "Invalid index provided: " << index << endl; - cerr << "Should be either " << c1 << " or " << c2 << endl; - vector<double> v; - return v; - } - - vector<double> getExpectedValues() { - return expectedValues; - } - - vector<contactRecord> getRecords(int64_t gx0, int64_t gx1, int64_t gy0, int64_t gy1) { - if (!foundFooter) { - vector<contactRecord> v; - return v; - } - int64_t origRegionIndices[] = {gx0, gx1, gy0, gy1}; - int64_t regionIndices[4]; - convertGenomeToBinPos(origRegionIndices, regionIndices, resolution); - - set<int32_t> blockNumbers = getBlockNumbers(regionIndices); - vector<contactRecord> records; - for (int32_t blockNumber : blockNumbers) { - // get contacts in this block - //cout << *it << " -- " << blockMap.size() << endl; - //cout << blockMap[*it].size << " " << blockMap[*it].position << endl; - vector<contactRecord> tmp_records = readBlock(fileName, blockMap[blockNumber], version); - for (contactRecord rec : tmp_records) { - int64_t x = rec.binX * resolution; - int64_t y = rec.binY * resolution; - - if ((x >= origRegionIndices[0] && x <= origRegionIndices[1] && - y >= origRegionIndices[2] && y <= origRegionIndices[3]) || - // or check regions that overlap with lower left - (isIntra && y >= origRegionIndices[0] && y <= origRegionIndices[1] && x >= origRegionIndices[2] && - x <= origRegionIndices[3])) { - - float c = rec.counts; - if (norm != "NONE") { - c = static_cast<float>(c / (c1Norm[rec.binX] * c2Norm[rec.binY])); - } - if (matrixType == "oe") { - if (isIntra) { - c = static_cast<float>(c / expectedValues[min(expectedValues.size() - 1, - (size_t) floor(abs(y - x) / - resolution))]); - } else { - c = static_cast<float>(c / avgCount); - } - } else if (matrixType == "expected") { - if (isIntra) { - c = static_cast<float>(expectedValues[min(expectedValues.size() - 1, - (size_t) floor(abs(y - x) / - resolution))]); - } else { - c = static_cast<float>(avgCount); - } - } - - if (!isnan(c) && !isinf(c)){ - contactRecord record = contactRecord(); - record.binX = static_cast<int32_t>(x); - record.binY = static_cast<int32_t>(y); - record.counts = c; - records.push_back(record); - } - } - } - } - return records; - } - - vector<vector<float> > getRecordsAsMatrix(int64_t gx0, int64_t gx1, int64_t gy0, int64_t gy1) { - vector<contactRecord> records = this->getRecords(gx0, gx1, gy0, gy1); - if (records.empty()) { - vector<vector<float> > res = vector<vector<float> >(1, vector<float>(1, 0)); - return res; - } - - int64_t origRegionIndices[] = {gx0, gx1, gy0, gy1}; - int64_t regionIndices[4]; - convertGenomeToBinPos(origRegionIndices, regionIndices, resolution); - - int64_t originR = regionIndices[0]; - int64_t endR = regionIndices[1]; - int64_t originC = regionIndices[2]; - int64_t endC = regionIndices[3]; - int32_t numRows = endR - originR + 1; - int32_t numCols = endC - originC + 1; - vector<vector<float> > matrix; - for (int32_t i = 0; i < numRows; i++) { - matrix.emplace_back(numCols, 0); - } - - for (contactRecord cr : records) { - if (isnan(cr.counts) || isinf(cr.counts)) continue; - int32_t r = cr.binX / resolution - originR; - int32_t c = cr.binY / resolution - originC; - if (isInRange(r, c, numRows, numCols)) { - matrix[r][c] = cr.counts; - } - if (isIntra) { - r = cr.binY / resolution - originR; - c = cr.binX / resolution - originC; - if (isInRange(r, c, numRows, numCols)) { - matrix[r][c] = cr.counts; - } - } - } - return matrix; - } - - int64_t getNumberOfTotalRecords() { - if (!foundFooter) { - return 0; - } - int64_t regionIndices[4] = {0, numBins1, 0, numBins2}; - set<int32_t> blockNumbers = getBlockNumbers(regionIndices); - int64_t total = 0; - for (int32_t blockNumber : blockNumbers) { - total += getNumRecordsInBlock(fileName, blockMap[blockNumber], version); - } - return total; - } -}; - -class HiCFile { -public: - string prefix = "http"; // HTTP code - int64_t master = 0LL; - map<string, chromosome> chromosomeMap; - string genomeID; - int32_t numChromosomes = 0; - int32_t version = 0; - int64_t nviPosition = 0LL; - int64_t nviLength = 0LL; - vector<int32_t> resolutions; - static int64_t totalFileSize; - string fileName; - vector<string> attributes; - - static size_t hdf(char *b, size_t size, size_t nitems, void *userdata) { - size_t numbytes = size * nitems; - b[numbytes + 1] = '\0'; - string s(b); - int32_t found; - found = static_cast<int32_t>(s.find("content-range")); - if ((size_t) found == string::npos) { - found = static_cast<int32_t>(s.find("Content-Range")); - } - if ((size_t) found != string::npos) { - int32_t found2; - found2 = static_cast<int32_t>(s.find('/')); - //content-range: bytes 0-100000/891471462 - if ((size_t) found2 != string::npos) { - string total = s.substr(found2 + 1); - totalFileSize = stol(total); - } - } - - return numbytes; - } - - static CURL *oneTimeInitCURL(const char *url) { - CURL *curl = initCURL(url); - curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, hdf); - curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1); - return curl; - } - - explicit HiCFile(const string &fileName) { - this->fileName = fileName; - - // read header into buffer; 100K should be sufficient - if (std::strncmp(fileName.c_str(), prefix.c_str(), prefix.size()) == 0) { - CURL *curl; - curl = oneTimeInitCURL(fileName.c_str()); - char *buffer = getData(curl, 0, 100000); - memstream bufin(buffer, 100000); - chromosomeMap = readHeader(bufin, master, genomeID, numChromosomes, - version, nviPosition, nviLength, attributes); - resolutions = readResolutionsFromHeader(bufin); - curl_easy_cleanup(curl); - delete buffer; - } else { - ifstream fin; - fin.open(fileName, fstream::in | fstream::binary); - if (!fin) { - throw strawException("File " + fileName + " cannot be opened for reading"); - //cerr << "File " << fileName << " cannot be opened for reading" << endl; - //exit(6); - } - chromosomeMap = readHeader(fin, master, genomeID, numChromosomes, - version, nviPosition, nviLength, attributes); - resolutions = readResolutionsFromHeader(fin); - fin.close(); - } - } - - string getGenomeID() const { - return genomeID; - } - - vector<int32_t> getResolutions() const { - return resolutions; - } - - vector<chromosome> getChromosomes() { - chromosome chromosomes[chromosomeMap.size()]; - map<string, chromosome>::iterator iter = chromosomeMap.begin(); - while (iter != chromosomeMap.end()) { - chromosome chrom = static_cast<chromosome>(iter->second); - chromosomes[chrom.index] = chrom; - iter++; - } - - vector<chromosome> final_chromosomes; - final_chromosomes.reserve(chromosomeMap.size()); - for(int32_t i = 0; i < chromosomeMap.size(); i++){ - final_chromosomes.push_back(chromosomes[i]); - } - return final_chromosomes; - } - - MatrixZoomData * getMatrixZoomData(const string &chr1, const string &chr2, const string &matrixType, - const string &norm, const string &unit, int32_t resolution) { - chromosome chrom1 = chromosomeMap[chr1]; - chromosome chrom2 = chromosomeMap[chr2]; - return new MatrixZoomData(chrom1, chrom2, (matrixType), (norm), (unit), - resolution, version, master, totalFileSize, fileName); - } -}; - -int64_t HiCFile::totalFileSize = 0LL; - -void parsePositions(const string &chrLoc, string &chrom, int64_t &pos1, int64_t &pos2, map<string, chromosome> map) { - string x, y; - stringstream ss(chrLoc); - getline(ss, chrom, ':'); - if (map.count(chrom) == 0) { - throw strawException("chromosome " + chrom + " not found in the file."); - //cerr << "chromosome " << chrom << " not found in the file." << endl; - //exit(7); - } - - if (getline(ss, x, ':') && getline(ss, y, ':')) { - pos1 = stol(x); - pos2 = stol(y); - } else { - pos1 = 0LL; - pos2 = map[chrom].length; - } -} - -vector<contactRecord> straw(const string &matrixType, const string &norm, const string &fileName, const string &chr1loc, - const string &chr2loc, const string &unit, int32_t binsize) { - if (!(unit == "BP" || unit == "FRAG")) { - cerr << "Norm specified incorrectly, must be one of <BP/FRAG>" << endl; - cerr << "Usage: straw [observed/oe/expected] <NONE/VC/VC_SQRT/KR> <hicFile(s)> <chr1>[:x1:x2] <chr2>[:y1:y2] <BP/FRAG> <binsize>" - << endl; - vector<contactRecord> v; - return v; - } - - HiCFile *hiCFile = new HiCFile(fileName); - string chr1, chr2; - int64_t origRegionIndices[4] = {-100LL, -100LL, -100LL, -100LL}; - parsePositions((chr1loc), chr1, origRegionIndices[0], origRegionIndices[1], hiCFile->chromosomeMap); - parsePositions((chr2loc), chr2, origRegionIndices[2], origRegionIndices[3], hiCFile->chromosomeMap); - - if (hiCFile->chromosomeMap[chr1].index > hiCFile->chromosomeMap[chr2].index) { - MatrixZoomData *mzd = hiCFile->getMatrixZoomData(chr2, chr1, matrixType, norm, unit, binsize); - return mzd->getRecords(origRegionIndices[2], origRegionIndices[3], origRegionIndices[0], origRegionIndices[1]); - } else { - MatrixZoomData *mzd = hiCFile->getMatrixZoomData(chr1, chr2, matrixType, norm, unit, binsize); - return mzd->getRecords(origRegionIndices[0], origRegionIndices[1], origRegionIndices[2], origRegionIndices[3]); - } -} - -vector<vector<float> > strawAsMatrix(const string &matrixType, const string &norm, const string &fileName, const string &chr1loc, - const string &chr2loc, const string &unit, int32_t binsize) { - if (!(unit == "BP" || unit == "FRAG")) { - cerr << "Norm specified incorrectly, must be one of <BP/FRAG>" << endl; - cerr << "Usage: straw [observed/oe/expected] <NONE/VC/VC_SQRT/KR> <hicFile(s)> <chr1>[:x1:x2] <chr2>[:y1:y2] <BP/FRAG> <binsize>" - << endl; - vector<vector<float> > res = vector<vector<float> >(1, vector<float>(1, 0)); - return res; - } - - HiCFile *hiCFile = new HiCFile(fileName); - string chr1, chr2; - int64_t origRegionIndices[4] = {-100LL, -100LL, -100LL, -100LL}; - parsePositions((chr1loc), chr1, origRegionIndices[0], origRegionIndices[1], hiCFile->chromosomeMap); - parsePositions((chr2loc), chr2, origRegionIndices[2], origRegionIndices[3], hiCFile->chromosomeMap); - - if (hiCFile->chromosomeMap[chr1].index > hiCFile->chromosomeMap[chr2].index) { - MatrixZoomData *mzd = hiCFile->getMatrixZoomData(chr2, chr1, matrixType, norm, unit, binsize); - return mzd->getRecordsAsMatrix(origRegionIndices[2], origRegionIndices[3], origRegionIndices[0], origRegionIndices[1]); - } else { - MatrixZoomData *mzd = hiCFile->getMatrixZoomData(chr1, chr2, matrixType, norm, unit, binsize); - return mzd->getRecordsAsMatrix(origRegionIndices[0], origRegionIndices[1], origRegionIndices[2], origRegionIndices[3]); - } -} - -int64_t getNumRecordsForFile(const string &fileName, int32_t binsize, bool interOnly) { - HiCFile *hiCFile = new HiCFile(fileName); - int64_t totalNumRecords = 0; - - int32_t indexOffset = 0; - if (interOnly){ - indexOffset = 1; - } - - vector<chromosome> chromosomes = hiCFile->getChromosomes(); - for(int32_t i = 0; i < chromosomes.size(); i++){ - if(chromosomes[i].index <= 0) continue; - for(int32_t j = i + indexOffset; j < chromosomes.size(); j++){ - if(chromosomes[j].index <= 0) continue; - MatrixZoomData *mzd; - if(chromosomes[i].index > chromosomes[j].index){ - mzd = hiCFile->getMatrixZoomData(chromosomes[j].name, chromosomes[i].name, "observed", "NONE", "BP", binsize); - } else { - mzd = hiCFile->getMatrixZoomData(chromosomes[i].name, chromosomes[j].name, "observed", "NONE", "BP", binsize); - } - totalNumRecords += mzd->getNumberOfTotalRecords(); - } - } - - return totalNumRecords; -} - -int64_t getNumRecordsForChromosomes(const string &fileName, int32_t binsize, bool interOnly) { - HiCFile *hiCFile = new HiCFile(fileName); - vector<chromosome> chromosomes = hiCFile->getChromosomes(); - for(int32_t i = 0; i < chromosomes.size(); i++){ - if(chromosomes[i].index <= 0) continue; - MatrixZoomData *mzd = hiCFile->getMatrixZoomData(chromosomes[i].name, chromosomes[i].name, "observed", "NONE", "BP", binsize); - int64_t totalNumRecords = mzd->getNumberOfTotalRecords(); - cout << chromosomes[i].name << " " << totalNumRecords << " "; - cout << totalNumRecords*12/1000/1000/1000 << " GB" << endl; - } - return 0; -} - -void getHeaderFields(const string &filename, string &genome, vector<string> &chromNames, vector<int> &chromSizes, - vector<int> &bpResolutions, vector<int> &fragResolutions, vector<string> &attributes) -/* Fill in the provided fields with information from the header of the hic file in the supplied filename. - * fragResolutions is left empty for now, as we're not making use of it. */ -{ - HiCFile *hiCFile = new HiCFile(filename); - - genome.assign(hiCFile->getGenomeID()); - - vector<chromosome> chromList = hiCFile->getChromosomes(); - vector<string> myChromNames; - myChromNames.reserve(chromList.size()); - vector<int> myChromSizes; - myChromSizes.reserve(chromList.size()); - for(int32_t i = 0; i < chromList.size(); i++){ - myChromNames.push_back(chromList[i].name); - myChromSizes.push_back(chromList[i].length); - } - chromNames = myChromNames; - chromSizes = myChromSizes; - - vector<int> myBpResolutions; - myBpResolutions.reserve(hiCFile->resolutions.size()); - for(int32_t i = 0; i < hiCFile->resolutions.size(); i++){ - myBpResolutions.push_back(hiCFile->resolutions[i]); - } - bpResolutions = myBpResolutions; - - attributes = hiCFile->attributes; - - // Ignoring fragment resolutions for now, since they're never being used here -} - -set<string> getNormOptions() -/* Return the set of normalization options that have been encountered through footer parsing. - * The result will be empty unless at least one straw() request has been made. - */ -{ - return globalNormOptions; -}