60aca91bcce6d4fa555e6c7c91d8ff8aa9e7bd2b jcasper Fri Jun 11 15:17:21 2021 -0700 Updating hic support for files with large headers (over 100kb) and improving multi-region performance, refs #18842, #27593 diff --git src/hg/lib/straw/straw.cpp src/hg/lib/straw/straw.cpp index e9602a9..0abbfbd 100644 --- src/hg/lib/straw/straw.cpp +++ src/hg/lib/straw/straw.cpp @@ -14,196 +14,354 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +extern "C" { +#include "common.h" +#include "udc.h" +#undef min +#undef max +} + #include #include #include #include #include #include #include #include #include //#include #include "zlib.h" #include "straw.h" -#include "udcWrapper.h" +//#include "udcWrapper.h" using namespace std; -/* Simple exception wrapper class */ -class strawException: public std::runtime_error { - public: - strawException(const std::string& error): - std::runtime_error(error) { - } -}; - - /* Straw: fast C++ implementation of dump. Not as fully featured as the Java version. Reads the .hic file, finds the appropriate matrix and slice of data, and outputs as text in sparse upper triangular format. Currently only supporting matrices. Usage: straw [:x1:x2] [:y1:y2] */ + + // this is for creating a stream from a byte array for ease of use struct membuf : std::streambuf { membuf(char* begin, char* end) { this->setg(begin, begin, end); } }; -// version number -static int version; - -// map of block numbers to pointers -map blockMap; - -long total_bytes; - - // get a buffer that can be used as an input stream from the URL -char* getHttpData(char *url, long position, long chunksize) { - char *buffer = (char*) malloc(chunksize); +char* getHttpData(char *url, long position, long chunksize, long *fileSize = NULL) { + char *buffer = new (nothrow) char[chunksize]; if (buffer == NULL) { - throw strawException("Out of memory (malloc returned NULL)"); + throw strawException("Out of memory, unable to allocate Hi-C data buffer"); } struct udcFile *udc = udcFileMayOpen(url, NULL); if (udc == NULL) { throw strawException("Error: unable to open udcFile for resource " + string(url)); } - total_bytes = udcFileSize(url); + if (fileSize) + *fileSize = udcFileSize(url); udcSeek(udc, position); udcRead(udc, (void*)buffer, chunksize); udcFileClose(&udc); return buffer; } +/* hicBuffer is a start toward providing a level of abstraction for a .hic data reader, + * independent of whether the file is local or remote. Remote files are handled with + * an initial request for remote data, which is transparently extended if the caller + * attempts to read off the end of the initial read buffer. The previous approach + * assumed a header was never larger than 100kB; this failed for .hic files + * with high (e.g. 23k) chromosome counts. + */ +class hicBuffer { + private: + string stashedPath; + long offset; + long extent; + bool isHttp; + istream *buf; + char *httpBuffer; + membuf *sbuf; + long bufSize; + long fullFileSize; + + void extendHttpBuffer() { + // deletes the existing buffer and allocates a replacement containing the next bytes + // from the file + delete buf; + delete sbuf; + delete httpBuffer; + offset += extent; + if (offset + extent > fullFileSize) + extent = fullFileSize - offset; + httpBuffer = getHttpData((char*)(stashedPath.c_str()), offset, extent, NULL); + sbuf = new membuf(httpBuffer, httpBuffer + extent); + buf = new istream(sbuf); + } + + public: + hicBuffer(const string& path, long start, long size = 100000) + { + // HTTP code + stashedPath = path; + string prefix="http"; + isHttp = false; + httpBuffer = NULL; + offset = start; + extent = size; + + if (std::strncmp(path.c_str(), prefix.c_str(), prefix.size()) == 0) { + isHttp = true; + char *url = (char*) path.c_str(); + httpBuffer = getHttpData(url, start, size, &fullFileSize); + sbuf = new membuf(httpBuffer, httpBuffer + size); + buf = new istream(sbuf); + } + else { + ifstream* fin = new ifstream; + fin->open(path, fstream::in); + if (fin->fail()) { + cerr << "File " << path << " cannot be opened for reading" << endl; + return; + } + fin->seekg(start, ios::beg); + buf = fin; + } + + } + + ~hicBuffer() { + delete buf; + if (isHttp) { + delete sbuf; + delete httpBuffer; + } + } + + void getline(string& str, char delim) { + std::getline(*buf, str, delim); + if(buf->eof() && isHttp && (offset + extent < fullFileSize)) { + this->extendHttpBuffer(); + string str2; + std::getline(*buf, str2, delim); + str.append(str2); + } + if (buf->fail()) + throw strawException("getline failure when reading file"); + } + + void getline(string& str) { + std::getline(*buf, str); + if (buf->eof() && isHttp && (offset + extent < fullFileSize)) { + this->extendHttpBuffer(); + string str2; + std::getline(*buf, str2); + str.append(str2); + } + if (buf->fail()) + throw strawException("getline failure when reading file"); + } + + void read(char *s, streamsize n) { + buf->read(s, n); + if(buf->eof() && isHttp && (offset + extent < fullFileSize)) { + streamsize bytesRead = buf->gcount(); + this->extendHttpBuffer(); + char s2[n]; + buf->read(s2, n-bytesRead); + if (!buf->fail()) + strncpy(s+bytesRead, s2, n-bytesRead); + } + if (buf->fail()) + throw strawException("read() failure when accessing file"); + } + + long fileSize() { + return fullFileSize; + } +}; + // returns whether or not this is valid HiC file -bool readMagicString(istream& fin) { +bool Straw::readMagicString() { string str; - getline(fin, str, '\0' ); + if (fileName == "") { + throw strawException("Must open a hic file before checking the magic string"); + } + hicBuffer header(fileName, 0, 16); + header.getline(str, '\0' ); return str[0]=='H' && str[1]=='I' && str[2]=='C'; } -// reads the header, storing the positions of the normalization vectors and returning the master pointer -long readHeader(istream& fin, string chr1, string chr2, int &c1pos1, int &c1pos2, int &c2pos1, int &c2pos2, int &chr1ind, int &chr2ind) { - if (!readMagicString(fin)) { - throw strawException("Hi-C magic string is missing, does not appear to be a hic file"); - } +// reads the header, storing the various values in the object +void Straw::loadHeader() { + if (master != 0) + // header was already loaded for this file + return; + + hicBuffer header(fileName, 4, 100000); + total_bytes = header.fileSize(); - fin.read((char*)&version, sizeof(int)); + header.read((char*)&(version), sizeof(int)); if (version < 6) { throw strawException("Version " + to_string(version) + " no longer supported"); } - long master; - fin.read((char*)&master, sizeof(long)); - string genome; - getline(fin, genome, '\0' ); + header.read((char*)&master, sizeof(long)); + header.getline(genome, '\0' ); int nattributes; - fin.read((char*)&nattributes, sizeof(int)); + header.read((char*)&nattributes, sizeof(int)); - // reading and ignoring attribute-value dictionary + // reading attribute-value dictionary for (int i=0; i::iterator it = chrchrMap.find(chrKey); + if (it == chrchrMap.end()) + throw strawException("File doesn't have the given chr_chr map"); + myFilePos = it->second; + + if (norm=="NONE") return; // no need to read norm vector index + + map>::iterator it1 = normVectors.find(vec1); + map>::iterator it2 = normVectors.find(vec2); + if ((it1 == normVectors.end()) || (it2 == normVectors.end())) { + throw strawException("File did not contain " + norm + " normalization vectors for one or both chromosomes at " + + to_string(resolution) + " " + unit); + } + c1NormEntry.position = it1->second.first; + c1NormEntry.size = it1->second.second; + c2NormEntry.position = it2->second.first; + c2NormEntry.size = it2->second.second; + return; + } + int nBytes; + fin.read((char*)&nBytes, sizeof(int)); + + bool found = false; int nEntries; fin.read((char*)&nEntries, sizeof(int)); - bool found = false; for (int i=0; i(filePosition, sizeInBytes); if (chrIdx == c1 && normtype == norm && unit1 == unit && resolution1 == resolution) { c1NormEntry.position=filePosition; c1NormEntry.size=sizeInBytes; found1 = true; } if (chrIdx == c2 && normtype == norm && unit1 == unit && resolution1 == resolution) { c2NormEntry.position=filePosition; c2NormEntry.size=sizeInBytes; found2 = true; } } + haveReadFooter = true; + + if (!found) { + throw strawException("File doesn't have the given chr_chr map"); + } + + if (norm=="NONE") return; // no norm vectors for NONE + if (!found1 || !found2) { throw strawException("File did not contain " + norm + " normalization vectors for one or both chromosomes at " + to_string(resolution) + " " + unit); } } // reads the raw binned contact matrix at specified resolution, setting the block bin count and block column count -bool readMatrixZoomData(istream& fin, string myunit, int mybinsize, int &myBlockBinCount, int &myBlockColumnCount) { +bool Straw::readMatrixZoomData(istream& fin, string myunit, int mybinsize, int &myBlockBinCount, int &myBlockColumnCount) { string unit; getline(fin, unit, '\0' ); // unit int tmp; fin.read((char*)&tmp, sizeof(int)); // Old "zoom" index -- not used float tmp2; fin.read((char*)&tmp2, sizeof(float)); // sumCounts fin.read((char*)&tmp2, sizeof(float)); // occupiedCellCount fin.read((char*)&tmp2, sizeof(float)); // stdDev fin.read((char*)&tmp2, sizeof(float)); // percent95 int binSize; fin.read((char*)&binSize, sizeof(int)); int blockBinCount; fin.read((char*)&blockBinCount, sizeof(int)); int blockColumnCount; fin.read((char*)&blockColumnCount, sizeof(int)); @@ -308,31 +476,31 @@ int blockNumber; fin.read((char*)&blockNumber, sizeof(int)); long filePosition; fin.read((char*)&filePosition, sizeof(long)); int blockSizeInBytes; fin.read((char*)&blockSizeInBytes, sizeof(int)); indexEntry entry; entry.size = blockSizeInBytes; entry.position = filePosition; if (storeBlockData) blockMap[blockNumber] = entry; } return storeBlockData; } // reads the raw binned contact matrix at specified resolution, setting the block bin count and block column count -bool readMatrixZoomDataHttp(char *url, long &myFilePosition, string myunit, int mybinsize, int &myBlockBinCount, int &myBlockColumnCount) { +bool Straw::readMatrixZoomDataHttp(char *url, long &myFilePosition, string myunit, int mybinsize, int &myBlockBinCount, int &myBlockColumnCount) { char* buffer; int header_size = 5*sizeof(int)+4*sizeof(float); char* first; first = getHttpData(url, myFilePosition, 1); if (first[0]=='B') { header_size+=3; } else if (first[0]=='F') { header_size+=5; } else { throw strawException("Unit not understood - must be one of "); } buffer = getHttpData(url, myFilePosition, header_size); membuf sbuf(buffer, buffer + header_size); @@ -378,109 +546,110 @@ indexEntry entry; entry.size = blockSizeInBytes; entry.position = filePosition; blockMap[blockNumber] = entry; } } else { myFilePosition = myFilePosition+header_size+(nBlocks*(sizeof(int)+sizeof(long)+sizeof(int))); } delete buffer; return storeBlockData; } // goes to the specified file pointer in http and finds the raw contact matrix at specified resolution, calling readMatrixZoomData. // sets blockbincount and blockcolumncount -void readMatrixHttp(char *url, long myFilePosition, string unit, int resolution, int &myBlockBinCount, int &myBlockColumnCount) { +void Straw::readMatrixHttp(char *url, long myFilePosition, string unit, int resolution, int &myBlockBinCount, int &myBlockColumnCount) { char * buffer; int size = sizeof(int)*3; buffer = getHttpData(url, myFilePosition, size); membuf sbuf(buffer, buffer + size); istream bufin(&sbuf); int c1,c2; bufin.read((char*)&c1, sizeof(int)); //chr1 bufin.read((char*)&c2, sizeof(int)); //chr2 int nRes; bufin.read((char*)&nRes, sizeof(int)); int i=0; bool found=false; myFilePosition=myFilePosition+size; delete buffer; while (ireadMatrixZoomDataHttp(url, myFilePosition, unit, resolution, myBlockBinCount, myBlockColumnCount); i++; } if (!found) { throw strawException("Error finding block data"); } } // goes to the specified file pointer and finds the raw contact matrix at specified resolution, calling readMatrixZoomData. // sets blockbincount and blockcolumncount -void readMatrix(istream& fin, long myFilePosition, string unit, int resolution, int &myBlockBinCount, int &myBlockColumnCount) { +void Straw::readMatrix(istream& fin, long myFilePosition, string unit, int resolution, int &myBlockBinCount, int &myBlockColumnCount) { fin.seekg(myFilePosition, ios::beg); int c1,c2; fin.read((char*)&c1, sizeof(int)); //chr1 fin.read((char*)&c2, sizeof(int)); //chr2 int nRes; fin.read((char*)&nRes, sizeof(int)); int i=0; bool found=false; while (ireadMatrixZoomData(fin, unit, resolution, myBlockBinCount, myBlockColumnCount); i++; } if (!found) { throw strawException("Error finding block data\nWas looking for " + unit + " at resolution " + to_string(resolution)); } } + // gets the blocks that need to be read for this slice of the data. needs blockbincount, blockcolumncount, and whether // or not this is intrachromosomal. -set getBlockNumbersForRegionFromBinPosition(int* regionIndices, int blockBinCount, int blockColumnCount, bool intra) { +set Straw::getBlockNumbersForRegionFromBinPosition(int* regionIndices, int blockBinCount, int blockColumnCount, bool intra) { int col1 = regionIndices[0] / blockBinCount; int col2 = (regionIndices[1] + 1) / blockBinCount; int row1 = regionIndices[2] / blockBinCount; int row2 = (regionIndices[3] + 1) / blockBinCount; set blocksSet; // first check the upper triangular matrix for (int r = row1; r <= row2; r++) { for (int c = col1; c <= col2; c++) { int blockNumber = r * blockColumnCount + c; blocksSet.insert(blockNumber); } } // check region part that overlaps with lower left triangle // but only if intrachromosomal if (intra) { for (int r = col1; r <= col2; r++) { for (int c = row1; c <= row2; c++) { int blockNumber = r * blockColumnCount + c; blocksSet.insert(blockNumber); } } } return blocksSet; } // this is the meat of reading the data. takes in the block number and returns the set of contact records corresponding to // that block. the block data is compressed and must be decompressed using the zlib library functions -vector readBlock(istream& fin, char *url, bool isHttp, int blockNumber) { +vector Straw::readBlock(istream& fin, char *url, bool isHttp, int blockNumber) { indexEntry idx = blockMap[blockNumber]; if (idx.size == 0) { vector v; return v; } char* compressedBytes = new char[idx.size]; char* uncompressedBytes = new char[idx.size*10]; //biggest seen so far is 3 if (isHttp) { compressedBytes = getHttpData(url, idx.position, idx.size); } else { fin.seekg(idx.position, ios::beg); fin.read(compressedBytes, idx.size); } @@ -495,31 +664,31 @@ infstream.avail_out = (uInt)idx.size*10; // size of output infstream.next_out = (Bytef *)uncompressedBytes; // output char array // the actual decompression work. inflateInit(&infstream); inflate(&infstream, Z_NO_FLUSH); inflateEnd(&infstream); int uncompressedSize=infstream.total_out; // create stream from buffer for ease of use membuf sbuf(uncompressedBytes, uncompressedBytes + uncompressedSize); istream bufferin(&sbuf); int nRecords; bufferin.read((char*)&nRecords, sizeof(int)); vector v(nRecords); // different versions have different specific formats - if (version < 7) { + if (this->version < 7) { for (int i = 0; i < nRecords; i++) { int binX, binY; bufferin.read((char*)&binX, sizeof(int)); bufferin.read((char*)&binY, sizeof(int)); float counts; bufferin.read((char*)&counts, sizeof(float)); contactRecord record; record.binX = binX; record.binY = binY; record.counts = counts; v[i] = record; } } else { int binXOffset, binYOffset; @@ -597,102 +766,109 @@ record.binY = bin2; record.counts = counts; v[index]=record; index++; } } } } } delete[] compressedBytes; delete[] uncompressedBytes; // don't forget to delete your heap arrays in C++! return v; } // reads the normalization vector from the file at the specified location -vector readNormalizationVector(istream& bufferin) { +vector Straw::readNormalizationVector(istream& bufferin) { int nValues; bufferin.read((char*)&nValues, sizeof(int)); vector values(nValues); // bool allNaN = true; for (int i = 0; i < nValues; i++) { double d; bufferin.read((char*)&d, sizeof(double)); values[i] = d; /* if (!Double.isNaN(values[i])) { allNaN = false; }*/ } // if (allNaN) return null; return values; } -void straw(string norm, string fname, int binsize, string chr1loc, string chr2loc, string unit, vector &xActual, vector &yActual, vector &counts) +int Straw::open(string fname) { + fileName = fname; + if (this->readMagicString()) { + return 1; + } + else { + fileName = ""; + throw strawException("Hi-C magic string is missing, does not appear to be a hic file"); + } + return 0; +} + +int Straw::fileIsOpen() { + return fileName != ""; +} + +void Straw::close() { + master = 0; + fileName = ""; + genome = ""; + haveReadFooter = false; + chrNames.clear(); + chrSizes.clear(); + attributes.clear(); + blockMap.clear(); +} + +void Straw::straw(string norm, int binsize, string chr1loc, string chr2loc, string unit, vector &xActual, vector &yActual, vector &counts) { int earlyexit=1; + + if (!fileIsOpen()) { + throw strawException("straw error - no open .hic file"); + } + if (!(norm=="NONE"||norm=="VC"||norm=="VC_SQRT"||norm=="KR")) { throw strawException("Invalid Norm value - must be one of "); } if (!(unit=="BP"||unit=="FRAG")) { throw strawException("Invalid Unit value - must be one of "); } // parse chromosome positions stringstream ss(chr1loc); string chr1, chr2, x, y; int c1pos1=-100, c1pos2=-100, c2pos1=-100, c2pos2=-100; getline(ss, chr1, ':'); if (getline(ss, x, ':') && getline(ss, y, ':')) { c1pos1 = stoi(x); c1pos2 = stoi(y); } stringstream ss1(chr2loc); getline(ss1, chr2, ':'); if (getline(ss1, x, ':') && getline(ss1, y, ':')) { c2pos1 = stoi(x); c2pos2 = stoi(y); } int chr1ind, chr2ind; - // HTTP code - string prefix="http"; - bool isHttp = false; - ifstream fin; - - // read header into buffer; 100K should be sufficient - char *url = NULL; - - long master; - if (std::strncmp(fname.c_str(), prefix.c_str(), prefix.size()) == 0) { - isHttp = true; - char * buffer; - url = (char*) fname.c_str(); - buffer = getHttpData(url, 0, 100000); - membuf sbuf(buffer, buffer + 100000); - istream bufin(&sbuf); - master = readHeader(bufin, chr1, chr2, c1pos1, c1pos2, c2pos1, c2pos2, chr1ind, chr2ind); - delete buffer; - } - else { - fin.open(fname, fstream::in); - if (!fin) { - cerr << "File " << fname << " cannot be opened for reading" << endl; - return; - } - master = readHeader(fin, chr1, chr2, c1pos1, c1pos2, c2pos1, c2pos2, chr1ind, chr2ind); - } + this->loadHeader(); + this->getChrInfo(chr1, chr2, c1pos1, c1pos2, c2pos1, c2pos2, chr1ind, chr2ind); // from header have size of chromosomes, set region to read int c1=min(chr1ind,chr2ind); int c2=max(chr1ind,chr2ind); int origRegionIndices[4]; // as given by user int regionIndices[4]; // used to find the blocks we need to access // reverse order if necessary if (chr1ind > chr2ind) { origRegionIndices[0] = c2pos1; origRegionIndices[1] = c2pos2; origRegionIndices[2] = c1pos1; origRegionIndices[3] = c1pos2; regionIndices[0] = c2pos1 / binsize; regionIndices[1] = c2pos2 / binsize; regionIndices[2] = c1pos1 / binsize; @@ -700,322 +876,111 @@ } else { origRegionIndices[0] = c1pos1; origRegionIndices[1] = c1pos2; origRegionIndices[2] = c2pos1; origRegionIndices[3] = c2pos2; regionIndices[0] = c1pos1 / binsize; regionIndices[1] = c1pos2 / binsize; regionIndices[2] = c2pos1 / binsize; regionIndices[3] = c2pos2 / binsize; } indexEntry c1NormEntry, c2NormEntry; long myFilePos; - unsigned long bytes_to_read = total_bytes - master; + ifstream fin; + char *url; + bool isHttp = false; + string prefix="http"; + if (std::strncmp(fileName.c_str(), prefix.c_str(), prefix.size()) == 0) { + isHttp = true; + } if (isHttp) { + unsigned long bytes_to_read = total_bytes - master; + url = (char*) fileName.c_str(); char* buffer2; buffer2 = getHttpData(url, master, bytes_to_read); membuf sbuf2(buffer2, buffer2 + bytes_to_read); istream bufin2(&sbuf2); - readFooter(bufin2, master, c1, c2, norm, unit, binsize, myFilePos, c1NormEntry, c2NormEntry); + this->readFooter(bufin2, master, c1, c2, norm, unit, binsize, myFilePos, c1NormEntry, c2NormEntry); delete buffer2; } else { + fin.open(fileName, fstream::in); fin.seekg(master, ios::beg); - readFooter(fin, master, c1, c2, norm, unit, binsize, myFilePos, c1NormEntry, c2NormEntry); + this->readFooter(fin, master, c1, c2, norm, unit, binsize, myFilePos, c1NormEntry, c2NormEntry); } // readFooter will assign the above variables - vector c1Norm; vector c2Norm; if (norm != "NONE") { char* buffer3; if (isHttp) { buffer3 = getHttpData(url, c1NormEntry.position, c1NormEntry.size); } else { buffer3 = new char[c1NormEntry.size]; fin.seekg(c1NormEntry.position, ios::beg); fin.read(buffer3, c1NormEntry.size); } membuf sbuf3(buffer3, buffer3 + c1NormEntry.size); istream bufferin(&sbuf3); - c1Norm = readNormalizationVector(bufferin); + c1Norm = this->readNormalizationVector(bufferin); char* buffer4; if (isHttp) { buffer4 = getHttpData(url, c2NormEntry.position, c2NormEntry.size); } else { buffer4 = new char[c2NormEntry.size]; fin.seekg(c2NormEntry.position, ios::beg); fin.read(buffer4, c2NormEntry.size); } membuf sbuf4(buffer4, buffer4 + c2NormEntry.size); istream bufferin2(&sbuf4); - c2Norm = readNormalizationVector(bufferin2); + c2Norm = this->readNormalizationVector(bufferin2); delete buffer3; delete buffer4; } int blockBinCount, blockColumnCount; if (isHttp) { // readMatrix will assign blockBinCount and blockColumnCount - readMatrixHttp(url, myFilePos, unit, binsize, blockBinCount, blockColumnCount); + this->readMatrixHttp(url, myFilePos, unit, binsize, blockBinCount, blockColumnCount); } else { // readMatrix will assign blockBinCount and blockColumnCount - readMatrix(fin, myFilePos, unit, binsize, blockBinCount, blockColumnCount); + this->readMatrix(fin, myFilePos, unit, binsize, blockBinCount, blockColumnCount); } - set blockNumbers = getBlockNumbersForRegionFromBinPosition(regionIndices, blockBinCount, blockColumnCount, c1==c2); + set blockNumbers = this->getBlockNumbersForRegionFromBinPosition(regionIndices, blockBinCount, blockColumnCount, c1==c2); // getBlockIndices vector records; for (set::iterator it=blockNumbers.begin(); it!=blockNumbers.end(); ++it) { // get contacts in this block - records = readBlock(fin, url, isHttp, *it); + records = this->readBlock(fin, url, isHttp, *it); for (vector::iterator it2=records.begin(); it2!=records.end(); ++it2) { contactRecord rec = *it2; int x = rec.binX * binsize; int y = rec.binY * binsize; float c = rec.counts; if (norm != "NONE") { c = c / (c1Norm[rec.binX] * c2Norm[rec.binY]); } if ((x >= origRegionIndices[0] && x <= origRegionIndices[1] && y >= origRegionIndices[2] && y <= origRegionIndices[3]) || // or check regions that overlap with lower left ((c1==c2) && y >= origRegionIndices[0] && y <= origRegionIndices[1] && x >= origRegionIndices[2] && x <= origRegionIndices[3])) { xActual.push_back(x); yActual.push_back(y); counts.push_back(c); //printf("%d\t%d\t%.14g\n", x, y, c); } } } } - - -void parseHeaderFields(istream& fin, string &genome, vector &chromNames, vector &chromSizes, vector &bpResolutions, vector &fragResolutions, vector &attributes) -/* Parse out header fields from the supplied data buffer, assumed to come from the beginning of - * a .hic file. */ -{ - if (!readMagicString(fin)) { - throw strawException("Hi-C magic string is missing, does not appear to be a hic file"); - } - - fin.read((char*)&version, sizeof(int)); - if (version < 6) { - throw strawException("Version " + to_string(version) + " no longer supported"); - } - long master; - fin.read((char*)&master, sizeof(long)); - getline(fin, genome, '\0' ); - int nattributes; - fin.read((char*)&nattributes, sizeof(int)); - - // read in attribute-value dictionary - for (int i=0; i &chromNames, vector &chromSizes, vector &bpResolutions, vector &fragResolutions, vector &attributes) -/* Retrieve .hic header fields from the supplied filename and return them in the supplied variables. */ -{ - // HTTP code - string prefix="http"; - bool isHttp = false; - - // read header into buffer; 100K should be sufficient - char * buffer = NULL; - char *url = NULL; - - if (std::strncmp(fname.c_str(), prefix.c_str(), prefix.size()) == 0) { - isHttp = true; - url = (char*) fname.c_str(); - buffer = getHttpData(url, 0, 100000); - membuf sbuf(buffer, buffer + 100000); - istream bufin(&sbuf); - parseHeaderFields(bufin, genome, chromNames, chromSizes, bpResolutions, fragResolutions, attributes); - delete buffer; - } - else { - fstream fin; - fin.open(fname, fstream::in); - if (!fin) { - throw strawException("File " + fname + " cannot be opened for reading"); - } - parseHeaderFields(fin, genome, chromNames, chromSizes, bpResolutions, fragResolutions, attributes); - fin.close(); - } -} - - - -extern "C" char *Cstraw (char *norm, char *fname, int binsize, char *chr1loc, char *chr2loc, char *unit, int **xActual, int **yActual, double **counts, int *numRecords) -/* Wrapper function to retrieve a data chunk from a .hic file, for use by C libraries. - * norm is one of NONE/VC/VC_SQRT/KR. - * binsize is one of the supported bin sizes as determined by CstrawHeader. - * chr1loc and chr2loc are the two positions to retrieve interaction data for, specified as chr:start-end. - * unit is one of BP/FRAG. - * Values are returned in newly allocated arrays in xActual, yActual, and counts, with the number of - * returned records in numRecords. - * The function returns NULL unless an error was encountered, in which case the return value points - * to a character string explaining the error. */ -{ - string thisnorm(norm); - string thisfname(fname); - string thischr1loc(chr1loc); - string thischr2loc(chr2loc); - string thisunit(unit); - vector thisx; - vector thisy; - vector thiscounts; - try { - straw(thisnorm, thisfname, binsize, thischr1loc, thischr2loc, unit, thisx, thisy, thiscounts); - } catch (strawException& err) { - char *errMsg = (char*) calloc((size_t) strlen(err.what())+1, sizeof(char)); - strcpy(errMsg, err.what()); - return errMsg; - } - *numRecords = thisx.size(); - *xActual = (int*) calloc((size_t) *numRecords, sizeof(int)); - *yActual = (int*) calloc((size_t) *numRecords, sizeof(int)); - *counts = (double*) calloc((size_t) *numRecords, sizeof(double)); - for (int i=0; i<*numRecords; i++) - { - (*xActual)[i] = thisx[i]; - (*yActual)[i] = thisy[i]; - (*counts)[i] = (double) thiscounts[i]; - } - return NULL; -} - - -extern "C" char *CstrawHeader (char *filename, char **genome, char ***chromNames, int **chromSizes, int *nChroms, char ***bpResolutions, int *nBpRes, char ***fragResolutions, int *nFragRes, char ***attributes, int *nAttributes) -/* Wrapper function to retrieve header fields from a .hic file, for use by C libraries. - * This retrieves the assembly name, list of chromosome names, list of available binsize resolutions, - * and list of available fragment resolutions in the specific .hic file. - * The function returns NULL unless an error was encountered, in which case the return value points - * to a character string explaining the error. */ -{ - string filenameString(filename); - string genomeString; - vector chromNameVector; - vector chromSizeVector; - vector bpResolutionVector; - vector fragResolutionVector; - vector attributeVector; - try { - getHeaderFields(filenameString, genomeString, chromNameVector, chromSizeVector, bpResolutionVector, fragResolutionVector, attributeVector); - } catch (strawException& err) { - char *errMsg = (char*) calloc((size_t) strlen(err.what())+1, sizeof(char)); - strcpy(errMsg, err.what()); - return errMsg; - } - if (genome != NULL) - { - *genome = (char*) malloc((genomeString.length()+1)*sizeof(char)); - strcpy(*genome, genomeString.c_str()); - } - if (nChroms != NULL) - { - *nChroms = chromNameVector.size(); - } - if (chromNames != NULL) - { - *chromNames = (char**) calloc((size_t) chromNameVector.size(), sizeof(char*)); - for (int i=0; i