f863c012ea39c498eaf4cff84f85f92fef210c5f jcasper Mon May 13 00:51:57 2019 -0700 Initial commit of Hi-C display support via .hic files, refs #18842 diff --git src/hg/lib/straw/straw.cpp src/hg/lib/straw/straw.cpp new file mode 100644 index 0000000..bf58d41 --- /dev/null +++ src/hg/lib/straw/straw.cpp @@ -0,0 +1,994 @@ +/* + The straw library (straw.cpp, straw.h) is used with permission of the Aiden + Lab at the Broad Institute. It is included below, along with modifications + to work in the environment of the UCSC Genome Browser, under the following + license: + + The MIT License (MIT) + + Copyright (c) 2011-2016 Broad Institute, Aiden Lab + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include +#include "zlib.h" +#include "straw.h" +#include "udcWrapper.h" +using namespace std; + +/* Simple exception wrapper class */ +class strawException: public std::runtime_error { + public: + strawException(const std::string& error): + std::runtime_error(error) { + } +}; + + +/* + Straw: fast C++ implementation of dump. Not as fully featured as the + Java version. Reads the .hic file, finds the appropriate matrix and slice + of data, and outputs as text in sparse upper triangular format. + + Currently only supporting matrices. + + Usage: straw [:x1:x2] [:y1:y2] + */ +// this is for creating a stream from a byte array for ease of use +struct membuf : std::streambuf +{ + membuf(char* begin, char* end) { + this->setg(begin, begin, end); + } +}; + +// version number +int version; + +// map of block numbers to pointers +map blockMap; + +long total_bytes; + + +// get a buffer that can be used as an input stream from the URL +char* getHttpData(char *url, long position, long chunksize) { + char *buffer = (char*) malloc(chunksize); + if (buffer == NULL) { + throw strawException("Out of memory (malloc returned NULL)"); + } + struct udcFile *udc = udcFileMayOpen(url, NULL); + if (udc == NULL) + { + throw strawException("Error: unable to open udcFile for resource " + string(url)); + } + total_bytes = udcFileSize(url); + udcSeek(udc, position); + udcRead(udc, (void*)buffer, chunksize); + udcFileClose(&udc); + return buffer; +} + + +// returns whether or not this is valid HiC file +bool readMagicString(istream& fin) { + string str; + getline(fin, str, '\0' ); + return str[0]=='H' && str[1]=='I' && str[2]=='C'; +} + +// reads the header, storing the positions of the normalization vectors and returning the master pointer +long readHeader(istream& fin, string chr1, string chr2, int &c1pos1, int &c1pos2, int &c2pos1, int &c2pos2, int &chr1ind, int &chr2ind) { + if (!readMagicString(fin)) { + throw strawException("Hi-C magic string is missing, does not appear to be a hic file"); + } + + fin.read((char*)&version, sizeof(int)); + if (version < 6) { + throw strawException("Version " + to_string(version) + " no longer supported"); + } + long master; + fin.read((char*)&master, sizeof(long)); + string genome; + getline(fin, genome, '\0' ); + int nattributes; + fin.read((char*)&nattributes, sizeof(int)); + + // reading and ignoring attribute-value dictionary + for (int i=0; i"); + } + buffer = getHttpData(url, myFilePosition, header_size); + membuf sbuf(buffer, buffer + header_size); + istream fin(&sbuf); + + string unit; + getline(fin, unit, '\0' ); // unit + int tmp; + fin.read((char*)&tmp, sizeof(int)); // Old "zoom" index -- not used + float tmp2; + fin.read((char*)&tmp2, sizeof(float)); // sumCounts + fin.read((char*)&tmp2, sizeof(float)); // occupiedCellCount + fin.read((char*)&tmp2, sizeof(float)); // stdDev + fin.read((char*)&tmp2, sizeof(float)); // percent95 + int binSize; + fin.read((char*)&binSize, sizeof(int)); + int blockBinCount; + fin.read((char*)&blockBinCount, sizeof(int)); + int blockColumnCount; + fin.read((char*)&blockColumnCount, sizeof(int)); + + bool storeBlockData = false; + if (myunit==unit && mybinsize==binSize) { + myBlockBinCount = blockBinCount; + myBlockColumnCount = blockColumnCount; + storeBlockData = true; + } + + int nBlocks; + fin.read((char*)&nBlocks, sizeof(int)); + + if (storeBlockData) { + buffer = getHttpData(url, myFilePosition+header_size, nBlocks*(sizeof(int)+sizeof(long)+sizeof(int))); + membuf sbuf2(buffer, buffer + nBlocks*(sizeof(int)+sizeof(long)+sizeof(int))); + istream fin2(&sbuf2); + for (int b = 0; b < nBlocks; b++) { + int blockNumber; + fin2.read((char*)&blockNumber, sizeof(int)); + long filePosition; + fin2.read((char*)&filePosition, sizeof(long)); + int blockSizeInBytes; + fin2.read((char*)&blockSizeInBytes, sizeof(int)); + indexEntry entry; + entry.size = blockSizeInBytes; + entry.position = filePosition; + blockMap[blockNumber] = entry; + } + } + else { + myFilePosition = myFilePosition+header_size+(nBlocks*(sizeof(int)+sizeof(long)+sizeof(int))); + } + delete buffer; + return storeBlockData; +} + +// goes to the specified file pointer in http and finds the raw contact matrix at specified resolution, calling readMatrixZoomData. +// sets blockbincount and blockcolumncount +void readMatrixHttp(char *url, long myFilePosition, string unit, int resolution, int &myBlockBinCount, int &myBlockColumnCount) { + char * buffer; + int size = sizeof(int)*3; + buffer = getHttpData(url, myFilePosition, size); + membuf sbuf(buffer, buffer + size); + istream bufin(&sbuf); + + int c1,c2; + bufin.read((char*)&c1, sizeof(int)); //chr1 + bufin.read((char*)&c2, sizeof(int)); //chr2 + int nRes; + bufin.read((char*)&nRes, sizeof(int)); + int i=0; + bool found=false; + myFilePosition=myFilePosition+size; + delete buffer; + + while (i getBlockNumbersForRegionFromBinPosition(int* regionIndices, int blockBinCount, int blockColumnCount, bool intra) { + int col1 = regionIndices[0] / blockBinCount; + int col2 = (regionIndices[1] + 1) / blockBinCount; + int row1 = regionIndices[2] / blockBinCount; + int row2 = (regionIndices[3] + 1) / blockBinCount; + + set blocksSet; + // first check the upper triangular matrix + for (int r = row1; r <= row2; r++) { + for (int c = col1; c <= col2; c++) { + int blockNumber = r * blockColumnCount + c; + blocksSet.insert(blockNumber); + } + } + // check region part that overlaps with lower left triangle + // but only if intrachromosomal + if (intra) { + for (int r = col1; r <= col2; r++) { + for (int c = row1; c <= row2; c++) { + int blockNumber = r * blockColumnCount + c; + blocksSet.insert(blockNumber); + } + } + } + + return blocksSet; +} + +// this is the meat of reading the data. takes in the block number and returns the set of contact records corresponding to +// that block. the block data is compressed and must be decompressed using the zlib library functions +vector readBlock(istream& fin, char *url, bool isHttp, int blockNumber) { + indexEntry idx = blockMap[blockNumber]; + if (idx.size == 0) { + vector v; + return v; + } + char* compressedBytes = new char[idx.size]; + char* uncompressedBytes = new char[idx.size*10]; //biggest seen so far is 3 + + if (isHttp) { + compressedBytes = getHttpData(url, idx.position, idx.size); + } + else { + fin.seekg(idx.position, ios::beg); + fin.read(compressedBytes, idx.size); + } + // Decompress the block + // zlib struct + z_stream infstream; + infstream.zalloc = Z_NULL; + infstream.zfree = Z_NULL; + infstream.opaque = Z_NULL; + infstream.avail_in = (uInt)(idx.size); // size of input + infstream.next_in = (Bytef *)compressedBytes; // input char array + infstream.avail_out = (uInt)idx.size*10; // size of output + infstream.next_out = (Bytef *)uncompressedBytes; // output char array + // the actual decompression work. + inflateInit(&infstream); + inflate(&infstream, Z_NO_FLUSH); + inflateEnd(&infstream); + int uncompressedSize=infstream.total_out; + + // create stream from buffer for ease of use + membuf sbuf(uncompressedBytes, uncompressedBytes + uncompressedSize); + istream bufferin(&sbuf); + int nRecords; + bufferin.read((char*)&nRecords, sizeof(int)); + vector v(nRecords); + // different versions have different specific formats + if (version < 7) { + for (int i = 0; i < nRecords; i++) { + int binX, binY; + bufferin.read((char*)&binX, sizeof(int)); + bufferin.read((char*)&binY, sizeof(int)); + float counts; + bufferin.read((char*)&counts, sizeof(float)); + contactRecord record; + record.binX = binX; + record.binY = binY; + record.counts = counts; + v[i] = record; + } + } + else { + int binXOffset, binYOffset; + bufferin.read((char*)&binXOffset, sizeof(int)); + bufferin.read((char*)&binYOffset, sizeof(int)); + char useShort; + bufferin.read((char*)&useShort, sizeof(char)); + char type; + bufferin.read((char*)&type, sizeof(char)); + int index=0; + if (type == 1) { + // List-of-rows representation + short rowCount; + bufferin.read((char*)&rowCount, sizeof(short)); + for (int i = 0; i < rowCount; i++) { + short y; + bufferin.read((char*)&y, sizeof(short)); + int binY = y + binYOffset; + short colCount; + bufferin.read((char*)&colCount, sizeof(short)); + for (int j = 0; j < colCount; j++) { + short x; + bufferin.read((char*)&x, sizeof(short)); + int binX = binXOffset + x; + float counts; + if (useShort == 0) { // yes this is opposite of usual + short c; + bufferin.read((char*)&c, sizeof(short)); + counts = c; + } + else { + bufferin.read((char*)&counts, sizeof(float)); + } + contactRecord record; + record.binX = binX; + record.binY = binY; + record.counts = counts; + v[index]=record; + index++; + } + } + } + else if (type == 2) { // have yet to find test file where this is true, possibly entirely deprecated + int nPts; + bufferin.read((char*)&nPts, sizeof(int)); + short w; + bufferin.read((char*)&w, sizeof(short)); + + for (int i = 0; i < nPts; i++) { + //int idx = (p.y - binOffset2) * w + (p.x - binOffset1); + int row = i / w; + int col = i - row * w; + int bin1 = binXOffset + col; + int bin2 = binYOffset + row; + + float counts; + if (useShort == 0) { // yes this is opposite of the usual + short c; + bufferin.read((char*)&c, sizeof(short)); + if (c != -32768) { + contactRecord record; + record.binX = bin1; + record.binY = bin2; + record.counts = c; + v[index]=record; + index++; + } + } + else { + bufferin.read((char*)&counts, sizeof(float)); + if (counts != 0x7fc00000) { // not sure this works + // if (!Float.isNaN(counts)) { + contactRecord record; + record.binX = bin1; + record.binY = bin2; + record.counts = counts; + v[index]=record; + index++; + } + } + } + } + } + delete[] compressedBytes; + delete[] uncompressedBytes; // don't forget to delete your heap arrays in C++! + return v; +} + +// reads the normalization vector from the file at the specified location +vector readNormalizationVector(istream& bufferin) { + int nValues; + bufferin.read((char*)&nValues, sizeof(int)); + vector values(nValues); + // bool allNaN = true; + + for (int i = 0; i < nValues; i++) { + double d; + bufferin.read((char*)&d, sizeof(double)); + values[i] = d; + /* if (!Double.isNaN(values[i])) { + allNaN = false; + }*/ + } + // if (allNaN) return null; + return values; +} + +void straw(string norm, string fname, int binsize, string chr1loc, string chr2loc, string unit, vector &xActual, vector &yActual, vector &counts) +{ + int earlyexit=1; + if (!(norm=="NONE"||norm=="VC"||norm=="VC_SQRT"||norm=="KR")) { + throw strawException("Invalid Norm value - must be one of "); + } + if (!(unit=="BP"||unit=="FRAG")) { + throw strawException("Invalid Unit value - must be one of "); + } + + // parse chromosome positions + stringstream ss(chr1loc); + string chr1, chr2, x, y; + int c1pos1=-100, c1pos2=-100, c2pos1=-100, c2pos2=-100; + getline(ss, chr1, ':'); + if (getline(ss, x, ':') && getline(ss, y, ':')) { + c1pos1 = stoi(x); + c1pos2 = stoi(y); + } + stringstream ss1(chr2loc); + getline(ss1, chr2, ':'); + if (getline(ss1, x, ':') && getline(ss1, y, ':')) { + c2pos1 = stoi(x); + c2pos2 = stoi(y); + } + int chr1ind, chr2ind; + + // HTTP code + string prefix="http"; + bool isHttp = false; + ifstream fin; + + // read header into buffer; 100K should be sufficient + char *url = NULL; + + long master; + if (std::strncmp(fname.c_str(), prefix.c_str(), prefix.size()) == 0) { + isHttp = true; + char * buffer; + url = (char*) fname.c_str(); + buffer = getHttpData(url, 0, 100000); + membuf sbuf(buffer, buffer + 100000); + istream bufin(&sbuf); + master = readHeader(bufin, chr1, chr2, c1pos1, c1pos2, c2pos1, c2pos2, chr1ind, chr2ind); + delete buffer; + } + else { + fin.open(fname, fstream::in); + if (!fin) { + cerr << "File " << fname << " cannot be opened for reading" << endl; + return; + } + master = readHeader(fin, chr1, chr2, c1pos1, c1pos2, c2pos1, c2pos2, chr1ind, chr2ind); + } + + // from header have size of chromosomes, set region to read + int c1=min(chr1ind,chr2ind); + int c2=max(chr1ind,chr2ind); + int origRegionIndices[4]; // as given by user + int regionIndices[4]; // used to find the blocks we need to access + // reverse order if necessary + if (chr1ind > chr2ind) { + origRegionIndices[0] = c2pos1; + origRegionIndices[1] = c2pos2; + origRegionIndices[2] = c1pos1; + origRegionIndices[3] = c1pos2; + regionIndices[0] = c2pos1 / binsize; + regionIndices[1] = c2pos2 / binsize; + regionIndices[2] = c1pos1 / binsize; + regionIndices[3] = c1pos2 / binsize; + } + else { + origRegionIndices[0] = c1pos1; + origRegionIndices[1] = c1pos2; + origRegionIndices[2] = c2pos1; + origRegionIndices[3] = c2pos2; + regionIndices[0] = c1pos1 / binsize; + regionIndices[1] = c1pos2 / binsize; + regionIndices[2] = c2pos1 / binsize; + regionIndices[3] = c2pos2 / binsize; + } + + indexEntry c1NormEntry, c2NormEntry; + long myFilePos; + + unsigned long bytes_to_read = total_bytes - master; + + if (isHttp) { + char* buffer2; + buffer2 = getHttpData(url, master, bytes_to_read); + membuf sbuf2(buffer2, buffer2 + bytes_to_read); + istream bufin2(&sbuf2); + readFooter(bufin2, master, c1, c2, norm, unit, binsize, myFilePos, c1NormEntry, c2NormEntry); + delete buffer2; + } + else { + fin.seekg(master, ios::beg); + readFooter(fin, master, c1, c2, norm, unit, binsize, myFilePos, c1NormEntry, c2NormEntry); + } + // readFooter will assign the above variables + + + vector c1Norm; + vector c2Norm; + + if (norm != "NONE") { + char* buffer3; + if (isHttp) { + buffer3 = getHttpData(url, c1NormEntry.position, c1NormEntry.size); + } + else { + buffer3 = new char[c1NormEntry.size]; + fin.seekg(c1NormEntry.position, ios::beg); + fin.read(buffer3, c1NormEntry.size); + } + membuf sbuf3(buffer3, buffer3 + c1NormEntry.size); + istream bufferin(&sbuf3); + c1Norm = readNormalizationVector(bufferin); + + char* buffer4; + if (isHttp) { + buffer4 = getHttpData(url, c2NormEntry.position, c2NormEntry.size); + } + else { + buffer4 = new char[c2NormEntry.size]; + fin.seekg(c2NormEntry.position, ios::beg); + fin.read(buffer4, c2NormEntry.size); + } + membuf sbuf4(buffer4, buffer4 + c2NormEntry.size); + istream bufferin2(&sbuf4); + c2Norm = readNormalizationVector(bufferin2); + delete buffer3; + delete buffer4; + } + + int blockBinCount, blockColumnCount; + if (isHttp) { + // readMatrix will assign blockBinCount and blockColumnCount + readMatrixHttp(url, myFilePos, unit, binsize, blockBinCount, blockColumnCount); + } + else { + // readMatrix will assign blockBinCount and blockColumnCount + readMatrix(fin, myFilePos, unit, binsize, blockBinCount, blockColumnCount); + } + set blockNumbers = getBlockNumbersForRegionFromBinPosition(regionIndices, blockBinCount, blockColumnCount, c1==c2); + + // getBlockIndices + vector records; + for (set::iterator it=blockNumbers.begin(); it!=blockNumbers.end(); ++it) { + // get contacts in this block + records = readBlock(fin, url, isHttp, *it); + for (vector::iterator it2=records.begin(); it2!=records.end(); ++it2) { + contactRecord rec = *it2; + + int x = rec.binX * binsize; + int y = rec.binY * binsize; + float c = rec.counts; + if (norm != "NONE") { + c = c / (c1Norm[rec.binX] * c2Norm[rec.binY]); + } + + if ((x >= origRegionIndices[0] && x <= origRegionIndices[1] && + y >= origRegionIndices[2] && y <= origRegionIndices[3]) || + // or check regions that overlap with lower left + ((c1==c2) && y >= origRegionIndices[0] && y <= origRegionIndices[1] && x >= origRegionIndices[2] && x <= origRegionIndices[3])) { + xActual.push_back(x); + yActual.push_back(y); + counts.push_back(c); + //printf("%d\t%d\t%.14g\n", x, y, c); + } + } + } +} + + +void parseHeaderFields(istream& fin, string &genome, vector &chromNames, vector &bpResolutions, vector &fragResolutions) +/* Parse out header fields from the supplied data buffer, assumed to come from the beginning of + * a .hic file. */ +{ + if (!readMagicString(fin)) { + throw strawException("Hi-C magic string is missing, does not appear to be a hic file"); + } + + fin.read((char*)&version, sizeof(int)); + if (version < 6) { + throw strawException("Version " + to_string(version) + " no longer supported"); + } + long master; + fin.read((char*)&master, sizeof(long)); + getline(fin, genome, '\0' ); + int nattributes; + fin.read((char*)&nattributes, sizeof(int)); + + // reading and ignoring attribute-value dictionary + // Should expand this to save these to another structure + for (int i=0; i &chromNames, vector &bpResolutions, vector &fragResolutions) +/* Retrieve .hic header fields from the supplied filename and return them in the supplied variables. */ +{ + // HTTP code + string prefix="http"; + bool isHttp = false; + + // read header into buffer; 100K should be sufficient + char * buffer = NULL; + char *url = NULL; + + if (std::strncmp(fname.c_str(), prefix.c_str(), prefix.size()) == 0) { + isHttp = true; + url = (char*) fname.c_str(); + buffer = getHttpData(url, 0, 100000); + membuf sbuf(buffer, buffer + 100000); + istream bufin(&sbuf); + parseHeaderFields(bufin, genome, chromNames, bpResolutions, fragResolutions); + delete buffer; + } + else { + fstream fin; + fin.open(fname, fstream::in); + if (!fin) { + throw strawException("File " + fname + " cannot be opened for reading"); + } + parseHeaderFields(fin, genome, chromNames, bpResolutions, fragResolutions); + fin.close(); + } +} + + + +extern "C" char *Cstraw (char *norm, char *fname, int binsize, char *chr1loc, char *chr2loc, char *unit, int **xActual, int **yActual, double **counts, int *numRecords) +/* Wrapper function to retrieve a data chunk from a .hic file, for use by C libraries. + * norm is one of NONE/VC/VC_SQRT/KR. + * binsize is one of the supported bin sizes as determined by CstrawHeader. + * chr1loc and chr2loc are the two positions to retrieve interaction data for, specified as chr:start-end. + * unit is one of BP/FRAG. + * Values are returned in newly allocated arrays in xActual, yActual, and counts, with the number of + * returned records in numRecords. + * The function returns NULL unless an error was encountered, in which case the return value points + * to a character string explaining the error. */ +{ + string thisnorm(norm); + string thisfname(fname); + string thischr1loc(chr1loc); + string thischr2loc(chr2loc); + string thisunit(unit); + vector thisx; + vector thisy; + vector thiscounts; + try { + straw(thisnorm, thisfname, binsize, thischr1loc, thischr2loc, unit, thisx, thisy, thiscounts); + } catch (strawException& err) { + char *errMsg = (char*) calloc((size_t) strlen(err.what())+1, sizeof(char)); + strcpy(errMsg, err.what()); + return errMsg; + } + *numRecords = thisx.size(); + *xActual = (int*) calloc((size_t) *numRecords, sizeof(int)); + *yActual = (int*) calloc((size_t) *numRecords, sizeof(int)); + *counts = (double*) calloc((size_t) *numRecords, sizeof(double)); + for (int i=0; i<*numRecords; i++) + { + (*xActual)[i] = thisx[i]; + (*yActual)[i] = thisy[i]; + (*counts)[i] = (double) thiscounts[i]; + } + return NULL; +} + + +extern "C" char *CstrawHeader (char *filename, char **genome, char ***chromNames, int *nChroms, char ***bpResolutions, int *nBpRes, char ***fragResolutions, int *nFragRes) +/* Wrapper function to retrieve header fields from a .hic file, for use by C libraries. + * This retrieves the assembly name, list of chromosome names, list of available binsize resolutions, + * and list of available fragment resolutions in the specific .hic file. + * The function returns NULL unless an error was encountered, in which case the return value points + * to a character string explaining the error. */ +{ + string filenameString(filename); + string genomeString; + vector chromNameVector; + vector bpResolutionVector; + vector fragResolutionVector; + try { + getHeaderFields(filenameString, genomeString, chromNameVector, bpResolutionVector, fragResolutionVector); + } catch (strawException& err) { + char *errMsg = (char*) calloc((size_t) strlen(err.what())+1, sizeof(char)); + strcpy(errMsg, err.what()); + return errMsg; + } + if (genome != NULL) + { + *genome = (char*) malloc((genomeString.length()+1)*sizeof(char)); + strcpy(*genome, genomeString.c_str()); + } + if (nChroms != NULL) + { + *nChroms = chromNameVector.size(); + } + if (chromNames != NULL) + { + *chromNames = (char**) calloc((size_t) chromNameVector.size(), sizeof(char*)); + for (int i=0; i