60aca91bcce6d4fa555e6c7c91d8ff8aa9e7bd2b jcasper Fri Jun 11 15:17:21 2021 -0700 Updating hic support for files with large headers (over 100kb) and improving multi-region performance, refs #18842, #27593 diff --git src/hg/lib/straw/cStraw.cpp src/hg/lib/straw/cStraw.cpp new file mode 100644 index 0000000..b284723 --- /dev/null +++ src/hg/lib/straw/cStraw.cpp @@ -0,0 +1,169 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zlib.h" +#include "straw.h" +using namespace std; + +// Supplementary functions for invoking Straw in C + + +void getHeaderFields(Straw *hicFile, string &genome, vector &chromNames, vector &chromSizes, vector &bpResolutions, vector &fragResolutions, vector &attributes) +/* Retrieve .hic header fields from the supplied filename and return them in the supplied variables. */ +{ + hicFile->loadHeader(); + genome = hicFile->genome; + chromNames = hicFile->chrNames; + chromSizes = hicFile->chrSizes; + bpResolutions = hicFile->bpResolutions; + fragResolutions = hicFile->fragResolutions; + map::iterator loop; + for (loop=hicFile->attributes.begin(); loop!=hicFile->attributes.end(); loop++) { + attributes.insert(attributes.end(), loop->first); + attributes.insert(attributes.end(), loop->second); + } +} + + +extern "C" Straw *cStrawOpen(char *fname) { + return new Straw(fname); +} + +extern "C" void cStrawClose(Straw **hicFile) { + delete *hicFile; + *hicFile = NULL; +} + +extern "C" char *cStraw (Straw *hicFile, char *norm, int binsize, char *chr1loc, char *chr2loc, char *unit, int **xActual, int **yActual, double **counts, int *numRecords) +/* Wrapper function to retrieve a data chunk from a .hic file, for use by C libraries. + * norm is one of NONE/VC/VC_SQRT/KR. + * binsize is one of the supported bin sizes as determined by cStrawHeader. + * chr1loc and chr2loc are the two positions to retrieve interaction data for, specified as chr:start-end. + * unit is one of BP/FRAG. + * Values are returned in newly allocated arrays in xActual, yActual, and counts, with the number of + * returned records in numRecords. + * The function returns NULL unless an error was encountered, in which case the return value points + * to a character string explaining the error. */ +{ + string thisnorm(norm); + string thischr1loc(chr1loc); + string thischr2loc(chr2loc); + string thisunit(unit); + vector thisx; + vector thisy; + vector thiscounts; + try { + hicFile->straw(thisnorm, binsize, thischr1loc, thischr2loc, unit, thisx, thisy, thiscounts); + } catch (strawException& err) { + char *errMsg = (char*) calloc((size_t) strlen(err.what())+1, sizeof(char)); + strcpy(errMsg, err.what()); + return errMsg; + } + *numRecords = thisx.size(); + *xActual = (int*) calloc((size_t) *numRecords, sizeof(int)); + *yActual = (int*) calloc((size_t) *numRecords, sizeof(int)); + *counts = (double*) calloc((size_t) *numRecords, sizeof(double)); + for (int i=0; i<*numRecords; i++) + { + (*xActual)[i] = thisx[i]; + (*yActual)[i] = thisy[i]; + (*counts)[i] = (double) thiscounts[i]; + } + return NULL; +} + + +extern "C" char *cStrawHeader (Straw *hicFile, char **genome, char ***chromNames, int **chromSizes, int *nChroms, char ***bpResolutions, int *nBpRes, char ***fragResolutions, int *nFragRes, char ***attributes, int *nAttributes) +/* Wrapper function to retrieve header fields from a .hic file, for use by C libraries. + * This retrieves the assembly name, list of chromosome names, list of available binsize resolutions, + * and list of available fragment resolutions in the specific .hic file. + * The function returns NULL unless an error was encountered, in which case the return value points + * to a character string explaining the error. */ +{ + string genomeString; + vector chromNameVector; + vector chromSizeVector; + vector bpResolutionVector; + vector fragResolutionVector; + vector attributeVector; + try { + getHeaderFields(hicFile, genomeString, chromNameVector, chromSizeVector, bpResolutionVector, fragResolutionVector, attributeVector); + } catch (strawException& err) { + char *errMsg = (char*) calloc((size_t) strlen(err.what())+1, sizeof(char)); + strcpy(errMsg, err.what()); + return errMsg; + } + if (genome != NULL) + { + *genome = (char*) malloc((genomeString.length()+1)*sizeof(char)); + strcpy(*genome, genomeString.c_str()); + } + if (nChroms != NULL) + { + *nChroms = chromNameVector.size(); + } + if (chromNames != NULL) + { + *chromNames = (char**) calloc((size_t) chromNameVector.size(), sizeof(char*)); + for (int i=0; i