839d2067c1871e5ffa53842f1b9c04806cf14f97 jcasper Thu Jul 25 11:51:10 2024 -0700 The new straw library should recognize when it's doing a bad data retrieval. Also added a rudimentary way to retrieve available normalization options, refs #33225 diff --git src/hg/lib/straw/new_cStraw.cpp src/hg/lib/straw/new_cStraw.cpp index 2242452..019c6d4 100644 --- src/hg/lib/straw/new_cStraw.cpp +++ src/hg/lib/straw/new_cStraw.cpp @@ -14,83 +14,117 @@ // Supplementary functions for invoking Straw in C struct Straw { string *fileName; string *genome; vector<string> chromNames; vector<int> chromSizes; int nChroms; vector<int> bpResolutions; int nBpRes; vector<int> fragResolutions; int nFragRes; vector<string> attributes; int nAttributes; + set<string> normOptions; }; extern "C" char *cStrawOpen(char *fname, Straw **p) /* Create a Straw object based on the hic file at the provided path and set *p to point to it. * On error, set *p = NULL and return a non-null string describing the error. */ { *p = (Straw*) calloc (1, sizeof(struct Straw)); (*p)->fileName = new string(fname); (*p)->genome = new string(); - string genome; try { - getHeaderFields(fname, genome, (*p)->chromNames, (*p)->chromSizes, (*p)->bpResolutions, + getHeaderFields(fname, *((*p)->genome), (*p)->chromNames, (*p)->chromSizes, (*p)->bpResolutions, (*p)->fragResolutions, (*p)->attributes); } catch (strawException& err) { char *errMsg = (char*) calloc((size_t) strlen(err.what())+1, sizeof(char)); strcpy(errMsg, err.what()); free(*p); *p = NULL; return errMsg; } - (*p)->genome->assign(genome); (*p)->nChroms = (*p)->chromNames.size(); (*p)->nBpRes = (*p)->bpResolutions.size(); (*p)->nFragRes = (*p)->fragResolutions.size(); (*p)->nAttributes = (*p)->attributes.size(); - if ((*p)->nBpRes > 0) - return NULL; + + // I seem to recall situations where getHeaderFields doesn't throw an error, but the data are + // bad regardless (e.g. when the structure of a header changed between .hic versions). This + // should help catch those. + if ((*p)->nBpRes == 0) + { char *errString = (char*) malloc (strlen("Unable to retrieve header data from file") + 1); strcpy(errString, "Unable to retrieve header data from file"); return errString; } + // Time to get the normalization options. As a hack, we get these by making a dummy data request, + // having that set up a global variable with the options seen, then retrieving that. This will + // miss situations where different normalization options are available at different resolutions. + // That is a thing that can happen, but I haven't tested the performace hit of running through every + // available resolution for the possible options just yet (let alone restructuring the browser side + // of the code to support that kind of dependency). In the interim, this method is still better + // than the previous strategy of hard-coding in options that sometimes aren't available (and missing + // ones that are). + + vector<contactRecord> strawRecords; + try { + // using chrom[1] because [0] is always "All", which doesn't seem to play well because it + // may have a different set of resolution options + string chrPos = (*p)->chromNames[1] + ":1:1"; + // Intentionally feed straw an empty normalization option. This will cause an error (which we trap), + // but it's the easiest way to make the library load and compare the available options (the library + // short-circuits out early if "NONE" is provided). + strawRecords = straw("observed", "", string(fname), + chrPos, chrPos, "BP", (*p)->bpResolutions[0]); + } catch (strawException& err) { + // Do nothing - we're intentionally feeding it a bad norm option just so it'll go through the list + // and realize it can't find it, populating a list along the way. + } + + // Now the list that getNormOptions() depends on should be populated + (*p)->normOptions = getNormOptions(); + + return NULL; +} + extern "C" void cStrawClose(Straw **hicFile) /* Free up a straw object created with cStrawOpen() */ { if (*hicFile != NULL) { delete (*hicFile)->genome; (*hicFile)->chromNames.clear(); (*hicFile)->chromSizes.clear(); (*hicFile)->bpResolutions.clear(); (*hicFile)->fragResolutions.clear(); (*hicFile)->attributes.clear(); + (*hicFile)->normOptions.clear(); } free(*hicFile); *hicFile = NULL; } extern "C" char *cStraw (Straw *hicFile, char *norm, int binsize, char *chr1loc, char *chr2loc, char *unit, int **xActual, int **yActual, double **counts, int *numRecords) /* Wrapper function to retrieve a data chunk from a .hic file, for use by C libraries. - * norm is one of NONE/VC/VC_SQRT/KR. + * norm is probably one of NONE/VC/VC_SQRT/KR/SCALE, but it depends on the file. * binsize is one of the supported bin sizes as determined by cStrawHeader. * chr1loc and chr2loc are the two positions to retrieve interaction data for, specified as chr:start-end. * unit is one of BP/FRAG. * Values are returned in newly allocated arrays in xActual, yActual, and counts, with the number of * returned records in numRecords. * The function returns NULL unless an error was encountered, in which case the return value points * to a character string explaining the error. */ { string thisnorm(norm); string thischr1loc(chr1loc); string thischr2loc(chr2loc); string thisunit(unit); vector<contactRecord> strawRecords; try { strawRecords = straw("observed", thisnorm, *(hicFile->fileName), @@ -102,34 +136,38 @@ } *numRecords = strawRecords.size(); *xActual = (int*) calloc((size_t) *numRecords, sizeof(int)); *yActual = (int*) calloc((size_t) *numRecords, sizeof(int)); *counts = (double*) calloc((size_t) *numRecords, sizeof(double)); for (int i=0; i<*numRecords; i++) { (*xActual)[i] = strawRecords[i].binX; (*yActual)[i] = strawRecords[i].binY; (*counts)[i] = (double) strawRecords[i].counts; } return NULL; } -extern "C" char *cStrawHeader (Straw *hicFile, char **genome, char ***chromNames, int **chromSizes, int *nChroms, char ***bpResolutions, int *nBpRes, char ***fragResolutions, int *nFragRes, char ***attributes, int *nAttributes) +extern "C" char *cStrawHeader (Straw *hicFile, char **genome, char ***chromNames, int **chromSizes, int *nChroms, + char ***bpResolutions, int *nBpRes, char ***fragResolutions, int *nFragRes, char ***attributes, + int *nAttributes, char ***normOptions, int* nNormOptions) /* Wrapper function to retrieve header fields from a .hic file, for use by C libraries. * This retrieves the assembly name, list of chromosome names, list of available binsize resolutions, - * and list of available fragment resolutions in the specific .hic file. + * the list of available fragment resolutions in the specific .hic file, and a list of available + * normalization options. Technically the normalization options are supposed to be resolution-specific, + * but we're not ready for a redesign in that direction just yet. * The function returns NULL unless an error was encountered, in which case the return value points * to a character string explaining the error. */ { if (genome != NULL) { *genome = (char*) malloc((hicFile->genome->length()+1)*sizeof(char)); strcpy(*genome, hicFile->genome->c_str()); } if (nChroms != NULL) { *nChroms = hicFile->nChroms; } if (chromNames != NULL) { *chromNames = (char**) calloc((size_t) hicFile->chromNames.size(), sizeof(char*)); @@ -148,45 +186,62 @@ } } if (nBpRes != NULL) { *nBpRes = hicFile->nBpRes; } if (bpResolutions != NULL) { *bpResolutions = (char**) calloc((size_t) hicFile->bpResolutions.size(), sizeof(char*)); for (int i=0; i<hicFile->bpResolutions.size(); i++) { (*bpResolutions)[i] = (char*) malloc((to_string(hicFile->bpResolutions[i]).length()+1)*sizeof(char)); strcpy((*bpResolutions)[i], to_string(hicFile->bpResolutions[i]).c_str()); } } - // skipping fragment stuff for now + // skipping fragment stuff for now; we don't use it if (nFragRes != NULL && false) { *nFragRes = hicFile->nFragRes; } - // skipping fragment stuff for now + // skipping fragment stuff for now; we don't use it if (fragResolutions != NULL && false) { *fragResolutions = (char**) calloc((size_t) hicFile->fragResolutions.size(), sizeof(char*)); for (int i=0; i<hicFile->fragResolutions.size(); i++) { (*fragResolutions)[i] = (char*) malloc((to_string(hicFile->fragResolutions[i]).length()+1)*sizeof(char)); strcpy((*fragResolutions)[i], to_string(hicFile->fragResolutions[i]).c_str()); } } if (nAttributes != NULL) { *nAttributes = hicFile->nAttributes; } if (attributes != NULL) { *attributes = (char**) calloc((size_t) hicFile->attributes.size(), sizeof(char*)); for (int i=0; i<hicFile->attributes.size(); i++) { (*attributes)[i] = (char*) malloc((hicFile->attributes[i].length()+1)*sizeof(char)); strcpy((*attributes)[i], hicFile->attributes[i].c_str()); } } + if (nNormOptions != NULL) + { + // Include one extra for "NONE", which doesn't appear in normOptions + *nNormOptions = hicFile->normOptions.size()+1; + } + if (normOptions != NULL) + { + *normOptions = (char**) calloc((size_t) hicFile->normOptions.size()+1, sizeof(char*)); + (*normOptions)[0] = (char*) "NONE"; + int i = 1; + std::set<string>::iterator it = hicFile->normOptions.begin(); + for (; it != hicFile->normOptions.end(); it++, i++) + { + (*normOptions)[i] = (char*) malloc(((*it).length()+1)*sizeof(char)); + strcpy((*normOptions)[i], (*it).c_str()); + } + } return NULL; }