60aca91bcce6d4fa555e6c7c91d8ff8aa9e7bd2b jcasper Fri Jun 11 15:17:21 2021 -0700 Updating hic support for files with large headers (over 100kb) and improving multi-region performance, refs #18842, #27593 diff --git src/hg/lib/hic.c src/hg/lib/hic.c index bfbc6d5..d1c03c1 100644 --- src/hg/lib/hic.c +++ src/hg/lib/hic.c @@ -1,61 +1,63 @@ /* hic.c contains a few helpful wrapper functions for managing Hi-C data. */ #include "common.h" #include "linefile.h" #include "dystring.h" #include "jksql.h" #include "hic.h" #include "hdb.h" #include "trackHub.h" -#include "Cstraw.h" +#include "cStraw.h" #include "hash.h" #include "chromAlias.h" #include "interact.h" #ifdef USE_HIC void mangleName(char *ucscName, char mangledUcscName[], int size) /* Generate a version of an assembly's chromosome name that matches * the mangling performed by the Juicer .hic creation tool (strip any initial * "chr" and capitalize the rest). */ { int offset = 0; char workingName[size]; safef(workingName, sizeof(workingName), "%s", ucscName); touppers(workingName); if (startsWith("CHR", workingName)) offset = 3; safencpy(mangledUcscName, size, workingName+offset, strlen(workingName+offset)); } char *hicLoadHeader(char *filename, struct hicMeta **header, char *ucscAssembly) /* Create a hicMeta structure for the supplied Hi-C file. If * the return value is non-NULL, it points to a string containing * an error message that explains why the retrieval failed. */ { char *genome; char **chromosomes, **bpResolutions, **attributes; int *chromSizes, nChroms, nBpRes, nAttributes; -char *errMsg = CstrawHeader(filename, &genome, &chromosomes, &chromSizes, &nChroms, &bpResolutions, &nBpRes, NULL, NULL, &attributes, &nAttributes); +Straw *newStraw = cStrawOpen(filename); +char *errMsg = cStrawHeader(newStraw, &genome, &chromosomes, &chromSizes, &nChroms, &bpResolutions, &nBpRes, NULL, NULL, &attributes, &nAttributes); if (errMsg != NULL) return errMsg; struct hicMeta *newMeta = NULL; AllocVar(newMeta); +newMeta->strawObj = newStraw; newMeta->fileAssembly = genome; newMeta->nRes = nBpRes; newMeta->resolutions = bpResolutions; newMeta->nChroms = nChroms; newMeta->chromNames = chromosomes; newMeta->chromSizes = chromSizes; newMeta->ucscToAlias = NULL; newMeta->ucscAssembly = cloneString(ucscAssembly); newMeta->filename = cloneString(filename); newMeta->attributes = attributes; newMeta->nAttributes = nAttributes; *header = newMeta; struct slName *ucscNameList = NULL, *ucscName = NULL; @@ -163,31 +165,31 @@ char *leftChromName = chrom1; char *rightChromName = chrom2; if (fileInfo->ucscToAlias != NULL) { leftChromName = (char*) hashFindVal(fileInfo->ucscToAlias, leftChromName); if (leftChromName == NULL) leftChromName = chrom1; rightChromName = (char*) hashFindVal(fileInfo->ucscToAlias, rightChromName); if (rightChromName == NULL) rightChromName = chrom2; } dyStringPrintf(leftWindowPos, "%s:%d:%d", leftChromName, start1, end1); dyStringPrintf(rightWindowPos, "%s:%d:%d", rightChromName, start2, end2); -char *networkErrMsg = Cstraw(normalization, fileInfo->filename, resolution, dyStringContents(leftWindowPos), +char *networkErrMsg = cStraw(fileInfo->strawObj, normalization, resolution, dyStringContents(leftWindowPos), dyStringContents(rightWindowPos), "BP", &x, &y, &counts, &numRecords); int i=0; for (i=0; i<numRecords; i++) { if (isnan(counts[i])) { // Yes, apparently NAN is possible with normalized values in some methods. Ignore those. continue; } struct interact *new = interactFromHic(chrom1, x[i], chrom2, y[i], resolution, counts[i]); slAddHead(resultPtr, new); if (differentWord(chrom1, chrom2))