d12a52edb9b3841adcfe42fc25b361e59654ef11 jcasper Fri Feb 28 15:59:19 2020 -0800 Better accounting for chromosome aliases in .hic files, refs #25055 diff --git src/hg/lib/hic.c src/hg/lib/hic.c index 6ee075b..b7ea44f 100644 --- src/hg/lib/hic.c +++ src/hg/lib/hic.c @@ -1,156 +1,197 @@ /* hic.c contains a few helpful wrapper functions for managing Hi-C data. */ #include "common.h" #include "linefile.h" #include "dystring.h" #include "jksql.h" #include "hic.h" #include "hdb.h" #include "trackHub.h" #include "Cstraw.h" #include "hash.h" #include "chromAlias.h" #include "interact.h" +void mangleName(char *ucscName, char mangledUcscName[], int size) +/* Generate a version of an assembly's chromosome name that matches + * the mangling performed by the Juicer .hic creation tool (strip any initial + * "chr" and capitalize the rest). */ +{ + int offset = 0; + char workingName[size]; + safef(workingName, sizeof(workingName), "%s", ucscName); + touppers(workingName); + if (startsWith("CHR", workingName)) + offset = 3; + strncpy(mangledUcscName, workingName+offset, size-offset); +} + + char *hicLoadHeader(char *filename, struct hicMeta **header, char *ucscAssembly) /* Create a hicMeta structure for the supplied Hi-C file. If * the return value is non-NULL, it points to a string containing * an error message that explains why the retrieval failed. */ { char *genome; char **chromosomes, **bpResolutions, **attributes; int *chromSizes, nChroms, nBpRes, nAttributes; char *errMsg = CstrawHeader(filename, &genome, &chromosomes, &chromSizes, &nChroms, &bpResolutions, &nBpRes, NULL, NULL, &attributes, &nAttributes); if (errMsg != NULL) return errMsg; struct hicMeta *newMeta = NULL; AllocVar(newMeta); newMeta->fileAssembly = genome; newMeta->nRes = nBpRes; newMeta->resolutions = bpResolutions; newMeta->nChroms = nChroms; newMeta->chromNames = chromosomes; newMeta->chromSizes = chromSizes; newMeta->ucscToAlias = NULL; newMeta->ucscAssembly = cloneString(ucscAssembly); newMeta->filename = cloneString(filename); newMeta->attributes = attributes; newMeta->nAttributes = nAttributes; *header = newMeta; -if (trackHubDatabase(genome)) - return NULL; -// add alias hash in case file uses 1 vs chr1, etc. -if (newMeta->ucscAssembly != NULL) +struct slName *ucscNameList = hAllChromNames(newMeta->ucscAssembly), *ucscName = NULL; +struct hash *ucscToChromAlias = NULL; +if ((newMeta->ucscAssembly != NULL) && !trackHubDatabase(ucscAssembly)) + ucscToChromAlias = chromAliasMakeReverseLookupTable(newMeta->ucscAssembly); + +struct slName *hicChromNames = slNameListFromStringArray(chromosomes, nChroms); +struct hash *hicChromHash = hashSetFromSlNameList(hicChromNames); +struct hash *ucscToHicName = newHash(0); + +// For each UCSC chrom name, try to find a .hic file chromosome to fetch annotation from. +for (ucscName = ucscNameList; ucscName != NULL; ucscName = ucscName->next) + { + char mangledName[2048]; + mangleName(ucscName->name, mangledName, sizeof(mangledName)); + if (hashLookup(hicChromHash, ucscName->name)) + hashAdd(ucscToHicName, ucscName->name, cloneString(ucscName->name)); + else if (hashLookup(hicChromHash, mangledName)) + hashAdd(ucscToHicName, ucscName->name, cloneString(mangledName)); + else if (ucscToChromAlias != NULL) { - struct hash *aliasToUcsc = chromAliasMakeLookupTable(newMeta->ucscAssembly); - if (aliasToUcsc != NULL) + // No hits on the primary chromosome name; time to start going through aliases. + struct hashEl *thisEl = hashLookup(ucscToChromAlias, ucscName->name); + while (thisEl != NULL) { - struct hash *ucscToAlias = newHash(0); - int i; - for (i=0; ival; + if (hashLookup(hicChromHash, cA->alias)) { - struct chromAlias *cA = hashFindVal(aliasToUcsc, chromosomes[i]); - if (cA != NULL) + hashAdd(ucscToHicName, ucscName->name, cloneString(cA->alias)); + break; + } + mangleName(cA->alias, mangledName, sizeof(mangledName)); + if (hashLookup(hicChromHash, mangledName)) { - hashAdd(ucscToAlias, cA->chrom, cloneString(chromosomes[i])); + hashAdd(ucscToHicName, ucscName->name, cloneString(mangledName)); + break; } + thisEl = hashLookupNext(thisEl); } - newMeta->ucscToAlias = ucscToAlias; - hashFree(&aliasToUcsc); } } +newMeta->ucscToAlias = ucscToHicName; + +if (ucscToChromAlias != NULL) + hashFreeWithVals(&ucscToChromAlias, chromAliasFree); +hashFree(&hicChromHash); +slNameFreeList(&hicChromNames); +slNameFreeList(&ucscNameList); + return NULL; } struct interact *interactFromHic(char *chrom1, int start1, char *chrom2, int start2, int size, double value) /* Given some data values from an interaction in a hic file, build a corresponding interact structure. */ { struct interact *new = NULL; AllocVar(new); new->chrom = cloneString(chrom1); // start1 is always before start2 on same-chromosome records, so start1 is always the start. // On records that link between chromosomes, just use the coordinates for this chromosome. new->chromStart = start1; if (sameWord(chrom1, chrom2)) new->chromEnd = start2+size; else new->chromEnd = start1+size; new->name = cloneString(""); new->score = 0; // ignored new->value = value; new->exp = cloneString("."); new->color = 0; new->sourceChrom = cloneString(chrom1); new->sourceStart = start1; new->sourceEnd = start1+size; new->sourceName = cloneString(""); new->sourceStrand = cloneString("."); new->targetChrom = cloneString(chrom2); new->targetStart = start2; new->targetEnd = start2+size; new->targetName = cloneString(""); new->targetStrand = cloneString("."); return new; } char *hicLoadData(struct hicMeta *fileInfo, int resolution, char *normalization, char *chrom1, int start1, int end1, char *chrom2, int start2, int end2, struct interact **resultPtr) /* Fetch heatmap data from a hic file. The hic file info must be provided in fileInfo, which should be * populated by hicLoadHeader. The result is a linked list of interact structures in *resultPtr, * and the return value (if non-NULL) is the text of any error message encountered by the underlying * Straw library. */ { int *x, *y, numRecords=0; double *counts; if (!fileInfo) errAbort("Attempting to load hic data from a NULL hicMeta pointer"); struct dyString *leftWindowPos = dyStringNew(0); struct dyString *rightWindowPos = dyStringNew(0); char *leftChromName = chrom1; char *rightChromName = chrom2; if (fileInfo->ucscToAlias != NULL) { leftChromName = (char*) hashFindVal(fileInfo->ucscToAlias, leftChromName); if (leftChromName == NULL) leftChromName = chrom1; rightChromName = (char*) hashFindVal(fileInfo->ucscToAlias, rightChromName); if (rightChromName == NULL) rightChromName = chrom2; } dyStringPrintf(leftWindowPos, "%s:%d:%d", leftChromName, start1, end1); dyStringPrintf(rightWindowPos, "%s:%d:%d", rightChromName, start2, end2); char *networkErrMsg = Cstraw(normalization, fileInfo->filename, resolution, dyStringContents(leftWindowPos), dyStringContents(rightWindowPos), "BP", &x, &y, &counts, &numRecords); int i=0; for (i=0; i