cc610239716fe32f9c774d98a71f75e8c6b5fba3 braney Tue Apr 21 17:23:50 2026 -0700 mafToBigMafSummary: new utility that emits bed3+4 input ready for bedToBigBed -as=mafSummary.as, replacing the hgLoadMafSummary -test / cut -f2- / sort hack documented in bigMaf.html. Companion to mafToBigMaf. The summary scoring/merging logic is intentionally duplicated from hgLoadMafSummary.c (see header comment) — that code is stable and refactoring would force retesting all the makedocs that call hgLoadMafSummary. Also fixes a bug in the duplicated copy of mafSplitSrcGetChrom: the original errAborts on plain 'hg38.chrY' style master src because of an inverted differentString check; rewritten with cleaner correct logic. Updates bigMaf.html and trackDb/trackDbLibrary.shtml to reference the new tool. refs #37404 diff --git src/hg/utils/mafToBigMafSummary/mafToBigMafSummary.c src/hg/utils/mafToBigMafSummary/mafToBigMafSummary.c new file mode 100644 index 00000000000..ddc22a86a69 --- /dev/null +++ src/hg/utils/mafToBigMafSummary/mafToBigMafSummary.c @@ -0,0 +1,273 @@ +/* mafToBigMafSummary - Convert a maf into bed3+4 input ready for bedToBigBed + * to make a bigMaf summary (.bb) file. Companion to mafToBigMaf. */ + +/* The summary scoring/merging logic in this file is deliberately duplicated + * from hgLoadMafSummary.c rather than refactored into a shared library. + * That code is stable, hgLoadMafSummary is widely referenced from existing + * makedocs, and a refactor would force retesting all of those pipelines for + * minimal payoff. If you change one, change the other. */ + +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" +#include "maf.h" +#include "mafSummary.h" + +static struct optionSpec optionSpecs[] = { + {"mergeGap", OPTION_INT}, + {"minSize", OPTION_INT}, + {"maxSize", OPTION_INT}, + {"minSeqSize", OPTION_INT}, + {NULL, 0} +}; + +int mergeGap = 500; +int minSize = 10000; +int maxSize = 50000; +int minSeqSize = 1; +char *referenceDb = NULL; + +void usage() +/* Explain usage and exit. */ +{ +errAbort( +"mafToBigMafSummary - Convert a maf into the bed3+4 input for a bigMaf summary file\n" + "usage:\n" + " mafToBigMafSummary referenceDb input.maf out.bed\n" + "Pipe the output through 'sort -k1,1 -k2,2n', then run bedToBigBed:\n" + " bedToBigBed -type=bed3+4 -as=mafSummary.as -tab out.sorted.bed \\\n" + " referenceDb.chrom.sizes bigMafSummary.bb\n" + "options:\n" + " -mergeGap=N max size of gap to merge regions (default %d)\n" + " -minSize=N merge blocks smaller than N (default %d)\n" + " -maxSize=N break up blocks larger than N (default %d)\n" + " -minSeqSize=N skip alignments when reference sequence is less than N\n" + " (default %d)\n", +mergeGap, minSize, maxSize, minSeqSize + ); +} + +double scorePairwise(struct mafAli *maf) +/* generate score from 0.0 to 1.0 for an alignment pair */ +/* Adapted from multiz scoring in hgTracks/mafTrack.c */ +{ +int endB; /* end in the reference (master) genome */ +int deltaB; +int endT; /* end in the master genome maf text (includes gaps) */ +double score; +double minScore = -100.0, maxScore = 100.0; +double scoreScale = 1.0 / (maxScore - minScore); +struct mafComp *mcMaster = maf->components; + +endB = mcMaster->size; +deltaB = endB; +for (endT = 0; endT < maf->textSize; endT++) + { + if (deltaB <= 0) + break; + if (mcMaster->text[endT] != '-') + deltaB -= 1; + } +score = mafScoreRangeMultiz(maf, 0, endT)/endB; + +score = (score - minScore) * scoreScale; +if (score < 0.0) score = 0.0; +if (score > 1.0) score = 1.0; +return score; +} + +void outputSummary(FILE *f, struct mafSummary *ms) +/* Output a single summary block as bed3+4 (no SQL bin column). */ +{ +mafSummaryTabOut(ms, f); +} + +double mergeScores(struct mafSummary *ms1, struct mafSummary *ms2) +/* Weighted-average score for two adjacent summary blocks (ms1 first). */ +{ +double total = ms1->score * (ms1->chromEnd - ms1->chromStart) + + ms2->score * (ms2->chromEnd - ms2->chromStart); +return total / (ms2->chromEnd - ms1->chromStart); +} + +struct mafComp *mafMaster(struct mafAli *maf, struct mafFile *mf, char *fileName) +/* Get master component from maf. errAbort if not found. */ +{ +struct mafComp *mcMaster = mafMayFindCompPrefix(maf, referenceDb, "."); +if (mcMaster == NULL) + errAbort("Couldn't find %s. sequence line %d of %s\n", + referenceDb, mf->lf->lineIx, fileName); +return mcMaster; +} + +char *mafSplitSrcGetChrom(char *src, char *database) +/* src can be in format chrom, db|chrom or db.chrom: split string on separator and return pointer to chrom. + * If database is non-NULL and src starts with "database.", strip exactly that prefix + * (so a database name like GCF_1234.3 with its own dot is handled correctly). + * Side effect: src is truncated at the separator (so it becomes just the db). */ +{ +char *pipe = strchr(src, '|'); +if (pipe) + { + *pipe = '\0'; + return pipe + 1; + } + +if (database != NULL) + { + int dbLen = strlen(database); + if (strncmp(src, database, dbLen) == 0 && src[dbLen] == '.') + { + src[dbLen] = '\0'; + return src + dbLen + 1; + } + } + +char *dot = strchr(src, '.'); +if (!dot) + return src; +*dot = '\0'; +return dot + 1; +} + +long processMaf(struct mafAli *maf, struct hash *componentHash, + FILE *f, struct mafFile *mf, char *fileName) +/* Compute scores for each pairwise component in the maf and emit summary blocks. */ +{ +struct mafComp *mc = NULL, *nextMc = NULL; +struct mafSummary *ms, *msPending; +struct mafAli pairMaf; +long componentCount = 0; +struct mafComp *mcMaster = mafMaster(maf, mf, fileName); +struct mafComp *oldMasterNext = mcMaster->next; +char *chrom; +char src[256]; + +strcpy(src, mcMaster->src); +chrom = mafSplitSrcGetChrom(src, referenceDb); + +for (mc = maf->components; mc != NULL; mc = nextMc) + { + nextMc = mc->next; + if (sameString(mcMaster->src, mc->src) || mc->size == 0) + continue; + + AllocVar(ms); + ms->chrom = cloneString(chrom); + ms->chromStart = mcMaster->start; + ms->chromEnd = mcMaster->start + mcMaster->size; + ms->src = cloneString(mc->src); + mafSplitSrcGetChrom(ms->src, referenceDb); + + ZeroVar(&pairMaf); + pairMaf.textSize = maf->textSize; + pairMaf.components = mcMaster; + mcMaster->next = mc; + mc->next = NULL; + ms->score = scorePairwise(&pairMaf); + ms->leftStatus[0] = mc->leftStatus; + ms->rightStatus[0] = mc->rightStatus; + + mcMaster->next = oldMasterNext; + mc->next = nextMc; + + if ((msPending = (struct mafSummary *) hashFindVal(componentHash, ms->src)) != NULL) + { + if (sameString(ms->chrom, msPending->chrom) && + (ms->chromStart+1 - msPending->chromEnd < mergeGap)) + { + ms->score = mergeScores(msPending, ms); + ms->chromStart = msPending->chromStart; + ms->leftStatus[0] = msPending->leftStatus[0]; + ms->rightStatus[0] = ms->rightStatus[0]; + } + else + outputSummary(f, msPending); + hashRemove(componentHash, msPending->src); + mafSummaryFree(&msPending); + } + if (ms->chromEnd - ms->chromStart > minSize) + { + outputSummary(f, ms); + mafSummaryFree(&ms); + } + else + hashAdd(componentHash, ms->src, ms); + componentCount++; + } +return componentCount; +} + +void flushSummaryBlocks(struct hash *componentHash, FILE *f) +/* flush any pending summary blocks */ +{ +struct mafSummary *ms; +struct hashCookie hc = hashFirst(componentHash); + +while ((ms = (struct mafSummary *)hashNextVal(&hc)) != NULL) + outputSummary(f, ms); +} + +void mafToBigMafSummary(char *db, char *inMaf, char *outBed) +/* mafToBigMafSummary - emit summary blocks for the maf as bed3+4. */ +{ +long mafCount = 0, allMafCount = 0; +struct mafComp *mcMaster = NULL; +struct mafAli *maf; +struct mafFile *mf = mafOpen(inMaf); +FILE *f = mustOpen(outBed, "w"); +long componentCount = 0; +struct hash *componentHash = newHash(0); + +verbose(1, "Indexing and tabulating %s\n", inMaf); + +while ((maf = mafNext(mf)) != NULL) + { + mcMaster = mafMaster(maf, mf, inMaf); + allMafCount++; + if (mcMaster->srcSize < minSeqSize) + continue; + while (mcMaster->size > maxSize) + { + int end = mcMaster->start + maxSize; + struct mafAli *subMaf = + mafSubset(maf, mcMaster->src, mcMaster->start, end); + verbose(3, "Splitting maf %s:%d len %d\n", mcMaster->src, + mcMaster->start, mcMaster->size); + componentCount += + processMaf(subMaf, componentHash, f, mf, inMaf); + mafAliFree(&subMaf); + subMaf = mafSubset(maf, mcMaster->src, + end, end + (mcMaster->size - maxSize)); + mafAliFree(&maf); + maf = subMaf; + mcMaster = mafMaster(maf, mf, inMaf); + } + if (mcMaster->size != 0) + componentCount += + processMaf(maf, componentHash, f, mf, inMaf); + mafAliFree(&maf); + mafCount++; + } +mafFileFree(&mf); +flushSummaryBlocks(componentHash, f); +carefulClose(&f); +verbose(1, "%ld components, %ld mafs from %s\n", + componentCount, allMafCount, inMaf); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, optionSpecs); +mergeGap = optionInt("mergeGap", mergeGap); +minSize = optionInt("minSize", minSize); +maxSize = optionInt("maxSize", maxSize); +minSeqSize = optionInt("minSeqSize", minSeqSize); +if (argc != 4) + usage(); +referenceDb = argv[1]; +mafToBigMafSummary(referenceDb, argv[2], argv[3]); +return 0; +}