8f29deda5ff897fbf1d208907ac7f734afc07ed2 jcasper Tue Oct 1 20:31:17 2024 -0700 Many liftOver functions now have a preserveInput argument, which appends the input position to the name field. This makes it easier to see what got mapped where. The option is made available through a command-line argument to the liftOver utility and a checkbox in the hgLiftOver CGI, refs #28023 diff --git src/hg/lib/liftOver.c src/hg/lib/liftOver.c index 429199c..11e1626 100644 --- src/hg/lib/liftOver.c +++ src/hg/lib/liftOver.c @@ -26,30 +26,52 @@ char *name; /* Chromosome name. */ struct binKeeper *bk; /* Keyed by old position, values are chains. */ }; static char otherStrand(char c) /* Swap +/- */ { if (c == '-') return '+'; else if (c == '+') return '-'; else return c; } + +char *extendNameWithPosition(char *name, char *chrom, int s, int e, bool prepend) +/* Extend an item's name with a position, intended for allowing liftOver to preserve a + * sign of where a lifted item was mapped from. If prepend is set, the position is placed + * before the name. Otherwise after. Input coordinates are expected to be BED (0-based), + * and the written position is 1-based (chr:s-e). + * The old name can then be freed (if appropriate - some loader routines just split a + * string with \0s and use the pieces in place). + */ +{ +char pos[2048], *new = NULL; +safef(pos, sizeof(pos), "%s:%d-%d", chrom, s+1, e); +if (name == NULL) + new = cloneString(pos); +else if (prepend) + new = catThreeStrings(pos, ":", name); +else + new = catThreeStrings(name, ":", pos); +return new; +} + + // The maximum number of words per line that can be lifted: #define LIFTOVER_MAX_WORDS 2048 void liftOverAddChainHash(struct hash *chainHash, struct chain *chain) /* Add this chain to the hash of chains used by remapBlockedBed */ { struct chromMap *map; if ((map = hashFindVal(chainHash, chain->tName)) == NULL) { AllocVar(map); map->bk = binKeeperNew(0, chain->tSize); hashAddSaveName(chainHash, chain->tName, map, &map->name); } binKeeperAdd(map->bk, chain->tStart, chain->tEnd, chain); @@ -396,31 +418,31 @@ int i; wordCount--; for (i = 1; i <= wordCount; i++) words[i-1] = words[i]; } if (wordCount <= 0) return 0; return wordCount; } static int bedOverSmallEnds(struct lineFile *lf, int refCount, struct hash *chainHash, double minMatch, int minSizeT, int minSizeQ, int minChainT, int minChainQ, FILE *mapped, FILE *unmapped, bool multiple, bool noSerial, char *chainTable, int bedPlus, bool hasBin, - bool tabSep, int ends, int *errCt) + bool tabSep, int ends, int *errCt, bool preserveInput) /* Do a bed without a block-list. * NOTE: it would be preferable to have all of the lift * functions work at the line level, rather than the file level. * Multiple option can be used with bed3 -- it will write a list of * regions as a bed4, where score is the "part #". This is used for * ENCODE region mapping */ { int i, wordCount, s, e; char *words[LIFTOVER_MAX_WORDS+1]; // +1 to detect overflow char *chrom; char strand = '.', strandString[2]; char *error, *error2 = NULL; int ct = 0; int errs = 0; struct bed *bedList = NULL, *unmappedBedList = NULL; @@ -453,30 +475,33 @@ "ERROR: Has %s%d fields, should have %d fields on line %d of bed file %s\n", (wordCount > LIFTOVER_MAX_WORDS) ? "at least ":"", wordCount, refCount, lf->lineIx, lf->fileName); FILE *f = mapped; chrom = words[0]; s = lineFileNeedFullNum(lf, words, 1); e = lineFileNeedFullNum(lf, words, 2); bool useThick = FALSE; int thickStart = 0, thickEnd = 0; int afterS = s + ends; int beforeE = e - ends; bool doEnds = ((ends > 0) && (beforeE > afterS)); if (s > e) errAbort( "ERROR: start coordinate is after end coordinate (chromStart > chromEnd) on line %d of bed file %s\nERROR: %s %d %d", lf->lineIx, lf->fileName, chrom, s, e); + if (wordCount > 3 && preserveInput) + // Extend item names, if present, with chrom:(start+1)-end + words[3] = extendNameWithPosition(words[3], chrom, s, e, TRUE); if (multiple) { if (wordCount > 3) region = words[3]; else { safef(regionBuf, sizeof(regionBuf), "%s:%d-%d", words[0], s+1, e); region = regionBuf; } } if (wordCount >= 6 && (bedPlus == 0 || bedPlus >= 6)) strand = words[5][0]; if (wordCount >= 8 && (bedPlus == 0 || bedPlus >= 8)) { useThick = TRUE; @@ -633,32 +658,33 @@ } static int gffNeedNum(struct lineFile *lf, char *s) /* Convert s to an integer or die trying. */ { char c = *s; if (isdigit(c) || c == '-') return atoi(s); else errAbort("Expecting number line %d of %s", lf->lineIx, lf->fileName); return 0; } void liftOverGff(char *fileName, struct hash *chainHash, double minMatch, double minBlocks, - FILE *mapped, FILE *unmapped) -/* Lift over GFF file */ + FILE *mapped, FILE *unmapped, bool preserveInput) +/* Lift over GFF file, with an option to preserve the input position by + * appending it to the source */ { char *error = NULL; struct lineFile *lf = lineFileOpen(fileName, TRUE); char c, *s, *line, *word; char *seq, *source, *feature; int start, end; char *score, *strand; FILE *f; while (lineFileNext(lf, &line, NULL)) { /* Pass through blank lines and those that start with a sharp. */ s = skipLeadingSpaces(line); c = *s; if (c == '#' || c == 0) @@ -672,30 +698,33 @@ shortGffLine(lf); if ((feature = nextWord(&s)) == NULL) shortGffLine(lf); if ((word = nextWord(&s)) == NULL) shortGffLine(lf); start = gffNeedNum(lf, word) - 1; if ((word = nextWord(&s)) == NULL) shortGffLine(lf); end = gffNeedNum(lf, word); if ((score = nextWord(&s)) == NULL) shortGffLine(lf); if ((strand = nextWord(&s)) == NULL) shortGffLine(lf); s = skipLeadingSpaces(s); + if (preserveInput) + source = extendNameWithPosition(source, seq, start, end, FALSE); + /* Convert seq/start/end/strand. */ error = liftOverRemapRange(chainHash, minMatch, seq, start, end, *strand, minMatch, &seq, &start, &end, strand); f = mapped; if (error != NULL) { f = unmapped; fprintf(f, "# %s\n", error); } fprintf(f, "%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n", seq, source, feature, start+1, end, score, strand, s); } } struct liftRange @@ -1169,61 +1198,69 @@ freez(&bed->expScores); freez(&bed->label); memcpy(bed, splicedBed, sizeof(struct bed)); freez(&splicedBed); error = NULL; } } bedFree(&bedCopy); slFreeList(&binList); return error; } static int bedOverBig(struct lineFile *lf, int refCount, struct hash *chainHash, double minMatch, double minBlocks, bool fudgeThick, FILE *mapped, FILE *unmapped, bool multiple, char *chainTable, - int bedPlus, bool hasBin, bool tabSep, int *errCt) + int bedPlus, bool hasBin, bool tabSep, int *errCt, bool preserveInput) /* Do a bed with block-list. */ { int wordCount, bedCount; char *line, *words[LIFTOVER_MAX_WORDS+1]; // plus one so it can detect overflow past the end. char *whyNot = NULL; int ct = 0; int errs = 0; int i; char *db = NULL, *chainTableName = NULL; if (chainTable) { chainTableName = chopPrefix(chainTable); db = chainTable; chopSuffix(chainTable); } while (lineFileNextReal(lf, &line)) { struct bed *bed; wordCount = chopLineBin(line, words, ArraySize(words), hasBin, tabSep); if (wordCount < 3) errAbort( "ERROR: At least 3 fields are required, chrom, start, end on line %d of bed file %s\n", lf->lineIx, lf->fileName); if (refCount != wordCount) lineFileExpectWords(lf, refCount, wordCount); if (wordCount == bedPlus) bedPlus = 0; /* no extra fields */ bedCount = (bedPlus ? bedPlus : wordCount); bed = bedLoadN(words, bedCount); + + if (wordCount > 3 && preserveInput) + { + char *old = bed->name; + bed->name = extendNameWithPosition(old, bed->chrom, bed->chromStart, bed->chromEnd, TRUE); + free(old); // bedLoadN uses cloneString to copy the name + } + whyNot = remapBlockedBed(chainHash, bed, minMatch, minBlocks, fudgeThick, multiple, db, chainTableName); if (whyNot == NULL) { struct bed *bedList = bed; for (; bed != NULL; bed = bed->next) { if (hasBin) fprintf(mapped, "%d\t", binFromRange(bed->chromStart, bed->chromEnd)); bedOutputN(bed, bedCount, mapped, '\t', (bedCount != wordCount) ? '\t':'\n'); /* print extra "non-bed" fields in line */ for (i = bedCount; i < wordCount; i++) fprintf(mapped, "%s%c", words[i], (i == wordCount-1) ? '\n':'\t'); @@ -1243,31 +1280,31 @@ errs++; } bedFree(&bed); } if (errCt) *errCt = errs; return ct; } int liftOverBedPlusEnds(char *fileName, struct hash *chainHash, double minMatch, double minBlocks, int minSizeT, int minSizeQ, int minChainT, int minChainQ, bool fudgeThick, FILE *f, FILE *unmapped, bool multiple, bool noSerial, char *chainTable, int bedPlus, bool hasBin, - bool tabSep, int ends, int *errCt) + bool tabSep, int ends, int *errCt, bool preserveInput) /* Lift bed N+ file. * Return the number of records successfully converted */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); int wordCount; int bedFieldCount = bedPlus; char *line; int ct = 0; if (lineFileNextReal(lf, &line)) { line = cloneString(line); if (tabSep) { wordCount = chopByChar(line, '\t', NULL, LIFTOVER_MAX_WORDS); @@ -1278,70 +1315,72 @@ if (wordCount > LIFTOVER_MAX_WORDS) errAbort("Too many fields. Fieldcount %d > maximum fields %d in file %s", wordCount, LIFTOVER_MAX_WORDS, fileName); if (hasBin) wordCount--; lineFileReuse(lf); freez(&line); if (wordCount < 3) errAbort("Data format error: expecting at least 3 fields in BED file (%s)", fileName); if (bedFieldCount == 0) bedFieldCount = wordCount; if (bedFieldCount <= 10) { ct = bedOverSmallEnds(lf, wordCount, chainHash, minMatch, minSizeT, minSizeQ, minChainT, minChainQ, f, unmapped, - multiple, noSerial, chainTable, bedPlus, hasBin, tabSep, ends, errCt); + multiple, noSerial, chainTable, bedPlus, hasBin, tabSep, ends, errCt, + preserveInput); } else if (ends) errAbort("Cannot use -ends with blocked BED\n"); else ct = bedOverBig(lf, wordCount, chainHash, minMatch, minBlocks, fudgeThick, f, unmapped, multiple, chainTable, - bedPlus, hasBin, tabSep, errCt); + bedPlus, hasBin, tabSep, errCt, preserveInput); } lineFileClose(&lf); return ct; } int liftOverBedPlus(char *fileName, struct hash *chainHash, double minMatch, double minBlocks, int minSizeT, int minSizeQ, int minChainT, int minChainQ, bool fudgeThick, FILE *f, FILE *unmapped, bool multiple, bool noSerial, char *chainTable, int bedPlus, bool hasBin, - bool tabSep, int *errCt) + bool tabSep, int *errCt, bool preserveInput) /* Lift bed N+ file. * Return the number of records successfully converted */ { return liftOverBedPlusEnds(fileName, chainHash, minMatch, minBlocks, minSizeT, minSizeQ, minChainT, minChainQ, fudgeThick, f, unmapped, multiple, noSerial, chainTable, - bedPlus, hasBin, tabSep, 0, errCt); + bedPlus, hasBin, tabSep, 0, errCt, preserveInput); } int liftOverBed(char *fileName, struct hash *chainHash, double minMatch, double minBlocks, int minSizeT, int minSizeQ, int minChainT, int minChainQ, bool fudgeThick, FILE *f, FILE *unmapped, - bool multiple, bool noSerial, char *chainTable, int *errCt) + bool multiple, bool noSerial, char *chainTable, int *errCt, + bool preserveInput) /* Open up file, decide what type of bed it is, and lift it. * Return the number of records successfully converted */ { return liftOverBedPlus(fileName, chainHash, minMatch, minBlocks, minSizeT, minSizeQ, minChainT, minChainQ, fudgeThick, f, unmapped, multiple, noSerial, chainTable, - 0, FALSE, FALSE, errCt); + 0, FALSE, FALSE, errCt, preserveInput); } #define LIFTOVER_FILE_PREFIX "liftOver" #define BEDSTART_TO_POSITION(coord) (coord+1) int liftOverPositions(char *fileName, struct hash *chainHash, double minMatch, double minBlocks, int minSizeT, int minSizeQ, int minChainT, int minChainQ, bool fudgeThick, FILE *mapped, FILE *unmapped, bool multiple, char *chainTable, int *errCt) /* Create bed file from positions (chrom:start-end) and lift. * Then convert back to positions. (TODO: line-by-line instead of by file) * Return the number of records successfully converted */ { @@ -1373,31 +1412,31 @@ fprintf(bedFile, "%s\n", line); } freez(&line); } carefulClose(&bedFile); chmod(bedTn.forCgi, 0666); lineFileClose(&lf); /* Set up temp bed files for output, and lift to those */ makeTempName(&mappedBedTn, LIFTOVER_FILE_PREFIX, ".bedmapped"); makeTempName(&unmappedBedTn, LIFTOVER_FILE_PREFIX, ".bedunmapped"); mappedBed = mustOpen(mappedBedTn.forCgi, "w"); unmappedBed = mustOpen(unmappedBedTn.forCgi, "w"); ct = liftOverBed(bedTn.forCgi, chainHash, minMatch, minBlocks, minSizeT, minSizeQ, minChainT, minChainQ, fudgeThick, - mappedBed, unmappedBed, multiple, TRUE, chainTable, errCt); + mappedBed, unmappedBed, multiple, TRUE, chainTable, errCt, FALSE); carefulClose(&mappedBed); chmod(mappedBedTn.forCgi, 0666); carefulClose(&unmappedBed); chmod(unmappedBedTn.forCgi, 0666); lineFileClose(&lf); /* Convert output files back to positions */ lf = lineFileOpen(mappedBedTn.forCgi, TRUE); while ((wordCount = lineFileChop(lf, words)) != 0) { chrom = words[0]; start = lineFileNeedNum(lf, words, 1); end = lineFileNeedNum(lf, words, 2); fprintf(mapped, "%s:%d-%d\n", chrom, BEDSTART_TO_POSITION(start), end); } @@ -1470,31 +1509,31 @@ int minSizeT, int minSizeQ, int minChainT, int minChainQ, bool fudgeThick, FILE *mapped, FILE *unmapped, bool multiple, bool noSerial, char *chainTable, int *errCt) /* Sniff the first line of the file, and determine whether it's a */ /* bed, a positions file, or neither. */ { enum liftOverFileType lft = liftOverSniff(fileName); if (lft == positions) return liftOverPositions(fileName, chainHash, minMatch, minBlocks, minSizeT, minSizeQ, minChainT, minChainQ, fudgeThick, mapped, unmapped, multiple, chainTable, errCt); if (lft == bed) return liftOverBed(fileName, chainHash, minMatch, minBlocks, minSizeT, minSizeQ, minChainT, minChainQ, fudgeThick, mapped, unmapped, - multiple, noSerial, chainTable, errCt); + multiple, noSerial, chainTable, errCt, FALSE); return -1; } static char *remapBlockedPsl(struct hash *chainHash, struct psl *psl, double minMatch, double minBlocks, bool fudgeThick) /* Remap blocks in psl, and also chromStart/chromEnd. * Return NULL on success, an error string on failure. */ { struct chain *chainList = NULL, *chain; int pslSize = sumPslBlocks(psl); struct binElement *binList; struct binElement *el; struct liftRange *rangeList, *badRanges = NULL, *range; char *error = NULL; int i, start, end = 0; @@ -1621,42 +1660,48 @@ bed->blockCount = count = gp->exonCount; AllocArray(bed->blockSizes, count); AllocArray(bed->chromStarts, count); for (i=0; i<count; ++i) { int s = gp->exonStarts[i]; int e = gp->exonEnds[i]; bed->blockSizes[i] = e - s; bed->chromStarts[i] = s - start; } return bed; } void liftOverGenePred(char *fileName, struct hash *chainHash, double minMatch, double minBlocks, bool fudgeThick, - FILE *mapped, FILE *unmapped, boolean multiple) + FILE *mapped, FILE *unmapped, boolean multiple, bool preserveInput) /* Lift over file in genePred format. */ { char *db = NULL, *chainTable = NULL; struct bed *bed; struct genePred *gp = NULL; char *error; struct genePred *gpList = genePredExtLoadAll(fileName); for (gp = gpList ; gp != NULL ; gp = gp->next) { // uglyf("%s %s %d %d %s\n", gp->name, gp->chrom, gp->txStart, gp->txEnd, gp->strand); + if (preserveInput) + { + char *old = gp->name; + gp->name = extendNameWithPosition(gp->name, gp->chrom, gp->txStart, gp->txEnd, TRUE); + free(old); // genePredExtLoadAll uses cloneString to populate this + } bed = genePredToBed(gp); error = remapBlockedBed(chainHash, bed, minMatch, minBlocks, fudgeThick, multiple, db, chainTable); if (error) { fprintf(unmapped, "# %s\n", error); bedFree(&bed); genePredTabOut(gp, unmapped); } else { int exonCount = gp->exonCount; struct bed *bedList = bed; for (; bed != NULL; bed = bed->next) { @@ -1771,40 +1816,47 @@ } if (rangeList != NULL) { ns = rangeListToSample(rangeList, sample->chrom, sample->name, sample->score, sample->strand); fprintf(unmapped, "# Leftover %d of %d\n", ns->sampleCount, sample->sampleCount); sampleTabOut(ns, unmapped); sampleFree(&ns); slFreeList(&rangeList); } slFreeList(&binList); } void liftOverSample(char *fileName, struct hash *chainHash, double minMatch, double minBlocks, bool fudgeThick, - FILE *mapped, FILE *unmapped) + FILE *mapped, FILE *unmapped, bool preserveInput) /* Open up file, decide what type of bed it is, and lift it. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[9]; struct sample *sample; while (lineFileRow(lf, row)) { sample = sampleLoad(row); + if (preserveInput) + { + char *old = sample->name; + sample->name = extendNameWithPosition(sample->name, sample->chrom, + sample->chromStart, sample->chromEnd, TRUE); + free(old); + } remapSample(chainHash, sample, minBlocks, fudgeThick, mapped, unmapped); sampleFree(&sample); } lineFileClose(&lf); } struct liftOverChain *liftOverChainList() /* Get list of all liftOver chains in the central database */ { struct sqlConnection *conn = hConnectCentral(); struct liftOverChain *list = NULL; char query[1024]; sqlSafef(query, sizeof query, "select * from liftOverChain"); list = liftOverChainLoadByQuery(conn, query); hDisconnectCentral(&conn);