8f29deda5ff897fbf1d208907ac7f734afc07ed2
jcasper
  Tue Oct 1 20:31:17 2024 -0700
Many liftOver functions now have a preserveInput argument, which appends the input
position to the name field.  This makes it easier to see what got mapped where.  The option is made available through a
command-line argument to the liftOver utility and a checkbox in the hgLiftOver CGI, refs #28023

diff --git src/hg/lib/liftOver.c src/hg/lib/liftOver.c
index 429199c..11e1626 100644
--- src/hg/lib/liftOver.c
+++ src/hg/lib/liftOver.c
@@ -26,30 +26,52 @@
     char *name;                 /* Chromosome name. */
     struct binKeeper *bk;       /* Keyed by old position, values are chains. */
     };
 
 static char otherStrand(char c)
 /* Swap +/- */
 {
 if (c == '-')
     return '+';
 else if (c == '+')
     return '-';
 else
     return c;
 }
 
+
+char *extendNameWithPosition(char *name, char *chrom, int s, int e, bool prepend)
+/* Extend an item's name with a position, intended for allowing liftOver to preserve a
+ * sign of where a lifted item was mapped from.  If prepend is set, the position is placed
+ * before the name.  Otherwise after.  Input coordinates are expected to be BED (0-based),
+ * and the written position is 1-based (chr:s-e).
+ * The old name can then be freed (if appropriate - some loader routines just split a
+ * string with \0s and use the pieces in place).
+ */
+{
+char pos[2048], *new = NULL;
+safef(pos, sizeof(pos), "%s:%d-%d", chrom, s+1, e);
+if (name == NULL)
+    new = cloneString(pos);
+else if (prepend)
+    new = catThreeStrings(pos, ":", name);
+else
+    new = catThreeStrings(name, ":", pos);
+return new;
+}
+
+
 // The maximum number of words per line that can be lifted:
 #define LIFTOVER_MAX_WORDS 2048
 
 void liftOverAddChainHash(struct hash *chainHash, struct chain *chain)
 /* Add this chain to the hash of chains used by remapBlockedBed */
 {       
 struct chromMap *map;
         
 if ((map = hashFindVal(chainHash, chain->tName)) == NULL)
     {   
     AllocVar(map);
     map->bk = binKeeperNew(0, chain->tSize);
     hashAddSaveName(chainHash, chain->tName, map, &map->name);
     }
 binKeeperAdd(map->bk, chain->tStart, chain->tEnd, chain);
@@ -396,31 +418,31 @@
     int i;
     wordCount--;
     for (i = 1; i <= wordCount; i++)
         words[i-1] = words[i];
     }
 if (wordCount <= 0)
     return 0;
 return wordCount;
 }
 
 static int bedOverSmallEnds(struct lineFile *lf, int refCount,
                         struct hash *chainHash, double minMatch, int minSizeT, 
                         int minSizeQ, int minChainT, int minChainQ, 
                         FILE *mapped, FILE *unmapped, bool multiple, bool noSerial,
                         char *chainTable, int bedPlus, bool hasBin, 
-			bool tabSep, int ends, int *errCt)
+			bool tabSep, int ends, int *errCt, bool preserveInput)
 /* Do a bed without a block-list.
  * NOTE: it would be preferable to have all of the lift
  * functions work at the line level, rather than the file level.
  * Multiple option can be used with bed3 -- it will write a list of
  * regions as a bed4, where score is the "part #". This is used for
  * ENCODE region mapping */  
 {
 int i, wordCount, s, e;
 char *words[LIFTOVER_MAX_WORDS+1];   // +1 to detect overflow
 char *chrom;
 char strand = '.', strandString[2];
 char *error, *error2 = NULL;
 int ct = 0;
 int errs = 0;
 struct bed *bedList = NULL, *unmappedBedList = NULL;
@@ -453,30 +475,33 @@
 	"ERROR: Has %s%d fields, should have %d fields on line %d of bed file %s\n", 
 	    (wordCount > LIFTOVER_MAX_WORDS) ? "at least ":"", wordCount, refCount, lf->lineIx, lf->fileName);
     FILE *f = mapped;
     chrom = words[0];
     s = lineFileNeedFullNum(lf, words, 1);
     e = lineFileNeedFullNum(lf, words, 2);
     bool useThick = FALSE;
     int thickStart = 0, thickEnd = 0;
     int afterS = s + ends;
     int beforeE = e - ends;
     bool doEnds = ((ends > 0) && (beforeE > afterS));
     if (s > e)
 	errAbort(
 	"ERROR: start coordinate is after end coordinate (chromStart > chromEnd) on line %d of bed file %s\nERROR: %s %d %d", 
 	    lf->lineIx, lf->fileName, chrom, s, e);
+    if (wordCount > 3 && preserveInput)
+        // Extend item names, if present, with chrom:(start+1)-end
+        words[3] = extendNameWithPosition(words[3], chrom, s, e, TRUE);
     if (multiple)
         {
         if (wordCount > 3)
             region = words[3];
         else
             {
             safef(regionBuf, sizeof(regionBuf), "%s:%d-%d", words[0], s+1, e);
             region = regionBuf;
             }
         }
     if (wordCount >= 6 && (bedPlus == 0 || bedPlus >= 6))
 	strand = words[5][0];
     if (wordCount >= 8 && (bedPlus == 0 || bedPlus >= 8))
 	{
 	useThick = TRUE;
@@ -633,32 +658,33 @@
 }
 
 static int gffNeedNum(struct lineFile *lf, char *s)
 /* Convert s to an integer or die trying. */
 {
 char c = *s;
 if (isdigit(c) || c == '-')
     return atoi(s);
 else
     errAbort("Expecting number line %d of %s", lf->lineIx, lf->fileName);
 return 0;
 }
 
 void liftOverGff(char *fileName, struct hash *chainHash, 
                                 double minMatch, double minBlocks, 
-                                FILE *mapped, FILE *unmapped)
-/* Lift over GFF file */
+                                FILE *mapped, FILE *unmapped, bool preserveInput)
+/* Lift over GFF file, with an option to preserve the input position by
+ * appending it to the source */
 {
 char *error = NULL;
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char c, *s, *line, *word;
 char *seq, *source, *feature;
 int start, end;
 char *score, *strand;
 FILE *f;
 
 while (lineFileNext(lf, &line, NULL))
     {
     /* Pass through blank lines and those that start with a sharp. */
     s = skipLeadingSpaces(line);
     c = *s;
     if (c == '#' || c == 0)
@@ -672,30 +698,33 @@
 	shortGffLine(lf);
     if ((feature = nextWord(&s)) == NULL)
 	shortGffLine(lf);
     if ((word = nextWord(&s)) == NULL)
 	shortGffLine(lf);
     start = gffNeedNum(lf, word) - 1;
     if ((word = nextWord(&s)) == NULL)
 	shortGffLine(lf);
     end = gffNeedNum(lf, word);
     if ((score = nextWord(&s)) == NULL)
 	shortGffLine(lf);
     if ((strand = nextWord(&s)) == NULL)
 	shortGffLine(lf);
     s = skipLeadingSpaces(s);
 
+    if (preserveInput)
+        source = extendNameWithPosition(source, seq, start, end, FALSE);
+
     /* Convert seq/start/end/strand. */
     error = liftOverRemapRange(chainHash, minMatch, seq, start, end, *strand,
 	                minMatch, &seq, &start, &end, strand);
     f = mapped;
     if (error != NULL)
 	{
 	f = unmapped;
 	fprintf(f, "# %s\n", error);
 	}
     fprintf(f, "%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n",
 	seq, source, feature, start+1, end, score, strand, s);
     }
 }
 
 struct liftRange
@@ -1169,61 +1198,69 @@
         freez(&bed->expScores);
         freez(&bed->label);
         memcpy(bed, splicedBed, sizeof(struct bed));
         freez(&splicedBed);
         error = NULL;
         }
     }
 bedFree(&bedCopy);
 slFreeList(&binList);
 return error;
 }
 
 static int bedOverBig(struct lineFile *lf, int refCount, 
                     struct hash *chainHash, double minMatch, double minBlocks,
                     bool fudgeThick, FILE *mapped, FILE *unmapped, bool multiple, char *chainTable,
-                    int bedPlus, bool hasBin, bool tabSep, int *errCt)
+                    int bedPlus, bool hasBin, bool tabSep, int *errCt, bool preserveInput)
 /* Do a bed with block-list. */
 {
 int wordCount, bedCount;
 char *line, *words[LIFTOVER_MAX_WORDS+1];  // plus one so it can detect overflow past the end.
 char *whyNot = NULL;
 int ct = 0;
 int errs = 0;
 int i;
 char *db = NULL, *chainTableName = NULL;
 
 if (chainTable)
     {
     chainTableName = chopPrefix(chainTable);
     db = chainTable;
     chopSuffix(chainTable);
     }
 while (lineFileNextReal(lf, &line))
     {
     struct bed *bed;
     wordCount = chopLineBin(line, words, ArraySize(words), hasBin, tabSep);
     if (wordCount < 3)
         errAbort(
         "ERROR: At least 3 fields are required, chrom, start, end on line %d of bed file %s\n",
             lf->lineIx, lf->fileName);
     if (refCount != wordCount)
 	lineFileExpectWords(lf, refCount, wordCount);
     if (wordCount == bedPlus)
         bedPlus = 0;    /* no extra fields */
     bedCount = (bedPlus ? bedPlus : wordCount);
     bed = bedLoadN(words, bedCount);
+
+    if (wordCount > 3 && preserveInput)
+        {
+        char *old = bed->name;
+        bed->name = extendNameWithPosition(old, bed->chrom, bed->chromStart, bed->chromEnd, TRUE);
+        free(old); // bedLoadN uses cloneString to copy the name
+        }
+
     whyNot = remapBlockedBed(chainHash, bed, minMatch, minBlocks, fudgeThick,
                              multiple, db, chainTableName);
     if (whyNot == NULL)
 	{
         struct bed *bedList = bed;
         for (;  bed != NULL;  bed = bed->next)
             {
             if (hasBin)
                 fprintf(mapped, "%d\t", 
                         binFromRange(bed->chromStart, bed->chromEnd));
             bedOutputN(bed, bedCount, mapped, '\t', 
                        (bedCount != wordCount) ? '\t':'\n');
             /* print extra "non-bed" fields in line */
             for (i = bedCount; i < wordCount; i++)
                 fprintf(mapped, "%s%c", words[i], (i == wordCount-1) ? '\n':'\t');
@@ -1243,31 +1280,31 @@
         errs++;
 	}
     bedFree(&bed);
     }
 if (errCt)
     *errCt = errs;
 return ct;
 }
 
 
 
 int liftOverBedPlusEnds(char *fileName, struct hash *chainHash, double minMatch,  
                     double minBlocks, int minSizeT, int minSizeQ, int minChainT,
                     int minChainQ, bool fudgeThick, FILE *f, FILE *unmapped, 
                     bool multiple, bool noSerial, char *chainTable, int bedPlus, bool hasBin,
-                    bool tabSep, int ends, int *errCt)
+                    bool tabSep, int ends, int *errCt, bool preserveInput)
 /* Lift bed N+ file.
  * Return the number of records successfully converted */
 {
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 int wordCount;
 int bedFieldCount = bedPlus;
 char *line;
 int ct = 0;
 
 if (lineFileNextReal(lf, &line))
     {
     line = cloneString(line);
     if (tabSep)
 	{
         wordCount = chopByChar(line, '\t', NULL, LIFTOVER_MAX_WORDS);
@@ -1278,70 +1315,72 @@
     if (wordCount > LIFTOVER_MAX_WORDS)
 	errAbort("Too many fields. Fieldcount %d > maximum fields %d in file %s", wordCount, LIFTOVER_MAX_WORDS, fileName);
 
     if (hasBin)
         wordCount--;
     lineFileReuse(lf);
     freez(&line);
     if (wordCount < 3)
 	 errAbort("Data format error: expecting at least 3 fields in BED file (%s)", fileName);
     if (bedFieldCount == 0)
         bedFieldCount = wordCount;
     if (bedFieldCount <= 10)
 	{
         ct = bedOverSmallEnds(lf, wordCount, chainHash, minMatch,
                               minSizeT, minSizeQ, minChainT, minChainQ, f, unmapped, 
-                              multiple, noSerial, chainTable, bedPlus, hasBin, tabSep, ends, errCt);
+                              multiple, noSerial, chainTable, bedPlus, hasBin, tabSep, ends, errCt,
+                              preserveInput);
 	}
     else if (ends)
 	errAbort("Cannot use -ends with blocked BED\n");
     else
 	 ct = bedOverBig(lf, wordCount, chainHash, minMatch, minBlocks, 
                          fudgeThick, f, unmapped, multiple, chainTable,
-                         bedPlus, hasBin, tabSep, errCt);
+                         bedPlus, hasBin, tabSep, errCt, preserveInput);
     }
 lineFileClose(&lf);
 return ct;
 }
 
 int liftOverBedPlus(char *fileName, struct hash *chainHash, double minMatch,  
                     double minBlocks, int minSizeT, int minSizeQ, int minChainT,
                     int minChainQ, bool fudgeThick, FILE *f, FILE *unmapped, 
                     bool multiple, bool noSerial, char *chainTable, int bedPlus, bool hasBin,
-                    bool tabSep, int *errCt)
+                    bool tabSep, int *errCt, bool preserveInput)
 /* Lift bed N+ file.
  * Return the number of records successfully converted */
 {
 return liftOverBedPlusEnds(fileName, chainHash, minMatch, minBlocks,
                         minSizeT, minSizeQ, minChainT, minChainQ,
                         fudgeThick, f, unmapped, multiple, noSerial, chainTable,
-			bedPlus, hasBin, tabSep, 0, errCt);
+			bedPlus, hasBin, tabSep, 0, errCt, preserveInput);
 }
 
 int liftOverBed(char *fileName, struct hash *chainHash, 
                         double minMatch,  double minBlocks, 
                         int minSizeT, int minSizeQ,
                         int minChainT, int minChainQ,
                         bool fudgeThick, FILE *f, FILE *unmapped, 
-                        bool multiple, bool noSerial, char *chainTable, int *errCt)
+                        bool multiple, bool noSerial, char *chainTable, int *errCt,
+                        bool preserveInput)
 /* Open up file, decide what type of bed it is, and lift it. 
  * Return the number of records successfully converted */
 {
 return liftOverBedPlus(fileName, chainHash, minMatch, minBlocks,
                         minSizeT, minSizeQ, minChainT, minChainQ,
                         fudgeThick, f, unmapped, multiple, noSerial, chainTable,
-                        0, FALSE, FALSE, errCt);
+                        0, FALSE, FALSE, errCt, preserveInput);
 }
 
 #define LIFTOVER_FILE_PREFIX    "liftOver"
 #define BEDSTART_TO_POSITION(coord)     (coord+1)
 
 int liftOverPositions(char *fileName, struct hash *chainHash, 
                         double minMatch,  double minBlocks, 
                         int minSizeT, int minSizeQ,
                         int minChainT, int minChainQ,
 		        bool fudgeThick, FILE *mapped, FILE *unmapped, 
 		        bool multiple, char *chainTable, int *errCt)
 /* Create bed file from positions (chrom:start-end) and lift.
  * Then convert back to positions.  (TODO: line-by-line instead of by file)
  * Return the number of records successfully converted */
 {
@@ -1373,31 +1412,31 @@
         fprintf(bedFile, "%s\n", line);
         }
     freez(&line);
     }
 carefulClose(&bedFile);
 chmod(bedTn.forCgi, 0666);
 lineFileClose(&lf);
 
 /* Set up temp bed files for output, and lift to those */
 makeTempName(&mappedBedTn, LIFTOVER_FILE_PREFIX, ".bedmapped");
 makeTempName(&unmappedBedTn, LIFTOVER_FILE_PREFIX, ".bedunmapped");
 mappedBed = mustOpen(mappedBedTn.forCgi, "w");
 unmappedBed = mustOpen(unmappedBedTn.forCgi, "w");
 ct = liftOverBed(bedTn.forCgi, chainHash, minMatch, minBlocks,
                  minSizeT, minSizeQ, minChainT, minChainQ, fudgeThick,
-                 mappedBed, unmappedBed, multiple, TRUE, chainTable, errCt);
+                 mappedBed, unmappedBed, multiple, TRUE, chainTable, errCt, FALSE);
 carefulClose(&mappedBed);
 chmod(mappedBedTn.forCgi, 0666);
 carefulClose(&unmappedBed);
 chmod(unmappedBedTn.forCgi, 0666);
 lineFileClose(&lf);
 
 /* Convert output files back to positions */
 lf = lineFileOpen(mappedBedTn.forCgi, TRUE);
 while ((wordCount = lineFileChop(lf, words)) != 0)
     {
     chrom = words[0];
     start = lineFileNeedNum(lf, words, 1);
     end = lineFileNeedNum(lf, words, 2);
     fprintf(mapped, "%s:%d-%d\n", chrom, BEDSTART_TO_POSITION(start), end);
     }
@@ -1470,31 +1509,31 @@
                         int minSizeT, int minSizeQ,
                         int minChainT, int minChainQ,
 		        bool fudgeThick, FILE *mapped, FILE *unmapped, 
                         bool multiple, bool noSerial, char *chainTable, int *errCt)
 /* Sniff the first line of the file, and determine whether it's a */
 /* bed, a positions file, or neither. */
 {
 enum liftOverFileType lft = liftOverSniff(fileName);
 if (lft == positions)
     return liftOverPositions(fileName, chainHash, minMatch, minBlocks, minSizeT, 
 			     minSizeQ, minChainT, minChainQ, fudgeThick, mapped, unmapped,
 			     multiple, chainTable, errCt);
 if (lft == bed)
     return liftOverBed(fileName, chainHash, minMatch, minBlocks, minSizeT, 
 			     minSizeQ, minChainT, minChainQ, fudgeThick, mapped, unmapped,
-                             multiple, noSerial, chainTable, errCt);
+                             multiple, noSerial, chainTable, errCt, FALSE);
 return -1;
 }
 
 static char *remapBlockedPsl(struct hash *chainHash, struct psl *psl, 
                             double minMatch, double minBlocks, bool fudgeThick)
 /* Remap blocks in psl, and also chromStart/chromEnd.  
  * Return NULL on success, an error string on failure. */
 {
 struct chain *chainList = NULL,  *chain;
 int pslSize = sumPslBlocks(psl);
 struct binElement *binList;
 struct binElement *el;
 struct liftRange *rangeList, *badRanges = NULL, *range;
 char *error = NULL;
 int i, start, end = 0;
@@ -1621,42 +1660,48 @@
 bed->blockCount = count = gp->exonCount;
 AllocArray(bed->blockSizes, count);
 AllocArray(bed->chromStarts, count);
 for (i=0; i<count; ++i)
     {
     int s = gp->exonStarts[i];
     int e = gp->exonEnds[i];
     bed->blockSizes[i] = e - s;
     bed->chromStarts[i] = s - start;
     }
 return bed;
 }
 
 void liftOverGenePred(char *fileName, struct hash *chainHash, 
                         double minMatch, double minBlocks, bool fudgeThick,
-                      FILE *mapped, FILE *unmapped, boolean multiple)
+                      FILE *mapped, FILE *unmapped, boolean multiple, bool preserveInput)
 /* Lift over file in genePred format. */
 {
 char *db = NULL, *chainTable = NULL;
 
 struct bed *bed;
 struct genePred *gp = NULL;
 char *error;
 struct genePred *gpList = genePredExtLoadAll(fileName);
 for (gp = gpList ; gp != NULL ; gp = gp->next)
     {
     // uglyf("%s %s %d %d %s\n", gp->name, gp->chrom, gp->txStart, gp->txEnd, gp->strand);
+    if (preserveInput)
+        {
+        char *old = gp->name;
+        gp->name = extendNameWithPosition(gp->name, gp->chrom, gp->txStart, gp->txEnd, TRUE);
+        free(old); // genePredExtLoadAll uses cloneString to populate this
+        }
     bed = genePredToBed(gp);
     error = remapBlockedBed(chainHash, bed, minMatch, minBlocks, fudgeThick,
                             multiple, db, chainTable);
     if (error)
 	{
 	fprintf(unmapped, "# %s\n", error);
         bedFree(&bed);
         genePredTabOut(gp, unmapped);
 	}
    else
 	{
         int exonCount = gp->exonCount;
         struct bed *bedList = bed;
         for (;  bed != NULL;  bed = bed->next)
             {
@@ -1771,40 +1816,47 @@
     }
 if (rangeList != NULL)
     {
     ns = rangeListToSample(rangeList, sample->chrom, sample->name,
     	sample->score, sample->strand);
     fprintf(unmapped, "# Leftover %d of %d\n", ns->sampleCount, sample->sampleCount);
     sampleTabOut(ns, unmapped);
     sampleFree(&ns);
     slFreeList(&rangeList);
     }
 slFreeList(&binList);
 }
 
 void liftOverSample(char *fileName, struct hash *chainHash, 
                         double minMatch, double minBlocks, bool fudgeThick,
-                        FILE *mapped, FILE *unmapped)
+                        FILE *mapped, FILE *unmapped, bool preserveInput)
 /* Open up file, decide what type of bed it is, and lift it. */
 {
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *row[9];
 struct sample *sample;
 
 while (lineFileRow(lf, row))
     {
     sample = sampleLoad(row);
+    if (preserveInput)
+        {
+        char *old = sample->name;
+        sample->name = extendNameWithPosition(sample->name, sample->chrom,
+                sample->chromStart, sample->chromEnd, TRUE);
+        free(old);
+        }
     remapSample(chainHash, sample, minBlocks, fudgeThick, mapped, unmapped);
     sampleFree(&sample);
     }
 lineFileClose(&lf);
 }
 
 struct liftOverChain *liftOverChainList()
 /* Get list of all liftOver chains in the central database */
 {
 struct sqlConnection *conn = hConnectCentral();
 struct liftOverChain *list = NULL;
 char query[1024];
 sqlSafef(query, sizeof query, "select * from liftOverChain");
 list = liftOverChainLoadByQuery(conn, query);
 hDisconnectCentral(&conn);