93bdf148d3a0408d7e668190c5046fd3b34eef90
galt
  Tue Feb 23 22:32:50 2021 -0800
Fixing some bugs in liftOver. Increased the maximum number of fields from 64 to 256. It now requires at least 3 fields, detects when the field count is not consistent across all rows, and detects when a line has too many fields. Fixes an annoying bug where BED input with only 1 or 2 fields would produce garbled output. refs #27023
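
For reference, a minimal standalone sketch of the checks this change introduces,
written in plain C rather than against the kent library (MAX_WORDS and the
chopTabs helper below are hypothetical stand-ins, not liftOver code): the word
array gets one spare slot so a field count above the maximum is detectable, the
first data row fixes the expected field count, and every later row must match it.

    #include <stdio.h>
    #include <string.h>

    #define MAX_WORDS 256   /* stand-in for LIFTOVER_MAX_WORDS */

    static int chopTabs(char *line, char *words[], int maxWords)
    /* Split line on tabs into at most maxWords words; return the count.
     * Hypothetical helper standing in for the kent chop routines. */
    {
    int count = 0;
    char *p = line;
    while (p != NULL && count < maxWords)
        {
        words[count++] = p;
        p = strchr(p, '\t');
        if (p != NULL)
            *p++ = '\0';
        }
    return count;
    }

    int main(void)
    {
    char line[8192];
    char *words[MAX_WORDS + 1];   /* one spare slot so overflow is detectable */
    int refCount = 0;             /* field count of the first data row */
    int lineIx = 0;
    while (fgets(line, sizeof line, stdin) != NULL)
        {
        lineIx++;
        line[strcspn(line, "\r\n")] = '\0';
        if (line[0] == '\0')
            continue;
        int wordCount = chopTabs(line, words, MAX_WORDS + 1);
        if (wordCount > MAX_WORDS)
            {
            fprintf(stderr, "line %d: more than %d fields\n", lineIx, MAX_WORDS);
            return 1;
            }
        if (wordCount < 3)
            {
            fprintf(stderr, "line %d: need at least chrom, start, end\n", lineIx);
            return 1;
            }
        if (refCount == 0)
            refCount = wordCount;   /* first row sets the expected count */
        else if (wordCount != refCount)
            {
            fprintf(stderr, "line %d: has %d fields, expected %d\n",
                    lineIx, wordCount, refCount);
            return 1;
            }
        }
    return 0;
    }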

diff --git src/hg/lib/liftOver.c src/hg/lib/liftOver.c
index 38f7a25..f2ecab2 100644
--- src/hg/lib/liftOver.c
+++ src/hg/lib/liftOver.c
@@ -27,31 +27,31 @@
     struct binKeeper *bk;       /* Keyed by old position, values are chains. */
     };
 
 static char otherStrand(char c)
 /* Swap +/- */
 {
 if (c == '-')
     return '+';
 else if (c == '+')
     return '-';
 else
     return c;
 }
 
 // The maximum number of words per line that can be lifted:
-#define LIFTOVER_MAX_WORDS 64
+#define LIFTOVER_MAX_WORDS 256
 
 void readLiftOverMap(char *fileName, struct hash *chainHash)
 /* Read map file into hashes. */
 {
 
 struct lineFile *lf;
 struct netParsedUrl *npu;
 if (udcIsLocal(fileName))
     lf = lineFileOpen(fileName, TRUE);
 else
     lf = netHttpLineFileMayOpen(fileName, &npu);
 
 struct chain *chain;
 struct chromMap *map;
 int chainCount = 0;
@@ -369,70 +369,79 @@
     wordCount = lineFileChopCharNext(lf, '\t', words, maxWord);
 else
     wordCount = lineFileChopNext(lf, words, maxWord);
 if (hasBin)
     {
     int i;
     wordCount--;
     for (i = 1; i <= wordCount; i++)
         words[i-1] = words[i];
     }
 if (wordCount <= 0)
     return 0;
 return wordCount;
 }
 
-static int bedOverSmallEnds(struct lineFile *lf,
+static int bedOverSmallEnds(struct lineFile *lf, int refCount,
                         struct hash *chainHash, double minMatch, int minSizeT, 
                         int minSizeQ, int minChainT, int minChainQ, 
                         FILE *mapped, FILE *unmapped, bool multiple, bool noSerial,
                         char *chainTable, int bedPlus, bool hasBin, 
 			bool tabSep, int ends, int *errCt)
 /* Do a bed without a block-list.
  * NOTE: it would be preferable to have all of the lift
  * functions work at the line level, rather than the file level.
  * Multiple option can be used with bed3 -- it will write a list of
  * regions as a bed4, where score is the "part #". This is used for
  * ENCODE region mapping */  
 {
 int i, wordCount, s, e;
-char *words[LIFTOVER_MAX_WORDS], *chrom;
+char *words[LIFTOVER_MAX_WORDS+1];   // +1 to detect overflow
+char *chrom;
 char strand = '.', strandString[2];
 char *error, *error2 = NULL;
 int ct = 0;
 int errs = 0;
 struct bed *bedList = NULL, *unmappedBedList = NULL;
 /* result lists for -ends option */
 struct bed *bedList2 = NULL, *unmappedBedList2 = NULL;
 int totalUnmapped = 0;
 double unmappedRatio;
 int totalUnmappedAll = 0;
 int totalBases = 0;
 double mappedRatio;
 char *region = NULL;   /* region name from BED file-- used with  -multiple */
 char regionBuf[2048];
 char *db = NULL, *chainTableName = NULL;
 
 if (chainTable)
     {
     chainTableName = chopPrefix(chainTable);
     db = chainTable;
     chopSuffix(chainTable);
     }
 while ((wordCount = 
             lineFileChopBin(lf, words, ArraySize(words), hasBin, tabSep)) != 0)
     {
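+    /* New sanity checks: a BED line needs at least chrom, start, and end,
+     * and its field count must match the first data line's count (refCount). */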
+    if (wordCount < 3)
+	errAbort(
+	"ERROR: At least 3 fields (chrom, start, end) are required on line %d of bed file %s\n",
+	    lf->lineIx, lf->fileName);
+    if (wordCount != refCount)
+	errAbort(
+	"ERROR: Found %s%d fields but expected %d fields on line %d of bed file %s\n",
+	    (wordCount > LIFTOVER_MAX_WORDS) ? "at least ":"", wordCount, refCount, lf->lineIx, lf->fileName);
     FILE *f = mapped;
     chrom = words[0];
     s = lineFileNeedFullNum(lf, words, 1);
     e = lineFileNeedFullNum(lf, words, 2);
     bool useThick = FALSE;
     int thickStart = 0, thickEnd = 0;
     int afterS = s + ends;
     int beforeE = e - ends;
     bool doEnds = ((ends > 0) && (beforeE > afterS));
     if (s > e)
 	errAbort(
 	"ERROR: start coordinate is after end coordinate (chromStart > chromEnd) on line %d of bed file %s\nERROR: %s %d %d", 
 	    lf->lineIx, lf->fileName, chrom, s, e);
     if (multiple)
         {
@@ -1122,47 +1131,51 @@
         error = NULL;
         }
     }
 bedFree(&bedCopy);
 slFreeList(&binList);
 return error;
 }
 
 static int bedOverBig(struct lineFile *lf, int refCount, 
                     struct hash *chainHash, double minMatch, double minBlocks,
                     bool fudgeThick, FILE *mapped, FILE *unmapped, bool multiple, char *chainTable,
                     int bedPlus, bool hasBin, bool tabSep, int *errCt)
 /* Do a bed with block-list. */
 {
 int wordCount, bedCount;
-char *line, *words[LIFTOVER_MAX_WORDS];
+char *line, *words[LIFTOVER_MAX_WORDS+1];  // plus one so it can detect overflow past the end.
 char *whyNot = NULL;
 int ct = 0;
 int errs = 0;
 int i;
 char *db = NULL, *chainTableName = NULL;
 
 if (chainTable)
     {
     chainTableName = chopPrefix(chainTable);
     db = chainTable;
     chopSuffix(chainTable);
     }
 while (lineFileNextReal(lf, &line))
     {
     struct bed *bed;
     wordCount = chopLineBin(line, words, ArraySize(words), hasBin, tabSep);
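+    /* A blocked BED line still needs at least chrom, start, and end. */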
+    if (wordCount < 3)
+        errAbort(
+        "ERROR: At least 3 fields (chrom, start, end) are required on line %d of bed file %s\n",
+            lf->lineIx, lf->fileName);
     if (refCount != wordCount)
 	lineFileExpectWords(lf, refCount, wordCount);
     if (wordCount == bedPlus)
         bedPlus = 0;    /* no extra fields */
     bedCount = (bedPlus ? bedPlus : wordCount);
     bed = bedLoadN(words, bedCount);
     whyNot = remapBlockedBed(chainHash, bed, minMatch, minBlocks, fudgeThick,
                              multiple, db, chainTableName);
     if (whyNot == NULL)
 	{
         struct bed *bedList = bed;
         for (;  bed != NULL;  bed = bed->next)
             {
             if (hasBin)
                 fprintf(mapped, "%d\t", 
@@ -1196,51 +1209,56 @@
 
 
 
 int liftOverBedPlusEnds(char *fileName, struct hash *chainHash, double minMatch,  
                     double minBlocks, int minSizeT, int minSizeQ, int minChainT,
                     int minChainQ, bool fudgeThick, FILE *f, FILE *unmapped, 
                     bool multiple, bool noSerial, char *chainTable, int bedPlus, bool hasBin,
                     bool tabSep, int ends, int *errCt)
 /* Lift bed N+ file.
  * Return the number of records successfully converted */
 {
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 int wordCount;
 int bedFieldCount = bedPlus;
 char *line;
-char *words[LIFTOVER_MAX_WORDS];
 int ct = 0;
 
 if (lineFileNextReal(lf, &line))
     {
     line = cloneString(line);
     if (tabSep)
-        wordCount = chopByChar(line, '\t', words, ArraySize(words));
+        wordCount = chopByChar(line, '\t', NULL, LIFTOVER_MAX_WORDS);
     else
-        wordCount = chopLine(line, words);
+        wordCount = chopLine(line, NULL);
+
+    if (wordCount > LIFTOVER_MAX_WORDS)
+	errAbort("Too many fields: found %d fields but the maximum is %d in file %s",
+	    wordCount, LIFTOVER_MAX_WORDS, fileName);
+
     if (hasBin)
         wordCount--;
     lineFileReuse(lf);
     freez(&line);
     if (wordCount < 3)
 	 errAbort("Data format error: expecting at least 3 fields in BED file (%s)", fileName);
     if (bedFieldCount == 0)
         bedFieldCount = wordCount;
     if (bedFieldCount <= 10)
 	{
-        ct = bedOverSmallEnds(lf, chainHash, minMatch,
+        ct = bedOverSmallEnds(lf, wordCount, chainHash, minMatch,
                               minSizeT, minSizeQ, minChainT, minChainQ, f, unmapped, 
                               multiple, noSerial, chainTable, bedPlus, hasBin, tabSep, ends, errCt);
 	}
     else if (ends)
 	errAbort("Cannot use -ends with blocked BED\n");
     else
 	 ct = bedOverBig(lf, wordCount, chainHash, minMatch, minBlocks, 
                          fudgeThick, f, unmapped, multiple, chainTable,
                          bedPlus, hasBin, tabSep, errCt);
     }
 lineFileClose(&lf);
 return ct;
 }
 
 int liftOverBedPlus(char *fileName, struct hash *chainHash, double minMatch,