93bdf148d3a0408d7e668190c5046fd3b34eef90 galt Tue Feb 23 22:32:50 2021 -0800 Fixing some bugs in liftOver. Increased max fields from 64 to 256. Now it requires 3 or more fields. It detects if the field counts are not consistent for all rows. And it detects if there are too many fields. Fixes annoying bug in bed 1 or bed 2 that would confabulate output. refs #27023 diff --git src/hg/lib/liftOver.c src/hg/lib/liftOver.c index 38f7a25..f2ecab2 100644 --- src/hg/lib/liftOver.c +++ src/hg/lib/liftOver.c @@ -27,31 +27,31 @@ struct binKeeper *bk; /* Keyed by old position, values are chains. */ }; static char otherStrand(char c) /* Swap +/- */ { if (c == '-') return '+'; else if (c == '+') return '-'; else return c; } // The maximum number of words per line that can be lifted: -#define LIFTOVER_MAX_WORDS 64 +#define LIFTOVER_MAX_WORDS 256 void readLiftOverMap(char *fileName, struct hash *chainHash) /* Read map file into hashes. */ { struct lineFile *lf; struct netParsedUrl *npu; if (udcIsLocal(fileName)) lf = lineFileOpen(fileName, TRUE); else lf = netHttpLineFileMayOpen(fileName, &npu); struct chain *chain; struct chromMap *map; int chainCount = 0; @@ -369,70 +369,79 @@ wordCount = lineFileChopCharNext(lf, '\t', words, maxWord); else wordCount = lineFileChopNext(lf, words, maxWord); if (hasBin) { int i; wordCount--; for (i = 1; i <= wordCount; i++) words[i-1] = words[i]; } if (wordCount <= 0) return 0; return wordCount; } -static int bedOverSmallEnds(struct lineFile *lf, +static int bedOverSmallEnds(struct lineFile *lf, int refCount, struct hash *chainHash, double minMatch, int minSizeT, int minSizeQ, int minChainT, int minChainQ, FILE *mapped, FILE *unmapped, bool multiple, bool noSerial, char *chainTable, int bedPlus, bool hasBin, bool tabSep, int ends, int *errCt) /* Do a bed without a block-list. * NOTE: it would be preferable to have all of the lift * functions work at the line level, rather than the file level. * Multiple option can be used with bed3 -- it will write a list of * regions as a bed4, where score is the "part #". This is used for * ENCODE region mapping */ { int i, wordCount, s, e; -char *words[LIFTOVER_MAX_WORDS], *chrom; +char *words[LIFTOVER_MAX_WORDS+1]; // +1 to detect overflow +char *chrom; char strand = '.', strandString[2]; char *error, *error2 = NULL; int ct = 0; int errs = 0; struct bed *bedList = NULL, *unmappedBedList = NULL; /* result lists for -ends option */ struct bed *bedList2 = NULL, *unmappedBedList2 = NULL; int totalUnmapped = 0; double unmappedRatio; int totalUnmappedAll = 0; int totalBases = 0; double mappedRatio; char *region = NULL; /* region name from BED file-- used with -multiple */ char regionBuf[2048]; char *db = NULL, *chainTableName = NULL; if (chainTable) { chainTableName = chopPrefix(chainTable); db = chainTable; chopSuffix(chainTable); } while ((wordCount = lineFileChopBin(lf, words, ArraySize(words), hasBin, tabSep)) != 0) { + if (wordCount < 3) + errAbort( + "ERROR: At least 3 fields are required, chrom, start, end on line %d of bed file %s\n", + lf->lineIx, lf->fileName); + if (wordCount != refCount) + errAbort( + "ERROR: Has %s%d fields, should have %d fields on line %d of bed file %s\n", + (wordCount > LIFTOVER_MAX_WORDS) ? "at least ":"", wordCount, refCount, lf->lineIx, lf->fileName); FILE *f = mapped; chrom = words[0]; s = lineFileNeedFullNum(lf, words, 1); e = lineFileNeedFullNum(lf, words, 2); bool useThick = FALSE; int thickStart = 0, thickEnd = 0; int afterS = s + ends; int beforeE = e - ends; bool doEnds = ((ends > 0) && (beforeE > afterS)); if (s > e) errAbort( "ERROR: start coordinate is after end coordinate (chromStart > chromEnd) on line %d of bed file %s\nERROR: %s %d %d", lf->lineIx, lf->fileName, chrom, s, e); if (multiple) { @@ -1122,47 +1131,51 @@ error = NULL; } } bedFree(&bedCopy); slFreeList(&binList); return error; } static int bedOverBig(struct lineFile *lf, int refCount, struct hash *chainHash, double minMatch, double minBlocks, bool fudgeThick, FILE *mapped, FILE *unmapped, bool multiple, char *chainTable, int bedPlus, bool hasBin, bool tabSep, int *errCt) /* Do a bed with block-list. */ { int wordCount, bedCount; -char *line, *words[LIFTOVER_MAX_WORDS]; +char *line, *words[LIFTOVER_MAX_WORDS+1]; // plus one so it can detect overflow past the end. char *whyNot = NULL; int ct = 0; int errs = 0; int i; char *db = NULL, *chainTableName = NULL; if (chainTable) { chainTableName = chopPrefix(chainTable); db = chainTable; chopSuffix(chainTable); } while (lineFileNextReal(lf, &line)) { struct bed *bed; wordCount = chopLineBin(line, words, ArraySize(words), hasBin, tabSep); + if (wordCount < 3) + errAbort( + "ERROR: At least 3 fields are required, chrom, start, end on line %d of bed file %s\n", + lf->lineIx, lf->fileName); if (refCount != wordCount) lineFileExpectWords(lf, refCount, wordCount); if (wordCount == bedPlus) bedPlus = 0; /* no extra fields */ bedCount = (bedPlus ? bedPlus : wordCount); bed = bedLoadN(words, bedCount); whyNot = remapBlockedBed(chainHash, bed, minMatch, minBlocks, fudgeThick, multiple, db, chainTableName); if (whyNot == NULL) { struct bed *bedList = bed; for (; bed != NULL; bed = bed->next) { if (hasBin) fprintf(mapped, "%d\t", @@ -1196,51 +1209,56 @@ int liftOverBedPlusEnds(char *fileName, struct hash *chainHash, double minMatch, double minBlocks, int minSizeT, int minSizeQ, int minChainT, int minChainQ, bool fudgeThick, FILE *f, FILE *unmapped, bool multiple, bool noSerial, char *chainTable, int bedPlus, bool hasBin, bool tabSep, int ends, int *errCt) /* Lift bed N+ file. * Return the number of records successfully converted */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); int wordCount; int bedFieldCount = bedPlus; char *line; -char *words[LIFTOVER_MAX_WORDS]; int ct = 0; if (lineFileNextReal(lf, &line)) { line = cloneString(line); if (tabSep) - wordCount = chopByChar(line, '\t', words, ArraySize(words)); + { + wordCount = chopByChar(line, '\t', NULL, LIFTOVER_MAX_WORDS); + } else - wordCount = chopLine(line, words); + wordCount = chopLine(line, NULL); + + if (wordCount > LIFTOVER_MAX_WORDS) + errAbort("Too many fields. Fieldcount %d > maximum fields %d in file %s", wordCount, LIFTOVER_MAX_WORDS, fileName); + if (hasBin) wordCount--; lineFileReuse(lf); freez(&line); if (wordCount < 3) errAbort("Data format error: expecting at least 3 fields in BED file (%s)", fileName); if (bedFieldCount == 0) bedFieldCount = wordCount; if (bedFieldCount <= 10) { - ct = bedOverSmallEnds(lf, chainHash, minMatch, + ct = bedOverSmallEnds(lf, wordCount, chainHash, minMatch, minSizeT, minSizeQ, minChainT, minChainQ, f, unmapped, multiple, noSerial, chainTable, bedPlus, hasBin, tabSep, ends, errCt); } else if (ends) errAbort("Cannot use -ends with blocked BED\n"); else ct = bedOverBig(lf, wordCount, chainHash, minMatch, minBlocks, fudgeThick, f, unmapped, multiple, chainTable, bedPlus, hasBin, tabSep, errCt); } lineFileClose(&lf); return ct; } int liftOverBedPlus(char *fileName, struct hash *chainHash, double minMatch,