5dd57ba4756985d35f8ca72756d963b77934fcf0 angie Thu Jun 7 16:08:53 2018 -0700 Expand support for -multiple to BED 3 to 12 (instead of 4 to 6). refs #18853 Added a -noSerial option to prevent -multiple from forcing the score field to contain a serial number for BED without blocks. Also fixed a bug with the loop on chains overwriting args thickStart and thickEnd. Todo: genePred. Probably won't bother doing PSL because pslMap should be used for that. Also removed some very old #ifdef'd code. diff --git src/hg/lib/liftOver.c src/hg/lib/liftOver.c index e1b8adc..0f862fa 100644 --- src/hg/lib/liftOver.c +++ src/hg/lib/liftOver.c @@ -241,61 +241,62 @@ struct chain *next = chain->next; chain = chainLoadIdRange(db, chainTableName, chrom, s, e, chain->id); chain->next = next; verbose(3,"chain from db %s:%d %s:%d-%d %c (%d)\n", chain->tName, chain->tStart, chain->qName, chain->qStart, chain->qEnd, chain->qStrand, chain->id); } } if (!mapThroughChain(chain, minRatio, &start, &end, &subChain, &toFree)) errAbort("Chain mapping error: %s:%d-%d\n", chain->qName, start, end); if (chain->qStrand == '-') strand = otherStrand(qStrand); else strand = qStrand; + int mappedThickStart = thickStart, mappedThickEnd = thickEnd; if (useThick) { struct chain *subChain2 = NULL; struct chain *toFree2 = NULL; - if (!mapThroughChain(chain, minRatio, &thickStart, &thickEnd, + if (!mapThroughChain(chain, minRatio, &mappedThickStart, &mappedThickEnd, &subChain2, &toFree2)) - thickStart = thickEnd = start; + mappedThickStart = mappedThickEnd = start; chainFree(&toFree2); } verbose(3, "mapped %s:%d-%d\n", chain->qName, start, end); verbose(4, "Mapped! Target:\t%s\t%d\t%d\t%c\tQuery:\t%s\t%d\t%d\t%c\n", chain->tName, subChain->tStart, subChain->tEnd, strand, chain->qName, subChain->qStart, subChain->qEnd, chain->qStrand); if (multiple && end - start < minSizeQ) { verbose(2,"dropping %s:%d-%d size %d (too small)\n", chain->qName, start, end, end - start); continue; } AllocVar(bed); bed->chrom = cloneString(chain->qName); bed->chromStart = start; bed->chromEnd = end; if (regionName) bed->name = cloneString(regionName); bed->strand[0] = strand; bed->strand[1] = 0; if (useThick) { - bed->thickStart = thickStart; - bed->thickEnd = thickEnd; + bed->thickStart = mappedThickStart; + bed->thickEnd = mappedThickEnd; } slAddHead(&bedList, bed); if (tStart < subChain->tStart) { /* unmapped portion of target */ AllocVar(bed); bed->chrom = cloneString(chain->tName); bed->chromStart = tStart; bed->chromEnd = subChain->tStart; if (regionName) bed->name = cloneString(regionName); slAddHead(&unmappedBedList, bed); } tStart = subChain->tEnd; chainFree(&toFree); @@ -368,87 +369,92 @@ wordCount = lineFileChopCharNext(lf, '\t', words, maxWord); else wordCount = lineFileChopNext(lf, words, maxWord); if (hasBin) { int i; wordCount--; for (i = 1; i <= wordCount; i++) words[i-1] = words[i]; } if (wordCount <= 0) return 0; return wordCount; } -static int bedOverSmallEnds(struct lineFile *lf, int fieldCount, +static int bedOverSmallEnds(struct lineFile *lf, struct hash *chainHash, double minMatch, int minSizeT, int minSizeQ, int minChainT, int minChainQ, - FILE *mapped, FILE *unmapped, bool multiple, + FILE *mapped, FILE *unmapped, bool multiple, bool noSerial, char *chainTable, int bedPlus, bool hasBin, bool tabSep, int ends, int *errCt) /* Do a bed without a block-list. * NOTE: it would be preferable to have all of the lift * functions work at the line level, rather than the file level. * Multiple option can be used with bed3 -- it will write a list of * regions as a bed4, where score is the "part #". This is used for * ENCODE region mapping */ { int i, wordCount, s, e; char *words[LIFTOVER_MAX_WORDS], *chrom; char strand = '.', strandString[2]; char *error, *error2 = NULL; int ct = 0; int errs = 0; struct bed *bedList = NULL, *unmappedBedList = NULL; /* result lists for -ends option */ struct bed *bedList2 = NULL, *unmappedBedList2 = NULL; int totalUnmapped = 0; double unmappedRatio; int totalUnmappedAll = 0; int totalBases = 0; double mappedRatio; char *region = NULL; /* region name from BED file-- used with -multiple */ +char regionBuf[2048]; char *db = NULL, *chainTableName = NULL; if (chainTable) { chainTableName = chopPrefix(chainTable); db = chainTable; chopSuffix(chainTable); } while ((wordCount = lineFileChopBin(lf, words, ArraySize(words), hasBin, tabSep)) != 0) { FILE *f = mapped; chrom = words[0]; s = lineFileNeedFullNum(lf, words, 1); e = lineFileNeedFullNum(lf, words, 2); bool useThick = FALSE; int thickStart = 0, thickEnd = 0; int afterS = s + ends; int beforeE = e - ends; bool doEnds = ((ends > 0) && (beforeE > afterS)); if (s > e) errAbort( "ERROR: start coordinate is after end coordinate (chromStart > chromEnd) on line %d of bed file %s\nERROR: %s %d %d", lf->lineIx, lf->fileName, chrom, s, e); if (multiple) { - if (wordCount < 4 || wordCount > 6) - errAbort("Can only lift BED4, BED5, BED6 to multiple regions"); + if (wordCount > 3) region = words[3]; + else + { + safef(regionBuf, sizeof(regionBuf), "%s:%d-%d", words[0], s+1, e); + region = regionBuf; + } } if (wordCount >= 6 && (bedPlus == 0 || bedPlus >= 6)) strand = words[5][0]; if (wordCount >= 8 && (bedPlus == 0 || bedPlus >= 8)) { useThick = TRUE; thickStart = lineFileNeedFullNum(lf, words, 6); thickEnd = lineFileNeedFullNum(lf, words, 7); } if (doEnds) { if (useThick) errAbort("Can only lift BED6 or lower with -ends option"); if (multiple) errAbort("Cannot use -multiple with -ends"); @@ -505,55 +511,58 @@ fprintf(f, "\n"); errs++; } else if (error == NULL) { /* successfully mapped */ int ix = 1; struct bed *bed, *next = bedList->next; for (bed = bedList; bed != NULL; bed = next) { if (hasBin) fprintf(f, "%d\t", binFromRange(bed->chromStart, bed->chromEnd)); fprintf(f, "%s\t%d\t%d", bed->chrom, bed->chromStart, bed->chromEnd); - if (multiple) + if (wordCount < 4 && multiple) { - /* region name and part number */ - fprintf(f, "\t%s\t%d", region, ix++); - if (wordCount == 6) - fprintf(f, "\t%c", bed->strand[0]); + fprintf(f, "\t%s", region); + if (!noSerial) + fprintf(f, "\t%d", ix++); } - else - { for (i=3; i<wordCount; ++i) { if (i == 5 && (bedPlus == 0 || bedPlus >= 6)) /* get strand from remap */ fprintf(f, "\t%c", bed->strand[0]); else if (i == 6 && useThick) /* get thickStart from remap */ fprintf(f, "\t%d", bed->thickStart); else if (i == 7 && useThick) /* get thickEnd from remap */ fprintf(f, "\t%d", bed->thickEnd); + else if (i == 3 && multiple && !noSerial) + { + fprintf(f, "\t%s\t%d", region, ix++); + // Skip the score field if there is one + if (bedPlus == 0 || bedPlus > 4) + i++; + } else /* everything else just passed through */ fprintf(f, "\t%s", words[i]); } - } fprintf(f, "\n"); next = bed->next; bedFree(&bed); } /* track how many successfully mapped */ ct++; totalUnmapped = 0; for (bed = unmappedBedList; bed != NULL; bed = bed->next) { int size = bed->chromEnd - bed->chromStart; totalUnmapped += size; verbose(2, "Unmapped: %s:%d-%d (size %d) %s\n", bed->chrom, bed->chromStart, bed->chromEnd, size, bed->name); @@ -574,48 +583,30 @@ fprintf(f, "#%s\n", error); fprintf(f, "%s\t%d\t%d", chrom, s, e); for (i=3; i<wordCount; ++i) fprintf(f, "\t%s", words[i]); fprintf(f, "\n"); errs++; } } if (errCt) *errCt = errs; mappedRatio = (totalBases - totalUnmappedAll)*100.0 / totalBases; verbose(2, "Mapped bases: \t%5.0f%%\n", mappedRatio); return ct; } -static int bedOverSmall(struct lineFile *lf, int fieldCount, - struct hash *chainHash, double minMatch, int minSizeT, - int minSizeQ, int minChainT, int minChainQ, - FILE *mapped, FILE *unmapped, bool multiple, - char *chainTable, int bedPlus, bool hasBin, - bool tabSep, int *errCt) -/* Do a bed without a block-list. - * NOTE: it would be preferable to have all of the lift - * functions work at the line level, rather than the file level. - * Multiple option can be used with bed3 -- it will write a list of - * regions as a bed4, where score is the "part #". This is used for - * ENCODE region mapping */ -{ -return bedOverSmallEnds(lf, fieldCount, chainHash, minMatch, - minSizeT, minSizeQ, minChainT, minChainQ, mapped, unmapped, - multiple, chainTable, bedPlus, hasBin, tabSep, 0, errCt); -} - static void shortGffLine(struct lineFile *lf) /* Complain about short line in GFF and abort. */ { errAbort("Expecting at least 8 words line %d of %s", lf->lineIx, lf->fileName); } static int gffNeedNum(struct lineFile *lf, char *s) /* Convert s to an integer or die trying. */ { char c = *s; if (isdigit(c) || c == '-') return atoi(s); else errAbort("Expecting number line %d of %s", lf->lineIx, lf->fileName); return 0; @@ -996,245 +987,309 @@ for (i=0; i<psl->blockCount; ++i) total += psl->blockSizes[i]; return total; } static struct liftRange *reverseRangeList(struct liftRange *rangeList, int chromSize) /* Return reverse-complemented rangeList. */ { struct liftRange *range; slReverse(&rangeList); for (range = rangeList; range != NULL; range = range->next) reverseIntRange(&range->start, &range->end, chromSize); return rangeList; } -static char *remapBlockedBed(struct hash *chainHash, struct bed *bed, - double minMatch, double minBlocks, bool fudgeThick) -/* Remap blocks in bed, and also chromStart/chromEnd. - * Return NULL on success, an error string on failure. */ -{ -struct chain *chainList = NULL, *chain; -int bedSize = sumBedBlocks(bed); -struct binElement *binList; -struct binElement *el; -struct liftRange *rangeList, *badRanges = NULL, *range; -char *error = NULL; -int i, start, end = 0; -int thickStart = bed->thickStart; -int thickEnd = bed->thickEnd; - -binList = findRange(chainHash, bed->chrom, bed->chromStart, bed->chromEnd); -if (binList == NULL) - return "Deleted in new"; - -/* Convert bed blocks to range list. */ -rangeList = bedToRangeList(bed); - -/* Evaluate all intersecting chains and sort so best is on top. */ -for (el = binList; el != NULL; el = el->next) +static void remapOneBlockedBed(struct chain *chain, struct bed *bed, int bedSize, + double minMatch, double minBlocks, boolean fudgeThick, + struct liftRange **pRangeList, char **retError) +/* If there is an error, set retError; otherwise, modify bed to contain coordinates mapped + * through chain. This nulls out *pRangelist after modifying and freeing the contents. */ { - chain = el->val; - chain->score = chainRangeIntersection(chain, rangeList); - slAddHead(&chainList, chain); - } -slSort(&chainList, chainCmpScore); - -/* See if duplicated. */ -chain = chainList->next; -if (chain != NULL && chain->score == chainList->score) - error = "Duplicated in new"; -chain = chainList; - +*retError = NULL; /* See if best one is good enough. */ if (chain->score < minMatch * bedSize) - error = "Partially deleted in new"; + *retError = "Partially deleted in new"; - -/* Call subroutine to remap range list. */ -if (error == NULL) - { - remapRangeList(chain, &rangeList, &thickStart, &thickEnd, - minBlocks, fudgeThick, - &rangeList, &badRanges, &error); - } +struct liftRange *rangeList = *pRangeList, *badRanges = NULL, *range; +int thickStart = bed->thickStart; +int thickEnd = bed->thickEnd; +if (*retError == NULL) + remapRangeList(chain, &rangeList, &thickStart, &thickEnd, minBlocks, fudgeThick, + &rangeList, &badRanges, retError); /* Convert rangeList back to bed blocks. Also calculate start and end. */ -if (error == NULL) +if (*retError == NULL) { + int i, start, end = 0; if (chain->qStrand == '-') { rangeList = reverseRangeList(rangeList, chain->qSize); reverseIntRange(&thickStart, &thickEnd, chain->qSize); bed->strand[0] = otherStrand(bed->strand[0]); } bed->chromStart = start = rangeList->start; bed->blockCount = slCount(rangeList); for (i=0, range = rangeList; range != NULL; range = range->next, ++i) { end = range->end; bed->blockSizes[i] = end - range->start; bed->chromStarts[i] = range->start - start; } if (!sameString(chain->qName, chain->tName)) { freeMem(bed->chrom); bed->chrom = cloneString(chain->qName); } bed->chromEnd = end; bed->thickStart = thickStart; bed->thickEnd = thickEnd; } slFreeList(&rangeList); slFreeList(&badRanges); +*pRangeList = NULL; +} + +static char *remapBlockedBed(struct hash *chainHash, struct bed *bed, + double minMatch, double minBlocks, bool fudgeThick, + bool multiple, char *db, char *chainTable) +/* Remap blocks in bed, and also chromStart/chromEnd. If multiple, then bed->next may be + * changed to point to additional newly allocated mapped beds, and bed's pointer members may + * be free'd so be sure to pass in a properly allocated bed. + * Return NULL on success, an error string on failure. */ +{ +char *error = NULL; + +struct binElement *binList = findRange(chainHash, bed->chrom, bed->chromStart, bed->chromEnd); +if (binList == NULL) + return "Deleted in new"; + +/* Convert bed blocks to range list. */ +struct liftRange *rangeList = bedToRangeList(bed); + +/* Evaluate all intersecting chains and sort so best is on top. */ +struct chain *chainList = NULL, *chain; +struct binElement *el; +for (el = binList; el != NULL; el = el->next) + { + chain = el->val; + chain->score = chainRangeIntersection(chain, rangeList); + slAddHead(&chainList, chain); + } +slSort(&chainList, chainCmpScore); + +/* See if duplicated. */ +chain = chainList->next; +if (chain != NULL && chain->score == chainList->score && !multiple) + error = "Duplicated in new"; +chain = chainList; +if (db) + { + /* use full chain, not the possibly truncated chain from the net */ + chain = chainLoadIdRange(db, chainTable, bed->chrom, bed->chromStart, bed->chromEnd, chain->id); + } +// bed will be overwritten, so if we're mapping through multiple chains we need to save a backup: +struct bed *bedCopy = multiple ? cloneBed(bed) : NULL; +int bedSize = sumBedBlocks(bed); +remapOneBlockedBed(chain, bed, bedSize, minMatch, minBlocks, fudgeThick, &rangeList, &error); + +if (multiple) + { + struct bed *bedList = NULL; + // Repeat for other chains + for (chain = chainList->next; chain != NULL; chain = chain->next) + { + // Make a new bed to be mapped from the original coordinates + struct bed *newBed = cloneBed(bedCopy); + rangeList = bedToRangeList(newBed); + char *newError = NULL; + remapOneBlockedBed(chain, newBed, bedSize, minMatch, minBlocks, fudgeThick, &rangeList, + &newError); + if (newError) + bedFree(&newBed); + else + slAddHead(&bedList, newBed); + } + if (bedList != NULL) + { + slReverse(&bedList); + bed->next = bedList; + } + if (error && bed->next) + { + // The first chain gave an error; replace the first bed with the first successfully + // mapped bed. + struct bed *splicedBed = bed->next; + freez(&bed->chrom); + freez(&bed->name); + freez(&bed->blockSizes); + freez(&bed->chromStarts); + freez(&bed->expIds); + freez(&bed->expScores); + freez(&bed->label); + memcpy(bed, splicedBed, sizeof(struct bed)); + freez(&splicedBed); + error = NULL; + } + } +bedFree(&bedCopy); slFreeList(&binList); return error; } static int bedOverBig(struct lineFile *lf, int refCount, struct hash *chainHash, double minMatch, double minBlocks, - bool fudgeThick, FILE *mapped, FILE *unmapped, int bedPlus, - bool hasBin, bool tabSep, int *errCt) + bool fudgeThick, FILE *mapped, FILE *unmapped, bool multiple, char *chainTable, + int bedPlus, bool hasBin, bool tabSep, int *errCt) /* Do a bed with block-list. */ { int wordCount, bedCount; char *line, *words[LIFTOVER_MAX_WORDS]; char *whyNot = NULL; int ct = 0; int errs = 0; int i; +char *db = NULL, *chainTableName = NULL; +if (chainTable) + { + chainTableName = chopPrefix(chainTable); + db = chainTable; + chopSuffix(chainTable); + } while (lineFileNextReal(lf, &line)) { struct bed *bed; wordCount = chopLineBin(line, words, ArraySize(words), hasBin, tabSep); if (refCount != wordCount) lineFileExpectWords(lf, refCount, wordCount); if (wordCount == bedPlus) bedPlus = 0; /* no extra fields */ bedCount = (bedPlus ? bedPlus : wordCount); bed = bedLoadN(words, bedCount); - whyNot = remapBlockedBed(chainHash, bed, minMatch, minBlocks, fudgeThick); + whyNot = remapBlockedBed(chainHash, bed, minMatch, minBlocks, fudgeThick, + multiple, db, chainTableName); if (whyNot == NULL) { + struct bed *bedList = bed; + for (; bed != NULL; bed = bed->next) + { if (hasBin) fprintf(mapped, "%d\t", binFromRange(bed->chromStart, bed->chromEnd)); bedOutputN(bed, bedCount, mapped, '\t', (bedCount != wordCount) ? '\t':'\n'); /* print extra "non-bed" fields in line */ for (i = bedCount; i < wordCount; i++) fprintf(mapped, "%s%c", words[i], (i == wordCount-1) ? '\n':'\t'); + } + bedFreeList(&bedList); ct++; } else { fprintf(unmapped, "#%s\n", whyNot); bedOutputN(bed, bedCount, unmapped, '\t', (bedCount != wordCount) ? '\t':'\n'); /* print extra "non-bed" fields in line */ for (i = bedCount; i < wordCount; i++) fprintf(unmapped, "%s%c", words[i], (i == wordCount-1) ? '\n':'\t'); errs++; } bedFree(&bed); } if (errCt) *errCt = errs; return ct; } int liftOverBedPlusEnds(char *fileName, struct hash *chainHash, double minMatch, double minBlocks, int minSizeT, int minSizeQ, int minChainT, int minChainQ, bool fudgeThick, FILE *f, FILE *unmapped, - bool multiple, char *chainTable, int bedPlus, bool hasBin, + bool multiple, bool noSerial, char *chainTable, int bedPlus, bool hasBin, bool tabSep, int ends, int *errCt) /* Lift bed N+ file. * Return the number of records successfully converted */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); int wordCount; int bedFieldCount = bedPlus; char *line; char *words[LIFTOVER_MAX_WORDS]; int ct = 0; if (lineFileNextReal(lf, &line)) { line = cloneString(line); if (tabSep) wordCount = chopByChar(line, '\t', words, ArraySize(words)); else wordCount = chopLine(line, words); if (hasBin) wordCount--; lineFileReuse(lf); freez(&line); if (wordCount < 3) errAbort("Data format error: expecting at least 3 fields in BED file (%s)", fileName); if (bedFieldCount == 0) bedFieldCount = wordCount; if (bedFieldCount <= 10) { - if (ends) - ct = bedOverSmallEnds(lf, wordCount, chainHash, minMatch, - minSizeT, minSizeQ, minChainT, minChainQ, f, unmapped, - multiple, chainTable, bedPlus, hasBin, tabSep, ends, errCt); - else - ct = bedOverSmall(lf, wordCount, chainHash, minMatch, + ct = bedOverSmallEnds(lf, chainHash, minMatch, minSizeT, minSizeQ, minChainT, minChainQ, f, unmapped, - multiple, chainTable, bedPlus, hasBin, tabSep, errCt); + multiple, noSerial, chainTable, bedPlus, hasBin, tabSep, ends, errCt); } else if (ends) errAbort("Cannot use -ends with blocked BED\n"); else ct = bedOverBig(lf, wordCount, chainHash, minMatch, minBlocks, - fudgeThick, f, unmapped, bedPlus, hasBin, tabSep, errCt); + fudgeThick, f, unmapped, multiple, chainTable, + bedPlus, hasBin, tabSep, errCt); } lineFileClose(&lf); return ct; } int liftOverBedPlus(char *fileName, struct hash *chainHash, double minMatch, double minBlocks, int minSizeT, int minSizeQ, int minChainT, int minChainQ, bool fudgeThick, FILE *f, FILE *unmapped, - bool multiple, char *chainTable, int bedPlus, bool hasBin, + bool multiple, bool noSerial, char *chainTable, int bedPlus, bool hasBin, bool tabSep, int *errCt) /* Lift bed N+ file. * Return the number of records successfully converted */ { return liftOverBedPlusEnds(fileName, chainHash, minMatch, minBlocks, minSizeT, minSizeQ, minChainT, minChainQ, - fudgeThick, f, unmapped, multiple, chainTable, + fudgeThick, f, unmapped, multiple, noSerial, chainTable, bedPlus, hasBin, tabSep, 0, errCt); } int liftOverBed(char *fileName, struct hash *chainHash, double minMatch, double minBlocks, int minSizeT, int minSizeQ, int minChainT, int minChainQ, bool fudgeThick, FILE *f, FILE *unmapped, - bool multiple, char *chainTable, int *errCt) + bool multiple, bool noSerial, char *chainTable, int *errCt) /* Open up file, decide what type of bed it is, and lift it. * Return the number of records successfully converted */ { return liftOverBedPlus(fileName, chainHash, minMatch, minBlocks, minSizeT, minSizeQ, minChainT, minChainQ, - fudgeThick, f, unmapped, multiple, chainTable, + fudgeThick, f, unmapped, multiple, noSerial, chainTable, 0, FALSE, FALSE, errCt); } #define LIFTOVER_FILE_PREFIX "liftOver" #define BEDSTART_TO_POSITION(coord) (coord+1) int liftOverPositions(char *fileName, struct hash *chainHash, double minMatch, double minBlocks, int minSizeT, int minSizeQ, int minChainT, int minChainQ, bool fudgeThick, FILE *mapped, FILE *unmapped, bool multiple, char *chainTable, int *errCt) /* Create bed file from positions (chrom:start-end) and lift. * Then convert back to positions. (TODO: line-by-line instead of by file) * Return the number of records successfully converted */ @@ -1265,32 +1320,33 @@ /* let the bed parser worry about it */ // line = trimSpaces(line); fprintf(bedFile, "%s\n", line); } freez(&line); } carefulClose(&bedFile); chmod(bedTn.forCgi, 0666); lineFileClose(&lf); /* Set up temp bed files for output, and lift to those */ makeTempName(&mappedBedTn, LIFTOVER_FILE_PREFIX, ".bedmapped"); makeTempName(&unmappedBedTn, LIFTOVER_FILE_PREFIX, ".bedunmapped"); mappedBed = mustOpen(mappedBedTn.forCgi, "w"); unmappedBed = mustOpen(unmappedBedTn.forCgi, "w"); -ct = liftOverBed(bedTn.forCgi, chainHash, minMatch, minBlocks, minSizeT, minSizeQ, minChainT, minChainQ, fudgeThick, - mappedBed, unmappedBed, multiple, chainTable, errCt); +ct = liftOverBed(bedTn.forCgi, chainHash, minMatch, minBlocks, + minSizeT, minSizeQ, minChainT, minChainQ, fudgeThick, + mappedBed, unmappedBed, multiple, TRUE, chainTable, errCt); carefulClose(&mappedBed); chmod(mappedBedTn.forCgi, 0666); carefulClose(&unmappedBed); chmod(unmappedBedTn.forCgi, 0666); lineFileClose(&lf); /* Convert output files back to positions */ lf = lineFileOpen(mappedBedTn.forCgi, TRUE); while ((wordCount = lineFileChop(lf, words)) != 0) { chrom = words[0]; start = lineFileNeedNum(lf, words, 1); end = lineFileNeedNum(lf, words, 2); fprintf(mapped, "%s:%d-%d\n", chrom, BEDSTART_TO_POSITION(start), end); } @@ -1350,43 +1406,43 @@ end = words[2]; } if (!isdigit(start[0]) || !isdigit(end[0])) return none; lineFileClose(&lf); if (isPosition) return positions; return bed; } int liftOverBedOrPositions(char *fileName, struct hash *chainHash, double minMatch, double minBlocks, int minSizeT, int minSizeQ, int minChainT, int minChainQ, bool fudgeThick, FILE *mapped, FILE *unmapped, - bool multiple, char *chainTable, int *errCt) + bool multiple, bool noSerial, char *chainTable, int *errCt) /* Sniff the first line of the file, and determine whether it's a */ /* bed, a positions file, or neither. */ { enum liftOverFileType lft = liftOverSniff(fileName); if (lft == positions) return liftOverPositions(fileName, chainHash, minMatch, minBlocks, minSizeT, minSizeQ, minChainT, minChainQ, fudgeThick, mapped, unmapped, multiple, chainTable, errCt); if (lft == bed) return liftOverBed(fileName, chainHash, minMatch, minBlocks, minSizeT, minSizeQ, minChainT, minChainQ, fudgeThick, mapped, unmapped, - multiple, chainTable, errCt); + multiple, noSerial, chainTable, errCt); return -1; } static char *remapBlockedPsl(struct hash *chainHash, struct psl *psl, double minMatch, double minBlocks, bool fudgeThick) /* Remap blocks in psl, and also chromStart/chromEnd. * Return NULL on success, an error string on failure. */ { struct chain *chainList = NULL, *chain; int pslSize = sumPslBlocks(psl); struct binElement *binList; struct binElement *el; struct liftRange *rangeList, *badRanges = NULL, *range; char *error = NULL; int i, start, end = 0; @@ -1516,155 +1572,76 @@ for (i=0; i<count; ++i) { int s = gp->exonStarts[i]; int e = gp->exonEnds[i]; bed->blockSizes[i] = e - s; bed->chromStarts[i] = s - start; } return bed; } void liftOverGenePred(char *fileName, struct hash *chainHash, double minMatch, double minBlocks, bool fudgeThick, FILE *mapped, FILE *unmapped) /* Lift over file in genePred format. */ { +//#*** TODO make multiple an argument; could also add db and chainTable args. +bool multiple = FALSE; +char *db = NULL, *chainTable = NULL; + struct bed *bed; struct genePred *gp = NULL; char *error; FILE *f; struct genePred *gpList = genePredExtLoadAll(fileName); for (gp = gpList ; gp != NULL ; gp = gp->next) { // uglyf("%s %s %d %d %s\n", gp->name, gp->chrom, gp->txStart, gp->txEnd, gp->strand); f = mapped; bed = genePredToBed(gp); - error = remapBlockedBed(chainHash, bed, minMatch, minBlocks, fudgeThick); + error = remapBlockedBed(chainHash, bed, minMatch, minBlocks, fudgeThick, + multiple, db, chainTable); if (error) { f = unmapped; fprintf(unmapped, "# %s\n", error); } else { int count, i, start; freeMem(gp->chrom); gp->chrom = cloneString(bed->chrom); gp->txStart = start = bed->chromStart; gp->txEnd = bed->chromEnd; gp->strand[0] = bed->strand[0]; gp->cdsStart = bed->thickStart; gp->cdsEnd = bed->thickEnd; gp->exonCount = count = bed->blockCount; for (i=0; i<count; ++i) { int s = start + bed->chromStarts[i]; int e = s + bed->blockSizes[i]; gp->exonStarts[i] = s; gp->exonEnds[i] = e; } } genePredTabOut(gp, f); bedFree(&bed); // genePredFree(&gp); } } -#ifdef example -static char *remapBlockedBed(struct hash *chainHash, struct bed *bed) -/* Remap blocks in bed, and also chromStart/chromEnd. - * Return NULL on success, an error string on failure. */ -{ -struct chain *chainList = NULL, *chain, *subChain; -int bedSize = sumBedBlocks(bed); -struct binElement *binList, *el; -struct liftRange *rangeList, *badRanges = NULL, *range; -char *error = NULL; -int i, start, end = 0; -int thickStart = bed->thickStart; -int thickEnd = bed->thickEnd; - -binList = findRange(chainHash, bed->chrom, bed->chromStart, bed->chromEnd); -if (binList == NULL) - return "Deleted in new"; - -/* Convert bed blocks to range list. */ -rangeList = bedToRangeList(bed); - -/* Evaluate all intersecting chains and sort so best is on top. */ -for (el = binList; el != NULL; el = el->next) - { - chain = el->val; - chain->score = chainRangeIntersection(chain, rangeList); - slAddHead(&chainList, chain); - } -slSort(&chainList, chainCmpScore); - -/* See if duplicated. */ -chain = chainList->next; -if (chain != NULL && chain->score == chainList->score) - error = "Duplicated in new"; -chain = chainList; - -/* See if best one is good enough. */ -if (chain->score < minMatch * bedSize) - error = "Partially deleted in new"; - - -/* Call subroutine to remap range list. */ -if (error == NULL) - { - remapRangeList(chain, &rangeList, &thickStart, &thickEnd, minBlocks, - &rangeList, &badRanges, &error); - } - -/* Convert rangeList back to bed blocks. Also calculate start and end. */ -if (error == NULL) - { - if (chain->qStrand == '-') - { - struct liftRange *range; - slReverse(&rangeList); - for (range = rangeList; range != NULL; range = range->next) - reverseIntRange(&range->start, &range->end, chain->qSize); - reverseIntRange(&thickStart, &thickEnd, chain->qSize); - bed->strand[0] = otherStrand(bed->strand[0]); - } - bed->chromStart = start = rangeList->start; - bed->blockCount = slCount(rangeList); - for (i=0, range = rangeList; range != NULL; range = range->next, ++i) - { - end = range->end; - bed->blockSizes[i] = end - range->start; - bed->chromStarts[i] = range->start - start; - } - if (!sameString(chain->qName, chain->tName)) - { - freeMem(bed->chrom); - bed->chrom = cloneString(chain->qName); - } - bed->chromEnd = end; - bed->thickStart = thickStart; - bed->thickEnd = thickEnd; - } -slFreeList(&rangeList); -slFreeList(&badRanges); -slFreeList(&binList); -return error; -} -#endif - static struct liftRange *sampleToRangeList(struct sample *sample, int sizeOne) /* Make a range list corresponding to sample. */ { int i; struct liftRange *rangeList = NULL, *range; for (i=0; i<sample->sampleCount; ++i) { AllocVar(range); range->start = range->end = sample->chromStart + sample->samplePosition[i]; range->end += sizeOne; range->val = sample->sampleHeight[i]; slAddHead(&rangeList, range); } slReverse(&rangeList); return rangeList;