b224dbbd105b929a99841a7dd11526cc0b1a5422 aamp Fri Sep 14 11:20:42 2012 -0700 Changed liftOver to allow big-sized items by just lifting the ends. Needed library changes as well. diff --git src/hg/lib/liftOver.c src/hg/lib/liftOver.c index d9444ea..7e20e1c 100644 --- src/hg/lib/liftOver.c +++ src/hg/lib/liftOver.c @@ -260,31 +260,30 @@ } AllocVar(bed); bed->chrom = cloneString(chain->qName); bed->chromStart = start; bed->chromEnd = end; if (regionName) bed->name = cloneString(regionName); bed->strand[0] = strand; bed->strand[1] = 0; if (useThick) { bed->thickStart = thickStart; bed->thickEnd = thickEnd; } slAddHead(&bedList, bed); - if (tStart < subChain->tStart) { /* unmapped portion of target */ AllocVar(bed); bed->chrom = cloneString(chain->tName); bed->chromStart = tStart; bed->chromEnd = subChain->tStart; if (regionName) bed->name = cloneString(regionName); slAddHead(&unmappedBedList, bed); } tStart = subChain->tEnd; chainFree(&toFree); } slReverse(&bedList); @@ -355,96 +354,156 @@ wordCount = lineFileChopCharNext(lf, '\t', words, maxWord); else wordCount = lineFileChopNext(lf, words, maxWord); if (hasBin) { int i; wordCount--; for (i = 1; i <= wordCount; i++) words[i-1] = words[i]; } if (wordCount <= 0) return 0; return wordCount; } -static int bedOverSmall(struct lineFile *lf, int fieldCount, +static int bedOverSmallEnds(struct lineFile *lf, int fieldCount, struct hash *chainHash, double minMatch, int minSizeT, int minSizeQ, int minChainT, int minChainQ, FILE *mapped, FILE *unmapped, bool multiple, char *chainTable, int bedPlus, bool hasBin, - bool tabSep, int *errCt) + bool tabSep, int ends, int *errCt) /* Do a bed without a block-list. * NOTE: it would be preferable to have all of the lift * functions work at the line level, rather than the file level. * Multiple option can be used with bed3 -- it will write a list of * regions as a bed4, where score is the "part #". This is used for * ENCODE region mapping */ { int i, wordCount, s, e; char *words[LIFTOVER_MAX_WORDS], *chrom; char strand = '.', strandString[2]; -char *error; +char *error, *error2 = NULL; int ct = 0; int errs = 0; struct bed *bedList = NULL, *unmappedBedList = NULL; +/* result lists for -ends option */ +struct bed *bedList2 = NULL, *unmappedBedList2 = NULL; int totalUnmapped = 0; double unmappedRatio; int totalUnmappedAll = 0; int totalBases = 0; double mappedRatio; char *region = NULL; /* region name from BED file-- used with -multiple */ char *db = NULL, *chainTableName = NULL; if (chainTable) { chainTableName = chopPrefix(chainTable); db = chainTable; chopSuffix(chainTable); } while ((wordCount = lineFileChopBin(lf, words, ArraySize(words), hasBin, tabSep)) != 0) { FILE *f = mapped; chrom = words[0]; s = lineFileNeedFullNum(lf, words, 1); e = lineFileNeedFullNum(lf, words, 2); bool useThick = FALSE; int thickStart = 0, thickEnd = 0; + int afterS = s + ends; + int beforeE = e - ends; + bool doEnds = ((ends > 0) && (beforeE > afterS)); if (s > e) errAbort( "ERROR: start coordinate is after end coordinate (chromStart > chromEnd) on line %d of bed file %s\nERROR: %s %d %d", lf->lineIx, lf->fileName, chrom, s, e); if (multiple) { if (wordCount < 4 || wordCount > 6) errAbort("Can only lift BED4, BED5, BED6 to multiple regions"); region = words[3]; } if (wordCount >= 6 && (bedPlus == 0 || bedPlus >= 6)) strand = words[5][0]; if (wordCount >= 8 && (bedPlus == 0 || bedPlus >= 8)) { useThick = TRUE; thickStart = lineFileNeedFullNum(lf, words, 6); thickEnd = lineFileNeedFullNum(lf, words, 7); } + if (doEnds) + { + if (useThick) + errAbort("Can only lift BED6 or lower with -ends option"); + if (multiple) + errAbort("Cannot use -multiple with -ends"); + error = remapRange(chainHash, minMatch, minSizeT, minSizeQ, minChainT, + minChainQ, chrom, s, afterS, strand, + thickStart, thickEnd, useThick, minMatch, + region, db, chainTableName, &bedList, &unmappedBedList); + error2 = remapRange(chainHash, minMatch, minSizeT, minSizeQ, minChainT, + minChainQ, chrom, beforeE, e, strand, + thickStart, thickEnd, useThick, minMatch, + region, db, chainTableName, &bedList2, &unmappedBedList2); + } + else error = remapRange(chainHash, minMatch, minSizeT, minSizeQ, minChainT, minChainQ, chrom, s, e, strand, thickStart, thickEnd, useThick, minMatch, region, db, chainTableName, &bedList, &unmappedBedList); - if (error == NULL) + if (doEnds && !error && !error2 && bedList && bedList2 && (slCount(bedList) == 1) && (slCount(bedList2) == 1) + && sameString(bedList->chrom, bedList2->chrom) && (bedList->chromStart < bedList2->chromEnd) + && (((wordCount >= 6) && ((bedPlus == 0) || (bedPlus >= 6)) && (bedList->strand[0] == bedList2->strand[0])) || (wordCount < 6) || (bedPlus < 6))) + { + /* really the most restrictive in terms of mapping */ + if (hasBin) + fprintf(f, "%d\t", binFromRange(bedList->chromStart, bedList2->chromEnd)); + fprintf(f, "%s\t%d\t%d", bedList->chrom, bedList->chromStart, bedList2->chromEnd); + for (i=3; i= 6)) + /* get strand from remap */ + fprintf(f, "\t%c", bedList->strand[0]); + else + /* everything else just passed through */ + fprintf(f, "\t%s", words[i]); + } + fprintf(f, "\n"); + ct++; + } + else if (doEnds) + { + /* error like below */ + f = unmapped; + strandString[0] = strand; + strandString[1] = 0; + words[5] = strandString; + if (error || error2) + fprintf(f, "#%s on one or both ends\n", (error) ? error : error2); + else if (!bedList || !bedList2 || (slCount(bedList) > 1) || (slCount(bedList2) > 1) + || !sameString(bedList->chrom, bedList2->chrom) || (bedList->chromStart >= bedList2->chromEnd) + || (((wordCount >= 6) && ((bedPlus == 0) || (bedPlus >= 6))) && (bedList->strand[0] != bedList2->strand[0]))) + fprintf(f, "#ends mapped differently or incompletely\n"); + fprintf(f, "%s\t%d\t%d", chrom, s, e); + for (i=3; inext; for (bed = bedList; bed != NULL; bed = next) { if (hasBin) fprintf(f, "%d\t", binFromRange(bed->chromStart, bed->chromEnd)); fprintf(f, "%s\t%d\t%d", bed->chrom, bed->chromStart, bed->chromEnd); if (multiple) { /* region name and part number */ fprintf(f, "\t%s\t%d", region, ix++); @@ -501,30 +560,48 @@ fprintf(f, "#%s\n", error); fprintf(f, "%s\t%d\t%d", chrom, s, e); for (i=3; ilineIx, lf->fileName); } static int gffNeedNum(struct lineFile *lf, char *s) /* Convert s to an integer or die trying. */ { char c = *s; if (isdigit(c) || c == '-') return atoi(s); else errAbort("Expecting number line %d of %s", lf->lineIx, lf->fileName); return 0; @@ -1041,72 +1118,97 @@ bedOutputN(bed, bedCount, unmapped, '\t', (bedCount != wordCount) ? '\t':'\n'); /* print extra "non-bed" fields in line */ for (i = bedCount; i < wordCount; i++) fprintf(unmapped, "%s%c", words[i], (i == wordCount-1) ? '\n':'\t'); errs++; } bedFree(&bed); } if (errCt) *errCt = errs; return ct; } -int liftOverBedPlus(char *fileName, struct hash *chainHash, double minMatch, + + +int liftOverBedPlusEnds(char *fileName, struct hash *chainHash, double minMatch, double minBlocks, int minSizeT, int minSizeQ, int minChainT, int minChainQ, bool fudgeThick, FILE *f, FILE *unmapped, bool multiple, char *chainTable, int bedPlus, bool hasBin, - bool tabSep, int *errCt) + bool tabSep, int ends, int *errCt) /* Lift bed N+ file. * Return the number of records successfully converted */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); int wordCount; int bedFieldCount = bedPlus; char *line; char *words[LIFTOVER_MAX_WORDS]; int ct = 0; if (lineFileNextReal(lf, &line)) { line = cloneString(line); if (tabSep) wordCount = chopByChar(line, '\t', words, ArraySize(words)); else wordCount = chopLine(line, words); if (hasBin) wordCount--; lineFileReuse(lf); freez(&line); if (wordCount < 3) errAbort("Data format error: expecting at least 3 fields in BED file (%s)", fileName); if (bedFieldCount == 0) bedFieldCount = wordCount; if (bedFieldCount <= 10) + { + if (ends) + ct = bedOverSmallEnds(lf, wordCount, chainHash, minMatch, + minSizeT, minSizeQ, minChainT, minChainQ, f, unmapped, + multiple, chainTable, bedPlus, hasBin, tabSep, ends, errCt); + else ct = bedOverSmall(lf, wordCount, chainHash, minMatch, minSizeT, minSizeQ, minChainT, minChainQ, f, unmapped, multiple, chainTable, bedPlus, hasBin, tabSep, errCt); + } + else if (ends) + errAbort("Cannot use -ends with blocked BED\n"); else ct = bedOverBig(lf, wordCount, chainHash, minMatch, minBlocks, fudgeThick, f, unmapped, bedPlus, hasBin, tabSep, errCt); } lineFileClose(&lf); return ct; } +int liftOverBedPlus(char *fileName, struct hash *chainHash, double minMatch, + double minBlocks, int minSizeT, int minSizeQ, int minChainT, + int minChainQ, bool fudgeThick, FILE *f, FILE *unmapped, + bool multiple, char *chainTable, int bedPlus, bool hasBin, + bool tabSep, int *errCt) +/* Lift bed N+ file. + * Return the number of records successfully converted */ +{ +return liftOverBedPlusEnds(fileName, chainHash, minMatch, minBlocks, + minSizeT, minSizeQ, minChainT, minChainQ, + fudgeThick, f, unmapped, multiple, chainTable, + 0, FALSE, FALSE, 0, errCt); +} + int liftOverBed(char *fileName, struct hash *chainHash, double minMatch, double minBlocks, int minSizeT, int minSizeQ, int minChainT, int minChainQ, bool fudgeThick, FILE *f, FILE *unmapped, bool multiple, char *chainTable, int *errCt) /* Open up file, decide what type of bed it is, and lift it. * Return the number of records successfully converted */ { return liftOverBedPlus(fileName, chainHash, minMatch, minBlocks, minSizeT, minSizeQ, minChainT, minChainQ, fudgeThick, f, unmapped, multiple, chainTable, 0, FALSE, FALSE, errCt); }