e875e085d0c976ca62b895bdfa5180a942a1854a aamp Thu May 2 14:01:47 2013 -0700 Rolled back some changes to liftOver's presence in the library. If it goes into the top-level maybe it should get some design changes as well. At the moment the design is a bit stretched. diff --git src/hg/lib/liftOver.c src/hg/lib/liftOver.c index d3c456b..95e2893 100644 --- src/hg/lib/liftOver.c +++ src/hg/lib/liftOver.c @@ -3,92 +3,309 @@ #include "linefile.h" #include "hash.h" #include "options.h" #include "binRange.h" #include "chain.h" #include "chainNetDbLoad.h" #include "bed.h" #include "genePred.h" #include "sample.h" #include "hdb.h" #include "liftOverChain.h" #include "liftOver.h" #include "portable.h" #include "obscure.h" + +struct chromMap +/* Remapping information for one (old) chromosome */ + { + char *name; /* Chromosome name. */ + struct binKeeper *bk; /* Keyed by old position, values are chains. */ + }; + static char otherStrand(char c) /* Swap +/- */ { if (c == '-') return '+'; else if (c == '+') return '-'; else return c; } // The maximum number of words per line that can be lifted: #define LIFTOVER_MAX_WORDS 64 -static struct binElement *findRange(struct hash *chainHash, char *chrom, int start, int end) -/* Find elements that intersect range. */ -{ -struct liftOverChromMap *map = hashFindVal(chainHash, chrom); -if (map == NULL) - return NULL; -return binKeeperFind(map->bk, start, end); -} - void readLiftOverMap(char *fileName, struct hash *chainHash) /* Read map file into hashes. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); struct chain *chain; -struct liftOverChromMap *map; +struct chromMap *map; int chainCount = 0; while ((chain = chainRead(lf)) != NULL) { if ((map = hashFindVal(chainHash, chain->tName)) == NULL) { AllocVar(map); map->bk = binKeeperNew(0, chain->tSize); hashAddSaveName(chainHash, chain->tName, map, &map->name); } binKeeperAdd(map->bk, chain->tStart, chain->tEnd, chain); ++chainCount; } } +static struct binElement *findRange(struct hash *chainHash, + char *chrom, int start, int end) +/* Find elements that intersect range. */ +{ +struct chromMap *map = hashFindVal(chainHash, chrom); +if (map == NULL) + return NULL; +return binKeeperFind(map->bk, start, end); +} + +static int chainAliSize(struct chain *chain) +/* Return size of all blocks in chain. */ +{ +struct cBlock *b; +int total = 0; +for (b = chain->blockList; b != NULL; b = b->next) + total += b->qEnd - b->qStart; +return total; +} + +static int aliIntersectSize(struct chain *chain, int tStart, int tEnd) +/* How many bases in chain intersect region from tStart to tEnd */ +{ +int total = 0, one; +struct cBlock *b; + +for (b = chain->blockList; b != NULL; b = b->next) + { + one = rangeIntersection(tStart, tEnd, b->tStart, b->tEnd); + if (one > 0) + total += one; + } +return total; +} + +static boolean mapThroughChain(struct chain *chain, double minRatio, + int *pStart, int *pEnd, struct chain **retSubChain, + struct chain **retChainToFree) +/* Map interval from start to end from target to query side of chain. + * Return FALSE if not possible, otherwise update *pStart, *pEnd. */ +{ +struct chain *subChain = NULL; +struct chain *freeChain = NULL; +int s = *pStart, e = *pEnd; +int oldSize = e - s; +int newCover = 0; +int ok = TRUE; + +chainSubsetOnT(chain, s, e, &subChain, &freeChain); +if (subChain == NULL) + { + *retSubChain = NULL; + *retChainToFree = NULL; + return FALSE; + } +newCover = chainAliSize(subChain); +if (newCover < oldSize * minRatio) + ok = FALSE; +else if (chain->qStrand == '+') + { + *pStart = subChain->qStart; + *pEnd = subChain->qEnd; + } +else + { + *pStart = subChain->qSize - subChain->qEnd; + *pEnd = subChain->qSize - subChain->qStart; + } +*retSubChain = subChain; +*retChainToFree = freeChain; +return ok; +} + static char *remapRange(struct hash *chainHash, double minRatio, int minSizeT, int minSizeQ, int minChainSizeT, int minChainSizeQ, char *chrom, int s, int e, char qStrand, int thickStart, int thickEnd, bool useThick, double minMatch, char *regionName, char *db, char *chainTableName, struct bed **bedRet, struct bed **unmappedBedRet) /* Remap a range through chain hash. If all is well return NULL * and results in a BED (or a list of BED's, if regionName is set (-multiple). * Otherwise return a string describing the problem. * note: minSizeT is currently NOT implemented. feel free to add it. * (its not e-s, its the boundaries of what maps of chrom:s-e to Q) */ { -return remapRangeCore(chainHash, minRatio, minSizeT, minSizeQ, minChainSizeT, minChainSizeQ, - chrom, s, e, qStrand, thickStart, thickEnd, useThick, minMatch, regionName, - db, chainTableName, chainLoadIdRange, bedRet, unmappedBedRet); +struct binElement *list = findRange(chainHash, chrom, s, e), *el; +struct chain *chainsHit = NULL, + *chainsPartial = NULL, + *chainsMissed = NULL, *chain; +struct bed *bedList = NULL, *unmappedBedList = NULL; +struct bed *bed = NULL; +char strand = qStrand; +/* initialize for single region case */ +int start = s, end = e; +double minMatchSize = minMatch * (end - start); +int intersectSize; +int tStart; +bool multiple = (regionName != NULL); + +verbose(2, "%s:%d-%d", chrom, s, e); +verbose(2, multiple ? "\t%s\n": "\n", regionName); +for (el = list; el != NULL; el = el->next) + { + chain = el->val; + if (multiple) + { + if (chain->qEnd - chain->qStart < minChainSizeQ || + chain->tEnd - chain->tStart < minChainSizeT) + continue; + /* limit required match to chain range on target */ + end = min(e, chain->tEnd); + start = max(s, chain->tStart); + minMatchSize = minMatch * (end - start); + } + intersectSize = aliIntersectSize(chain, start, end); + if (intersectSize >= minMatchSize) + slAddHead(&chainsHit, chain); + else if (intersectSize > 0) + { + slAddHead(&chainsPartial, chain); + } + else + { + /* shouldn't happen ? */ + slAddHead(&chainsMissed, chain); + } + } +slFreeList(&list); + +if (chainsHit == NULL) + { + if (chainsPartial == NULL) + return "Deleted in new"; + else if (chainsPartial->next == NULL) + return "Partially deleted in new"; + else + return "Split in new"; + } +else if (chainsHit->next != NULL && !multiple) + { + return "Duplicated in new"; + } +/* sort chains by position in target to order subregions by orthology */ +slSort(&chainsHit, chainCmpTarget); + +tStart = s; +struct chain *next = chainsHit->next; +for (chain = chainsHit; chain != NULL; chain = next) + { + struct chain *subChain = NULL; + struct chain *toFree = NULL; + int start=s, end=e; + next = chain->next; + verbose(3,"hit chain %s:%d %s:%d-%d %c (%d)\n", + chain->tName, chain->tStart, chain->qName, chain->qStart, chain->qEnd, + chain->qStrand, chain->id); + if (multiple) + { + /* no real need to verify ratio again (it would require + * adjusting coords again). */ + minRatio = 0; + if (db) + { + /* use full chain, not the possibly truncated chain + * from the net */ + struct chain *next = chain->next; + chain = chainLoadIdRange(db, chainTableName, + chrom, s, e, chain->id); + chain->next = next; + verbose(3,"chain from db %s:%d %s:%d-%d %c (%d)\n", + chain->tName, chain->tStart, chain->qName, + chain->qStart, chain->qEnd, chain->qStrand, chain->id); + } + } + if (!mapThroughChain(chain, minRatio, &start, &end, &subChain, &toFree)) + errAbort("Chain mapping error: %s:%d-%d\n", chain->qName, start, end); + if (chain->qStrand == '-') + strand = otherStrand(qStrand); + if (useThick) + { + struct chain *subChain2 = NULL; + struct chain *toFree2 = NULL; + if (!mapThroughChain(chain, minRatio, &thickStart, &thickEnd, + &subChain2, &toFree2)) + thickStart = thickEnd = start; + chainFree(&toFree2); + } + verbose(3, "mapped %s:%d-%d\n", chain->qName, start, end); + verbose(4, "Mapped! Target:\t%s\t%d\t%d\t%c\tQuery:\t%s\t%d\t%d\t%c\n", + chain->tName, subChain->tStart, subChain->tEnd, strand, + chain->qName, subChain->qStart, subChain->qEnd, chain->qStrand); + if (multiple && end - start < minSizeQ) + { + verbose(2,"dropping %s:%d-%d size %d (too small)\n", + chain->qName, start, end, end - start); + continue; + } + AllocVar(bed); + bed->chrom = cloneString(chain->qName); + bed->chromStart = start; + bed->chromEnd = end; + if (regionName) + bed->name = cloneString(regionName); + bed->strand[0] = strand; + bed->strand[1] = 0; + if (useThick) + { + bed->thickStart = thickStart; + bed->thickEnd = thickEnd; + } + slAddHead(&bedList, bed); + if (tStart < subChain->tStart) + { + /* unmapped portion of target */ + AllocVar(bed); + bed->chrom = cloneString(chain->tName); + bed->chromStart = tStart; + bed->chromEnd = subChain->tStart; + if (regionName) + bed->name = cloneString(regionName); + slAddHead(&unmappedBedList, bed); + } + tStart = subChain->tEnd; + chainFree(&toFree); + } +slReverse(&bedList); +*bedRet = bedList; +slReverse(&unmappedBedList); +if (unmappedBedRet) + *unmappedBedRet = unmappedBedList; +if (bedList==NULL) + return "Deleted in new"; +return NULL; } char *liftOverRemapRange(struct hash *chainHash, double minRatio, char *chrom, int s, int e, char strand, double minMatch, char **retChrom, int *retStart, int *retEnd, char *retStrand) /* Remap a range through chain hash. If all is well return NULL * and results in retChrom, retStart, retEnd. Otherwise * return a string describing the problem. */ { struct bed *bed; char *error; error = remapRange(chainHash, minRatio, 0, 0, 0, 0, chrom, s, e, strand, 0, 0, FALSE, minMatch, NULL, NULL, NULL, &bed, NULL);