99f94720f116c9b3056bd2b1c202949bf613f8cf
aamp
  Wed Apr 24 05:47:59 2013 -0700
teased out a few routines for a sans-MySQL version of liftOver stuff in the top-level lib.
diff --git src/hg/lib/liftOver.c src/hg/lib/liftOver.c
index 95e2893..d3c456b 100644
--- src/hg/lib/liftOver.c
+++ src/hg/lib/liftOver.c
@@ -3,309 +3,92 @@
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "binRange.h"
 #include "chain.h"
 #include "chainNetDbLoad.h"
 #include "bed.h"
 #include "genePred.h"
 #include "sample.h"
 #include "hdb.h"
 #include "liftOverChain.h"
 #include "liftOver.h"
 #include "portable.h"
 #include "obscure.h"
 
-
-struct chromMap
-/* Remapping information for one (old) chromosome */
-    {
-    char *name;                 /* Chromosome name. */
-    struct binKeeper *bk;       /* Keyed by old position, values are chains. */
-    };
-
 static char otherStrand(char c)
 /* Swap +/- */
 {
 if (c == '-')
     return '+';
 else if (c == '+')
     return '-';
 else
     return c;
 }
 
 // The maximum number of words per line that can be lifted:
 #define LIFTOVER_MAX_WORDS 64
 
+static struct binElement *findRange(struct hash *chainHash, char *chrom, int start, int end)
+/* Find elements that intersect range. */
+{
+struct liftOverChromMap *map = hashFindVal(chainHash, chrom);
+if (map == NULL)
+        return NULL;
+return binKeeperFind(map->bk, start, end);
+}
+
 void readLiftOverMap(char *fileName, struct hash *chainHash)
 /* Read map file into hashes. */
 {
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 struct chain *chain;
-struct chromMap *map;
+struct liftOverChromMap *map;
 int chainCount = 0;
 
 while ((chain = chainRead(lf)) != NULL)
     {
     if ((map = hashFindVal(chainHash, chain->tName)) == NULL)
 	{
 	AllocVar(map);
 	map->bk = binKeeperNew(0, chain->tSize);
 	hashAddSaveName(chainHash, chain->tName, map, &map->name);
 	}
     binKeeperAdd(map->bk, chain->tStart, chain->tEnd, chain);
     ++chainCount;
     }
 }
 
-static struct binElement *findRange(struct hash *chainHash, 
-                                char *chrom, int start, int end)
-/* Find elements that intersect range. */
-{
-struct chromMap *map = hashFindVal(chainHash, chrom);
-if (map == NULL)
-    return NULL;
-return binKeeperFind(map->bk, start, end);
-}
-
-static int chainAliSize(struct chain *chain)
-/* Return size of all blocks in chain. */
-{
-struct cBlock *b;
-int total = 0;
-for (b = chain->blockList; b != NULL; b = b->next)
-    total += b->qEnd - b->qStart;
-return total;
-}
-
-static int aliIntersectSize(struct chain *chain, int tStart, int tEnd)
-/* How many bases in chain intersect region from tStart to tEnd */
-{
-int total = 0, one;
-struct cBlock *b;
-
-for (b = chain->blockList; b != NULL; b = b->next)
-    {
-    one = rangeIntersection(tStart, tEnd, b->tStart, b->tEnd);
-    if (one > 0)
-	total += one;
-    }
-return total;
-}
-
-static boolean mapThroughChain(struct chain *chain, double minRatio, 
-	int *pStart, int *pEnd, struct chain **retSubChain,
-	struct chain **retChainToFree)
-/* Map interval from start to end from target to query side of chain.
- * Return FALSE if not possible, otherwise update *pStart, *pEnd. */
-{
-struct chain *subChain = NULL;
-struct chain *freeChain = NULL;
-int s = *pStart, e = *pEnd;
-int oldSize = e - s;
-int newCover = 0;
-int ok = TRUE;
-
-chainSubsetOnT(chain, s, e, &subChain, &freeChain);
-if (subChain == NULL)
-    {
-    *retSubChain = NULL;
-    *retChainToFree = NULL;
-    return FALSE;
-    }
-newCover = chainAliSize(subChain);
-if (newCover < oldSize * minRatio)
-    ok = FALSE;
-else if (chain->qStrand == '+')
-    {
-    *pStart = subChain->qStart;
-    *pEnd = subChain->qEnd;
-    }
-else
-    {
-    *pStart = subChain->qSize - subChain->qEnd;
-    *pEnd = subChain->qSize - subChain->qStart;
-    }
-*retSubChain = subChain;
-*retChainToFree = freeChain;
-return ok;
-}
-
 static char *remapRange(struct hash *chainHash, double minRatio, 
                         int minSizeT, int minSizeQ, 
                         int minChainSizeT, int minChainSizeQ, 
                         char *chrom, int s, int e, char qStrand,
 			int thickStart, int thickEnd, bool useThick,
 			double minMatch,
                         char *regionName, char *db, char *chainTableName,
                         struct bed **bedRet, struct bed **unmappedBedRet)
 /* Remap a range through chain hash.  If all is well return NULL
  * and results in a BED (or a list of BED's, if regionName is set (-multiple).
  * Otherwise return a string describing the problem. 
  * note: minSizeT is currently NOT implemented. feel free to add it.
  *       (its not e-s, its the boundaries of what maps of chrom:s-e to Q)
  */
 {
-struct binElement *list = findRange(chainHash, chrom, s, e), *el;
-struct chain *chainsHit = NULL, 
-                *chainsPartial = NULL, 
-                *chainsMissed = NULL, *chain;
-struct bed *bedList = NULL, *unmappedBedList = NULL;
-struct bed *bed = NULL;
-char strand = qStrand;
-/* initialize for single region case */
-int start = s, end = e;
-double minMatchSize = minMatch * (end - start);
-int intersectSize;
-int tStart;
-bool multiple = (regionName != NULL);
-
-verbose(2, "%s:%d-%d", chrom, s, e);
-verbose(2, multiple ? "\t%s\n": "\n", regionName);
-for (el = list; el != NULL; el = el->next)
-    {
-    chain = el->val;
-    if (multiple)
-        {
-        if (chain->qEnd - chain->qStart < minChainSizeQ ||
-            chain->tEnd - chain->tStart < minChainSizeT)
-                continue;
-        /* limit required match to chain range on target */
-        end = min(e, chain->tEnd);
-        start = max(s, chain->tStart);
-        minMatchSize = minMatch *  (end - start);
-        }
-    intersectSize = aliIntersectSize(chain, start, end);
-    if (intersectSize >= minMatchSize)
-	slAddHead(&chainsHit, chain);
-    else if (intersectSize > 0)
-	{
-	slAddHead(&chainsPartial, chain);
-	}
-    else
-	{
-        /* shouldn't happen ? */
-	slAddHead(&chainsMissed, chain);
-	}
-    }
-slFreeList(&list);
-
-if (chainsHit == NULL)
-    {
-    if (chainsPartial == NULL)
-	return "Deleted in new";
-    else if (chainsPartial->next == NULL)
-	return "Partially deleted in new";
-    else
-	return "Split in new";
-    }
-else if (chainsHit->next != NULL && !multiple)
-    {
-    return "Duplicated in new";
-    }
-/* sort chains by position in target to order subregions by orthology */
-slSort(&chainsHit, chainCmpTarget);
-
-tStart = s;
-struct chain *next = chainsHit->next;
-for (chain = chainsHit; chain != NULL; chain = next)
-    {
-    struct chain *subChain = NULL;
-    struct chain *toFree = NULL;
-    int start=s, end=e;
-    next = chain->next;
-    verbose(3,"hit chain %s:%d %s:%d-%d %c (%d)\n",
-        chain->tName, chain->tStart,  chain->qName, chain->qStart, chain->qEnd,
-        chain->qStrand, chain->id);
-    if (multiple)
-        {
-        /* no real need to verify ratio again (it would require
-         * adjusting coords again). */
-        minRatio = 0;
-        if (db)
-            {
-            /* use full chain, not the possibly truncated chain
-             * from the net */
-            struct chain *next = chain->next;
-            chain = chainLoadIdRange(db, chainTableName,
-                                        chrom, s, e, chain->id);
-            chain->next = next;
-            verbose(3,"chain from db %s:%d %s:%d-%d %c (%d)\n",
-                chain->tName, chain->tStart,  chain->qName, 
-                chain->qStart, chain->qEnd, chain->qStrand, chain->id);
-            }
-        }
-    if (!mapThroughChain(chain, minRatio, &start, &end, &subChain, &toFree))
-        errAbort("Chain mapping error: %s:%d-%d\n", chain->qName, start, end);
-    if (chain->qStrand == '-')
-	strand = otherStrand(qStrand);
-    if (useThick)
-	{
-	struct chain *subChain2 = NULL;
-	struct chain *toFree2 = NULL;
-	if (!mapThroughChain(chain, minRatio, &thickStart, &thickEnd,
-		&subChain2, &toFree2))
-	    thickStart = thickEnd = start;
-	chainFree(&toFree2);
-	}
-    verbose(3, "mapped %s:%d-%d\n", chain->qName, start, end);
-    verbose(4, "Mapped! Target:\t%s\t%d\t%d\t%c\tQuery:\t%s\t%d\t%d\t%c\n",
-	    chain->tName, subChain->tStart, subChain->tEnd, strand, 
-	    chain->qName, subChain->qStart, subChain->qEnd, chain->qStrand);
-    if (multiple && end - start < minSizeQ)
-        {
-        verbose(2,"dropping %s:%d-%d size %d (too small)\n", 
-                       chain->qName, start, end, end - start);
-        continue;
-        }
-    AllocVar(bed);
-    bed->chrom = cloneString(chain->qName);
-    bed->chromStart = start;
-    bed->chromEnd = end;
-    if (regionName)
-        bed->name = cloneString(regionName);
-    bed->strand[0] = strand;
-    bed->strand[1] = 0;
-    if (useThick)
-	{
-	bed->thickStart = thickStart;
-	bed->thickEnd = thickEnd;
-	}
-    slAddHead(&bedList, bed);
-    if (tStart < subChain->tStart)
-        {
-        /* unmapped portion of target */
-        AllocVar(bed);
-        bed->chrom = cloneString(chain->tName);
-        bed->chromStart = tStart;
-        bed->chromEnd = subChain->tStart;
-        if (regionName)
-            bed->name = cloneString(regionName);
-        slAddHead(&unmappedBedList, bed);
-        }
-    tStart = subChain->tEnd;
-    chainFree(&toFree);
-    }
-slReverse(&bedList);
-*bedRet = bedList;
-slReverse(&unmappedBedList);
-if (unmappedBedRet)
-    *unmappedBedRet = unmappedBedList;
-if (bedList==NULL)
-    return "Deleted in new";
-return NULL;
+return remapRangeCore(chainHash, minRatio, minSizeT, minSizeQ, minChainSizeT, minChainSizeQ, 
+		      chrom, s, e, qStrand, thickStart, thickEnd, useThick, minMatch, regionName, 
+		      db, chainTableName, chainLoadIdRange, bedRet, unmappedBedRet);
 }
 
 char *liftOverRemapRange(struct hash *chainHash, double minRatio,
                             char *chrom, int s, int e, char strand, 
                             double minMatch, char **retChrom, int *retStart, 
                             int *retEnd, char *retStrand)
 /* Remap a range through chain hash.  If all is well return NULL
  * and results in retChrom, retStart, retEnd.  Otherwise
  * return a string describing the problem. */
 {
 struct bed *bed;
 char *error;
 
 error = remapRange(chainHash, minRatio, 0, 0, 0, 0, chrom, s, e, strand, 0, 0,
 		   FALSE, minMatch, NULL, NULL, NULL, &bed, NULL);