b224dbbd105b929a99841a7dd11526cc0b1a5422
aamp
  Fri Sep 14 11:20:42 2012 -0700
Changed liftOver to allow big-sized items by just lifting the ends.  Needed library changes as well.
diff --git src/hg/lib/liftOver.c src/hg/lib/liftOver.c
index d9444ea..7e20e1c 100644
--- src/hg/lib/liftOver.c
+++ src/hg/lib/liftOver.c
@@ -260,31 +260,30 @@
         }
     AllocVar(bed);
     bed->chrom = cloneString(chain->qName);
     bed->chromStart = start;
     bed->chromEnd = end;
     if (regionName)
         bed->name = cloneString(regionName);
     bed->strand[0] = strand;
     bed->strand[1] = 0;
     if (useThick)
 	{
 	bed->thickStart = thickStart;
 	bed->thickEnd = thickEnd;
 	}
     slAddHead(&bedList, bed);
-
     if (tStart < subChain->tStart)
         {
         /* unmapped portion of target */
         AllocVar(bed);
         bed->chrom = cloneString(chain->tName);
         bed->chromStart = tStart;
         bed->chromEnd = subChain->tStart;
         if (regionName)
             bed->name = cloneString(regionName);
         slAddHead(&unmappedBedList, bed);
         }
     tStart = subChain->tEnd;
     chainFree(&toFree);
     }
 slReverse(&bedList);
@@ -355,96 +354,156 @@
     wordCount = lineFileChopCharNext(lf, '\t', words, maxWord);
 else
     wordCount = lineFileChopNext(lf, words, maxWord);
 if (hasBin)
     {
     int i;
     wordCount--;
     for (i = 1; i <= wordCount; i++)
         words[i-1] = words[i];
     }
 if (wordCount <= 0)
     return 0;
 return wordCount;
 }
 
-static int bedOverSmall(struct lineFile *lf, int fieldCount, 
+static int bedOverSmallEnds(struct lineFile *lf, int fieldCount, 
                         struct hash *chainHash, double minMatch, int minSizeT, 
                         int minSizeQ, int minChainT, int minChainQ, 
 			FILE *mapped, FILE *unmapped, bool multiple, 
                         char *chainTable, int bedPlus, bool hasBin, 
-                        bool tabSep, int *errCt)
+			bool tabSep, int ends, int *errCt)
 /* Do a bed without a block-list.
  * NOTE: it would be preferable to have all of the lift
  * functions work at the line level, rather than the file level.
  * Multiple option can be used with bed3 -- it will write a list of
  * regions as a bed4, where score is the "part #". This is used for
  * ENCODE region mapping */  
 {
 int i, wordCount, s, e;
 char *words[LIFTOVER_MAX_WORDS], *chrom;
 char strand = '.', strandString[2];
-char *error;
+char *error, *error2 = NULL;
 int ct = 0;
 int errs = 0;
 struct bed *bedList = NULL, *unmappedBedList = NULL;
+/* result lists for -ends option */
+struct bed *bedList2 = NULL, *unmappedBedList2 = NULL;
 int totalUnmapped = 0;
 double unmappedRatio;
 int totalUnmappedAll = 0;
 int totalBases = 0;
 double mappedRatio;
 char *region = NULL;   /* region name from BED file-- used with  -multiple */
 char *db = NULL, *chainTableName = NULL;
 
 if (chainTable)
     {
     chainTableName = chopPrefix(chainTable);
     db = chainTable;
     chopSuffix(chainTable);
     }
 while ((wordCount = 
             lineFileChopBin(lf, words, ArraySize(words), hasBin, tabSep)) != 0)
     {
     FILE *f = mapped;
     chrom = words[0];
     s = lineFileNeedFullNum(lf, words, 1);
     e = lineFileNeedFullNum(lf, words, 2);
     bool useThick = FALSE;
     int thickStart = 0, thickEnd = 0;
+    int afterS = s + ends;
+    int beforeE = e - ends;
+    bool doEnds = ((ends > 0) && (beforeE > afterS));
     if (s > e)
 	errAbort(
 	"ERROR: start coordinate is after end coordinate (chromStart > chromEnd) on line %d of bed file %s\nERROR: %s %d %d", 
 	    lf->lineIx, lf->fileName, chrom, s, e);
     if (multiple)
         {
         if (wordCount < 4 || wordCount > 6)
             errAbort("Can only lift BED4, BED5, BED6 to multiple regions");
         region = words[3];
         }
     if (wordCount >= 6 && (bedPlus == 0 || bedPlus >= 6))
 	strand = words[5][0];
     if (wordCount >= 8 && (bedPlus == 0 || bedPlus >= 8))
 	{
 	useThick = TRUE;
 	thickStart = lineFileNeedFullNum(lf, words, 6);
 	thickEnd = lineFileNeedFullNum(lf, words, 7);
 	}
+    if (doEnds)
+	{
+	if (useThick)
+	    errAbort("Can only lift BED6 or lower with -ends option");
+	if (multiple)
+	    errAbort("Cannot use -multiple with -ends");
+	error = remapRange(chainHash, minMatch, minSizeT, minSizeQ, minChainT, 
+		       minChainQ, chrom, s, afterS, strand,
+		       thickStart, thickEnd, useThick, minMatch, 
+		       region, db, chainTableName, &bedList, &unmappedBedList);	
+	error2 = remapRange(chainHash, minMatch, minSizeT, minSizeQ, minChainT, 
+		       minChainQ, chrom, beforeE, e, strand,
+		       thickStart, thickEnd, useThick, minMatch, 
+		       region, db, chainTableName, &bedList2, &unmappedBedList2);	
+	}
+    else
     error = remapRange(chainHash, minMatch, minSizeT, minSizeQ, minChainT, 
 		       minChainQ, chrom, s, e, strand,
 		       thickStart, thickEnd, useThick, minMatch, 
 		       region, db, chainTableName, &bedList, &unmappedBedList);
-    if (error == NULL)
+    if (doEnds && !error && !error2 && bedList && bedList2 && (slCount(bedList) == 1) && (slCount(bedList2) == 1)
+	&& sameString(bedList->chrom, bedList2->chrom) && (bedList->chromStart < bedList2->chromEnd)
+	&& (((wordCount >= 6) && ((bedPlus == 0) || (bedPlus >= 6)) && (bedList->strand[0] == bedList2->strand[0])) || (wordCount < 6) || (bedPlus < 6)))
+	{
+	/* really the most restrictive in terms of mapping */
+	if (hasBin)
+	    fprintf(f, "%d\t", binFromRange(bedList->chromStart, bedList2->chromEnd));
+	fprintf(f, "%s\t%d\t%d", bedList->chrom, bedList->chromStart, bedList2->chromEnd);
+	for (i=3; i<wordCount; ++i)
+	    {
+	    if (i == 5 && (bedPlus == 0 || bedPlus >= 6))
+		/* get strand from remap */
+		fprintf(f, "\t%c", bedList->strand[0]);
+	    else
+		/* everything else just passed through */
+		fprintf(f, "\t%s", words[i]);
+	    }
+	fprintf(f, "\n");
+	ct++;
+	}
+    else if (doEnds)
+	{
+	/* error like below */
+	f = unmapped;
+        strandString[0] = strand;
+        strandString[1] = 0;
+        words[5] = strandString;
+	if (error || error2)
+	    fprintf(f, "#%s on one or both ends\n", (error) ? error : error2);
+	else if (!bedList || !bedList2 || (slCount(bedList) > 1) || (slCount(bedList2) > 1)
+		 || !sameString(bedList->chrom, bedList2->chrom) || (bedList->chromStart >= bedList2->chromEnd)
+		 || (((wordCount >= 6) && ((bedPlus == 0) || (bedPlus >= 6))) && (bedList->strand[0] != bedList2->strand[0])))
+	    fprintf(f, "#ends mapped differently or incompletely\n");
+        fprintf(f, "%s\t%d\t%d", chrom, s, e);
+        for (i=3; i<wordCount; ++i)
+            fprintf(f, "\t%s", words[i]);
+        fprintf(f, "\n");
+	errs++;
+	}
+    else if (error == NULL)
         {
         /* successfully mapped */
         int ix = 1;
         struct bed *bed, *next = bedList->next;
         for (bed = bedList; bed != NULL; bed = next)
             {
             if (hasBin)
                 fprintf(f, "%d\t", 
                         binFromRange(bed->chromStart, bed->chromEnd));
             fprintf(f, "%s\t%d\t%d", bed->chrom, 
                                     bed->chromStart, bed->chromEnd);
             if (multiple)
                 {
                 /* region name and part number */
                 fprintf(f, "\t%s\t%d", region, ix++);
@@ -501,30 +560,48 @@
 	fprintf(f, "#%s\n", error);
         fprintf(f, "%s\t%d\t%d", chrom, s, e);
         for (i=3; i<wordCount; ++i)
             fprintf(f, "\t%s", words[i]);
         fprintf(f, "\n");
         errs++;
         }
     }
 if (errCt)
     *errCt = errs;
 mappedRatio = (totalBases - totalUnmappedAll)*100.0 / totalBases;
 verbose(2, "Mapped bases: \t%5.0f%%\n", mappedRatio);
 return ct;
 }
 
+static int bedOverSmall(struct lineFile *lf, int fieldCount, 
+                        struct hash *chainHash, double minMatch, int minSizeT, 
+                        int minSizeQ, int minChainT, int minChainQ, 
+			FILE *mapped, FILE *unmapped, bool multiple, 
+                        char *chainTable, int bedPlus, bool hasBin, 
+                        bool tabSep, int *errCt)
+/* Do a bed without a block-list.
+ * NOTE: it would be preferable to have all of the lift
+ * functions work at the line level, rather than the file level.
+ * Multiple option can be used with bed3 -- it will write a list of
+ * regions as a bed4, where score is the "part #". This is used for
+ * ENCODE region mapping */  
+{
+return bedOverSmallEnds(lf, fieldCount, chainHash, minMatch, 
+                        minSizeT, minSizeQ, minChainT, minChainQ, mapped, unmapped, 
+			multiple, chainTable, bedPlus, hasBin, tabSep, 0, errCt);
+}
+
 static void shortGffLine(struct lineFile *lf)
 /* Complain about short line in GFF and abort. */
 {
 errAbort("Expecting at least 8 words line %d of %s", lf->lineIx, lf->fileName);
 }
 
 static int gffNeedNum(struct lineFile *lf, char *s)
 /* Convert s to an integer or die trying. */
 {
 char c = *s;
 if (isdigit(c) || c == '-')
     return atoi(s);
 else
     errAbort("Expecting number line %d of %s", lf->lineIx, lf->fileName);
 return 0;
@@ -1041,72 +1118,97 @@
 	bedOutputN(bed, bedCount, unmapped, '\t', 
                 (bedCount != wordCount) ? '\t':'\n');
         /* print extra "non-bed" fields in line */
         for (i = bedCount; i < wordCount; i++)
             fprintf(unmapped, "%s%c", words[i], 
                                 (i == wordCount-1) ? '\n':'\t');
         errs++;
 	}
     bedFree(&bed);
     }
 if (errCt)
     *errCt = errs;
 return ct;
 }
 
-int liftOverBedPlus(char *fileName, struct hash *chainHash, double minMatch,  
+
+
+int liftOverBedPlusEnds(char *fileName, struct hash *chainHash, double minMatch,  
                     double minBlocks, int minSizeT, int minSizeQ, int minChainT,
                     int minChainQ, bool fudgeThick, FILE *f, FILE *unmapped, 
                     bool multiple, char *chainTable, int bedPlus, bool hasBin, 
-                    bool tabSep, int *errCt)
+                    bool tabSep, int ends, int *errCt)
 /* Lift bed N+ file.
  * Return the number of records successfully converted */
 {
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 int wordCount;
 int bedFieldCount = bedPlus;
 char *line;
 char *words[LIFTOVER_MAX_WORDS];
 int ct = 0;
 
 if (lineFileNextReal(lf, &line))
     {
     line = cloneString(line);
     if (tabSep)
         wordCount = chopByChar(line, '\t', words, ArraySize(words));
     else
         wordCount = chopLine(line, words);
     if (hasBin)
         wordCount--;
     lineFileReuse(lf);
     freez(&line);
     if (wordCount < 3)
 	 errAbort("Data format error: expecting at least 3 fields in BED file (%s)", fileName);
     if (bedFieldCount == 0)
         bedFieldCount = wordCount;
     if (bedFieldCount <= 10)
+	{
+	if (ends)
+	    ct = bedOverSmallEnds(lf, wordCount, chainHash, minMatch, 
+                        minSizeT, minSizeQ, minChainT, minChainQ, f, unmapped, 
+			multiple, chainTable, bedPlus, hasBin, tabSep, ends, errCt);
+	else
 	 ct = bedOverSmall(lf, wordCount, chainHash, minMatch, 
                         minSizeT, minSizeQ, minChainT, minChainQ, f, unmapped, 
                         multiple, chainTable, bedPlus, hasBin, tabSep, errCt);
+	}
+    else if (ends)
+	errAbort("Cannot use -ends with blocked BED\n");
     else
 	 ct = bedOverBig(lf, wordCount, chainHash, minMatch, minBlocks, 
                         fudgeThick, f, unmapped, bedPlus, hasBin, tabSep, errCt);
     }
 lineFileClose(&lf);
 return ct;
 }
 
+int liftOverBedPlus(char *fileName, struct hash *chainHash, double minMatch,  
+                    double minBlocks, int minSizeT, int minSizeQ, int minChainT,
+                    int minChainQ, bool fudgeThick, FILE *f, FILE *unmapped, 
+                    bool multiple, char *chainTable, int bedPlus, bool hasBin, 
+                    bool tabSep, int *errCt)
+/* Lift bed N+ file.
+ * Return the number of records successfully converted */
+{
+return liftOverBedPlusEnds(fileName, chainHash, minMatch, minBlocks,
+                        minSizeT, minSizeQ, minChainT, minChainQ,
+                        fudgeThick, f, unmapped, multiple, chainTable,
+			    0, FALSE, FALSE, 0, errCt);
+}
+
 int liftOverBed(char *fileName, struct hash *chainHash, 
                         double minMatch,  double minBlocks, 
                         int minSizeT, int minSizeQ,
                         int minChainT, int minChainQ,
                         bool fudgeThick, FILE *f, FILE *unmapped, 
                         bool multiple, char *chainTable, int *errCt)
 /* Open up file, decide what type of bed it is, and lift it. 
  * Return the number of records successfully converted */
 {
 return liftOverBedPlus(fileName, chainHash, minMatch, minBlocks,
                         minSizeT, minSizeQ, minChainT, minChainQ,
                         fudgeThick, f, unmapped, multiple, chainTable,
                         0, FALSE, FALSE, errCt);
 }