d83e58012a5e77fe65f840071d230e171bc6b2a6
angie
  Tue Oct 5 15:39:39 2010 -0700
Redmine Track #1261 (Construct track of "bad apple" coverage anomalies in human genome): Added -allowStartEqualEnd (as in hgLoadBed) so that 0-length items like insertions aren't lost -- Mary-Claire King pointed out the lack of insertions in the first version of the track, oops. Majorly refactored bedIntersect.c and added test cases.
diff --git src/hg/bedIntersect/bedIntersect.c src/hg/bedIntersect/bedIntersect.c
index a716395..c077021 100644
--- src/hg/bedIntersect/bedIntersect.c
+++ src/hg/bedIntersect/bedIntersect.c
@@ -9,14 +9,16 @@
 
 static boolean aHitAny = FALSE;
 static boolean bScore = FALSE;
-static float aCoverage = 0.00001;
+static float minCoverage = 0.00001;
 static boolean strictTab = FALSE;
+static boolean allowStartEqualEnd = FALSE;
 
 static struct optionSpec optionSpecs[] = {
     {"aHitAny", OPTION_BOOLEAN},
     {"bScore", OPTION_BOOLEAN},
-    {"aCoverage", OPTION_FLOAT},
+    {"minCoverage", OPTION_FLOAT},
     {"tab", OPTION_BOOLEAN},
+    {"allowStartEqualEnd", OPTION_BOOLEAN},
     {NULL, 0}
 };
 
@@ -30,21 +32,16 @@
   "   bedIntersect a.bed b.bed output.bed\n"
   "options:\n"
   "   -aHitAny        output all of a if any of it is hit by b\n"
-  "   -aCoverage=0.N  min coverage of b to output match.  Default .00001\n"
+  "   -minCoverage=0.N  min coverage of b to output match (or if -aHitAny, of a).\n"
+  "                   Not applied to 0-length items.  Default %f\n"
   "   -bScore         output score from b.bed (must be at least 5 field bed)\n"
   "   -tab            chop input at tabs not spaces\n"
+  "   -allowStartEqualEnd  Don't discard 0-length items of a or b\n"
+  "                        (e.g. point insertions)\n",
+  minCoverage
   );
 }
 
-struct bed3 
-/* A three field bed. */
-    {
-    struct bed3 *next;
-    char *chrom;	/* Allocated in hash. */
-    int start;		/* Start (0 based) */
-    int end;		/* End (non-inclusive) */
-    };
-
 struct bed5 
 /* A five field bed. */
     {
@@ -60,16 +57,16 @@
 /* Read bed and return it as a hash keyed by chromName
  * with binKeeper values. */
 {
-char *row[3];
+char *row[5];
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
-struct binKeeper *bk;
 struct hash *hash = newHash(0);
-struct hashEl *hel;
-struct bed3 *bed;
+int expectedCols = bScore ? 5 : 3;
 
-while (lineFileRow(lf, row))
+while (lineFileNextRow(lf, row, expectedCols))
     {
-    hel = hashLookup(hash, row[0]);
+    struct binKeeper *bk;
+    struct bed5 *bed;
+    struct hashEl *hel = hashLookup(hash, row[0]);
     if (hel == NULL)
        {
        bk = binKeeperNew(0, 1024*1024*1024);
@@ -80,102 +77,92 @@
     bed->chrom = hel->name;
     bed->start = lineFileNeedNum(lf, row, 1);
     bed->end = lineFileNeedNum(lf, row, 2);
+    if (bScore)
+	bed->score = lineFileNeedNum(lf, row, 4);
     if (bed->start > bed->end)
         errAbort("start after end line %d of %s", lf->lineIx, lf->fileName);
+    if (allowStartEqualEnd && bed->start == bed->end)
+	// Note we are tweaking binKeeper coords here, so use bed->start and bed->end.
+	binKeeperAdd(bk, max(0, bed->start-1), bed->end+1, bed);
+    else
     binKeeperAdd(bk, bed->start, bed->end, bed);
     }
 lineFileClose(&lf);
 return hash;
 }
 
-struct hash *readBed5(char *fileName)
-/* Read bed and return it as a hash keyed by chromName
- * with binKeeper values. */
-{
-char *row[5];
-struct lineFile *lf = lineFileOpen(fileName, TRUE);
-struct binKeeper *bk;
-struct hash *hash = newHash(0);
-struct hashEl *hel;
-struct bed5 *bed;
-
-while (lineFileRow(lf, row))
-    {
-    hel = hashLookup(hash, row[0]);
-    if (hel == NULL)
+void outputBed(FILE *f, char **row, int wordCount, int aStart, int aEnd, struct bed5 *b)
+/* Print a row of bed, possibly substituting in a different start, end and score. */
+{
+int s = aHitAny ? aStart : max(aStart, b->start);
+int e = aHitAny ? aEnd : min(aEnd, b->end);
+if (s == e && !allowStartEqualEnd)
+    return;
+int i;
+fprintf(f, "%s\t%d\t%d", row[0], s, e);
+for (i = 3 ; i<wordCount; i++)
        {
-       bk = binKeeperNew(0, 511*1024*1024);
-       hel = hashAdd(hash, row[0], bk);
+    if (bScore && i==4)
+	fprintf(f, "\t%d", b->score);
+    else
+	fprintf(f, "\t%s", row[i]);
        }
-    bk = hel->val;
-    AllocVar(bed);
-    bed->chrom = hel->name;
-    bed->start = lineFileNeedNum(lf, row, 1);
-    bed->end = lineFileNeedNum(lf, row, 2);
-    bed->score = lineFileNeedNum(lf, row, 4);
-    if (bed->start > bed->end)
-        errAbort("start after end line %d of %s", lf->lineIx, lf->fileName);
-    binKeeperAdd(bk, bed->start, bed->end, bed);
+fprintf(f, "\n");
     }
-lineFileClose(&lf);
-return hash;
+
+float getCov(int aStart, int aEnd, struct bed5 *b)
+/* Compute coverage of a's start and end vs. b's, taking aHitAny and
+ * allowStartEqualEnd into account. */
+{
+float overlap = positiveRangeIntersection(aStart, aEnd, b->start, b->end);
+float denominator = aHitAny ? (aEnd - aStart) : (b->end - b->start);
+float cov = (denominator == 0) ? 0.0 : overlap/denominator;
+if (allowStartEqualEnd && (aStart == aEnd || b->start == b->end))
+    cov = 1.0;
+return cov;
 }
+
 void bedIntersect(char *aFile, char *bFile, char *outFile)
 /* bedIntersect - Intersect two bed files. */
 {
-struct hash *bHash = NULL;
 struct lineFile *lf = lineFileOpen(aFile, TRUE);
+struct hash *bHash = readBed(bFile);
 FILE *f = mustOpen(outFile, "w");
 char *row[40];
-char *name;
-int start, end, i, wordCount;
-struct binElement *hitList = NULL, *hit;
-struct binKeeper *bk;
-struct bed5 *bed;
+int wordCount;
 
-if (bScore)
-    bHash = readBed5(bFile);
-else
-    bHash = readBed(bFile);
 while ((wordCount = (strictTab ? lineFileChopTab(lf, row) : lineFileChop(lf, row))) != 0)
     {
-    name = row[0];
-    start = lineFileNeedNum(lf, row, 1);
-    end = lineFileNeedNum(lf, row, 2);
+    char *chrom = row[0];
+    int start = lineFileNeedNum(lf, row, 1);
+    int end = lineFileNeedNum(lf, row, 2);
     if (start > end)
         errAbort("start after end line %d of %s", lf->lineIx, lf->fileName);
-    bk = hashFindVal(bHash, name);
+    struct binKeeper *bk = hashFindVal(bHash, chrom);
     if (bk != NULL)
 	{
+	struct binElement *hitList = NULL, *hit;
+	if (allowStartEqualEnd && start == end)
+	    hitList = binKeeperFind(bk, start-1, end+1);
+	else
 	hitList = binKeeperFind(bk, start, end);
 	if (aHitAny)
 	    {
-	    if (hitList != NULL)
-                {
                 for (hit = hitList; hit != NULL; hit = hit->next)
                     {
-                    float overlap = (float)positiveRangeIntersection(start, end, hit->start, hit->end);
-                    if (overlap/(float)(end-start) > aCoverage)
-                        {
-                        fprintf(f, "%s\t%d\t%d", name, start, end);
-                        for (i = 3 ; i<wordCount; i++)
-                            {
-                            if (bScore && i==4)
+		float cov = getCov(start, end, hit->val);
+		if (cov >= minCoverage)
                                 {
-                                bed = hit->val;
-                                fprintf(f, "\t%d",bed->score);
-                                }
-                            else
-                                fprintf(f, "\t%s",row[i]);
-                            }
-                        fprintf(f, "\n");
+		    outputBed(f, row, wordCount, start, end, hit->val);
                         break;
                         }
                     else
                         {
-                        printf("filter out %s %d %d %d %d overlap %.1f %d %d %.3f\n",
-                                name,start,end, hit->start,hit->end, overlap, end-start, hit->end-hit->start,overlap/(float)(end-start));
-                        }
+		    struct bed5 *b = hit->val;
+		    verbose(1, "filter out %s %d %d %d %d overlap %d %d %d %.3f\n",
+			    chrom, start, end, b->start, b->end,
+			    positiveRangeIntersection(start, end, b->start, b->end),
+			    end-start, b->end-b->start, cov);
                     }
                 }
 	    }
@@ -183,24 +170,8 @@
 	    {
 	    for (hit = hitList; hit != NULL; hit = hit->next)
 	        {
-		int s = max(start, hit->start);
-		int e = min(end, hit->end);
-                float overlap = (float)positiveRangeIntersection(start, end, hit->start, hit->end);
-		if ((s < e) && (overlap/(float)(hit->end-hit->start) > aCoverage))
-                    {
-		    fprintf(f, "%s\t%d\t%d", name, s, e);
-                    for (i = 3 ; i<wordCount; i++)
-                        {
-                        if (bScore && i==4)
-                            {
-                            bed = hit->val;
-                            fprintf(f, "\t%d",bed->score);
-                            }
-                        else
-                            fprintf(f, "\t%s",row[i]);
-                        }
-                    fprintf(f, "\n");
-                    }
+		if (getCov(start, end, hit->val) >= minCoverage)
+		    outputBed(f, row, wordCount, start, end, hit->val);
 		}
 	    }
 	slFreeList(&hitList);
@@ -215,8 +186,9 @@
 
 aHitAny = optionExists("aHitAny");
 bScore = optionExists("bScore");
-aCoverage = optionFloat("aCoverage", aCoverage);
+minCoverage = optionFloat("minCoverage", minCoverage);
 strictTab = optionExists("tab");
+allowStartEqualEnd = optionExists("allowStartEqualEnd");
 if (argc != 4)
     usage();
 bedIntersect(argv[1], argv[2], argv[3]);