b40094812a7c35cef96e0d9b94fa421262947b13
hiram
  Tue Jul 11 14:43:23 2017 -0700
adding truncate option and tests to verify everything works refs #19514

diff --git src/utils/bedClip/bedClip.c src/utils/bedClip/bedClip.c
index 1b67d59..9b72f4b 100644
--- src/utils/bedClip/bedClip.c
+++ src/utils/bedClip/bedClip.c
@@ -1,93 +1,115 @@
 /* bedClip - Remove lines from bed file that refer to off-chromosome places.. */
 
 /* Copyright (C) 2011 The Regents of the University of California 
  * See README in this or parent directory for licensing information. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "bbiFile.h"
 #include "sqlNum.h"
 #include "obscure.h"
 
-
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
-  "bedClip - Remove lines from bed file that refer to off-chromosome places.\n"
+  "bedClip - Remove lines from bed file that refer to off-chromosome locations.\n"
   "usage:\n"
-  "   bedClip input.bed chrom.sizes output.bed\n"
+  "   bedClip [options] input.bed chrom.sizes output.bed\n"
   "chrom.sizes is a two-column file/URL: <chromosome name> <size in bases>\n"
   "If the assembly <db> is hosted by UCSC, chrom.sizes can be a URL like\n"
   "  http://hgdownload.cse.ucsc.edu/goldenPath/<db>/bigZips/<db>.chrom.sizes\n"
   "or you may use the script fetchChromSizes to download the chrom.sizes file.\n"
   "If not hosted by UCSC, a chrom.sizes file can be generated by running\n"
   "twoBitInfo on the assembly .2bit file.\n"
   "options:\n"
-  "   -verbose=2 - set to get list of lines clipped and why\n"
+  "   -truncate  - truncate items that span ends of chrom instead of the\n"	/* registered in options[], read in main() */
+  "                default of dropping the items\n"
+  "   -verbose=2 - set to get list of lines clipped and why"	/* no trailing \n here; presumably errAbort ends the message itself - TODO confirm */
   );
 }
 
 static struct optionSpec options[] = {
+   {"truncate", OPTION_BOOLEAN},	/* -truncate flag; main() tests it with optionExists() */
    {NULL, 0},
 };
 
+static boolean trim = FALSE;	// set in main() from -truncate, read by bedClip(); the name truncate is already taken
+
 void bedClip(char *inFile, char *chromSizes, char *outFile)
 /* bedClip - Remove lines from bed file that refer to off-chromosome places.. */
 {
 struct hash *chromSizesHash = bbiChromSizesFromFile(chromSizes);
 struct lineFile *lf = lineFileOpen(inFile, TRUE);
 FILE *f = mustOpen(outFile, "w");
 char *line;
 while (lineFileNextReal(lf, &line))
     {
     char *chrom = nextWord(&line);
     char *startString = nextWord(&line);
     char *endString = nextWord(&line);
     if (endString == NULL)
         errAbort("Need at least three fields line %d of %s", lf->lineIx, lf->fileName);
     if (startString[0] == '-')
 	{
-	verbose(2, "Clipping negative line %d of %s\n", lf->lineIx, lf->fileName);
+	if (trim)	// -truncate: keep the item but clamp its start to 0
+	    {
+	    verbose(2, "Truncating negative start line %d of %s: %s:%s-%s\n", lf->lineIx, lf->fileName, chrom, startString, endString);
+	    startString = "0";	// parsed into start below; later messages show the clamped value
+	    }
+	else
+	    {
+	    verbose(2, "Clipping negative line %d of %s: %s:%s-%s\n", lf->lineIx, lf->fileName, chrom, startString, endString);
 	    continue;		// Clip off negatives
 	    }
+	}
     if (!isdigit(startString[0]))
-        errAbort("Expecting number got %s line %d of %s", startString, lf->lineIx, lf->fileName);
+        errAbort("Expecting number got %s line %d of %s: %s:%s-%s", startString, lf->lineIx, lf->fileName, chrom, startString, endString);
     if (!isdigit(endString[0]))
-        errAbort("Expecting number got %s line %d of %s", endString, lf->lineIx, lf->fileName);
+        errAbort("Expecting number got %s line %d of %s: %s:%s-%s", endString, lf->lineIx, lf->fileName, chrom, startString, endString);
     int start = sqlUnsigned(startString);
     int end = sqlUnsigned(endString);
     if (start >= end)
 	{
-	verbose(2, "Clipping end <= start line %d of %s\n", lf->lineIx, lf->fileName);
+	verbose(2, "Clipping end <= start line %d of %s: %s:%s-%s\n", lf->lineIx, lf->fileName, chrom, startString, endString);
 	continue;
 	}
     struct hashEl *hel = hashLookup(chromSizesHash, chrom);
     if (hel == NULL)
-        errAbort("Chromosome %s isn't in %s line %d of %s\n", chrom, chromSizes, lf->lineIx, lf->fileName);
+        errAbort("Chromosome %s isn't in %s line %d of %s: %s:%s-%s\n", chrom, chromSizes, lf->lineIx, lf->fileName, chrom, startString, endString);
     int chromSize = ptToInt(hel->val);
     if (end > chromSize)
 	{
-	verbose(2, "Clipping end > chromSize line %d of %s\n", lf->lineIx, lf->fileName);
+	if (trim && start < chromSize)	// truncate only if part of the item is on the chrom; else clamping would leave start >= end, so drop it
+	    {
+	    end = chromSize;
+	    verbose(2, "Truncating end > chromSize(%d) line %d of %s: %s:%s-%s\n", chromSize, lf->lineIx, lf->fileName, chrom, startString, endString);
+	    }
+	else
+	    {
+	    verbose(2, "Clipping end > chromSize(%d) line %d of %s: %s:%s-%s\n", chromSize, lf->lineIx, lf->fileName, chrom, startString, endString);
 	    continue;
 	    }
-    fprintf(f, "%s\t%s\t%s", chrom, startString, endString);
+	}
+    fprintf(f, "%s\t%d\t%d", chrom, start, end);	// print the parsed ints so truncated coordinates are what gets written
     line = skipLeadingSpaces(line);
     if (line == NULL || line[0] == 0)
         fputc('\n', f);
     else
         fprintf(f, "\t%s\n", line);
     }
 carefulClose(&f);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 4)
     usage();
+trim = optionExists("truncate");	// -truncate given: bedClip() truncates off-end items instead of dropping them
+
 bedClip(argv[1], argv[2], argv[3]);
 return 0;
 }