src/utils/bedToBigBed/bedToBigBed.c 433ed3d13a7fd8e09901031c223c54d5abc77178

433ed3d13a7fd8e09901031c223c54d5abc77178
braney
  Fri Apr 24 13:11:54 2020 -0700
add a flag to bedToBigBed to allow exons to have 1bp overlapping

diff --git src/utils/bedToBigBed/bedToBigBed.c src/utils/bedToBigBed/bedToBigBed.c
index de134fd..a278fa9 100644
--- src/utils/bedToBigBed/bedToBigBed.c
+++ src/utils/bedToBigBed/bedToBigBed.c
@@ -25,30 +25,31 @@
  *   2.6 - Made it not crash on empty input.  
  *   */
 
 /* Things set directly or indirectly by command lne in main() routine. */
 int blockSize = 256;
 int itemsPerSlot = 512;
 char *extraIndex = NULL;
 int bedN = 0;   /* number of standard bed fields */
 int bedP = 0;   /* number of bed plus fields */
 char *asFile = NULL;
 char *asText = NULL;
 char *udcDir = NULL;
 static boolean doCompress = FALSE;
 static boolean tabSep = FALSE;
 static boolean sizesIs2Bit = FALSE;
+static boolean allow1bpOverlap = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "bedToBigBed v. %s - Convert bed file to bigBed. (bbi version: %d)\n"
   "usage:\n"
   "   bedToBigBed in.bed chrom.sizes out.bb\n"
   "Where in.bed is in one of the ascii bed formats, but not including track lines\n"
   "and chrom.sizes is a two-column file/URL: <chromosome name> <size in bases>\n"
   "and out.bb is the output indexed big bed file.\n"
   "If the assembly <db> is hosted by UCSC, chrom.sizes can be a URL like\n"
   "  http://hgdownload.soe.ucsc.edu/goldenPath/<db>/bigZips/<db>.chrom.sizes\n"
   "or you may use the script fetchChromSizes to download the chrom.sizes file.\n"
   "If you have bed annotations on patch sequences from NCBI, a more inclusive\n"
@@ -67,44 +68,46 @@
   "                      optional (+) if extra \"bedPlus\" fields, \n"
   "                      optional P specifies the number of extra fields. Not required, but preferred.\n"
   "                      Examples: -type=bed6 or -type=bed6+ or -type=bed6+3 \n"
   "                      (see http://genome.ucsc.edu/FAQ/FAQformat.html#format1)\n"
   "   -as=fields.as - If you have non-standard \"bedPlus\" fields, it's great to put a definition\n"
   "                   of each field in a row in AutoSql format here.\n"
   "   -blockSize=N - Number of items to bundle in r-tree.  Default %d\n"
   "   -itemsPerSlot=N - Number of data points bundled at lowest level. Default %d\n"
   "   -unc - If set, do not use compression.\n"
   "   -tab - If set, expect fields to be tab separated, normally\n"
   "           expects white space separator.\n"
   "   -extraIndex=fieldList - If set, make an index on each field in a comma separated list\n"
   "           extraIndex=name and extraIndex=name,id are commonly used.\n"
   "   -sizesIs2Bit  -- If set, the chrom.sizes file is assumed to be a 2bit file.\n"
   "   -udcDir=/path/to/udcCacheDir  -- sets the UDC cache dir for caching of remote files.\n"
+  "   -allow1bpOverlap  -- allow exons to overlap by at most one base pair\n"
   , version, bbiCurrentVersion, blockSize, itemsPerSlot
   );
 }
 
 static struct optionSpec options[] = {
    {"blockSize", OPTION_INT},
    {"itemsPerSlot", OPTION_INT},
    {"type", OPTION_STRING},
    {"as", OPTION_STRING},
    {"unc", OPTION_BOOLEAN},
    {"tab", OPTION_BOOLEAN},
    {"sizesIs2Bit", OPTION_BOOLEAN},
    {"extraIndex", OPTION_STRING},
    {"udcDir", OPTION_STRING},
+   {"allow1bpOverlap", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 int bbNamedFileChunkCmpByName(const void *va, const void *vb)
 /* Compare two named offset object to facilitate qsorting by name. */
 {
 const struct bbNamedFileChunk *a = va, *b = vb;
 return strcmp(a->name, b->name);
 }
 
 static int maxBedNameSize;
 
 void bbNamedFileChunkKey(const void *va, char *keyBuf)
 /* Copy name to keyBuf for bPlusTree maker */
 {
@@ -183,31 +186,31 @@
 long sectionStartIx = 0, sectionEndIx = 0;
 
 for (;;)
     {
     /* Get next line of input if any. */
     if (lineFileNextReal(lf, &line))
 	{
 	/* Chop up line and make sure the word count is right. */
 	int wordCount;
 	if (tabSep)
 	    wordCount = chopTabs(line, row);
 	else
 	    wordCount = chopLine(line, row);
 	lineFileExpectWords(lf, fieldCount, wordCount);
 
-	loadAndValidateBed(row, bedN, fieldCount, lf, bed, as, FALSE);
+	loadAndValidateBedExt(row, bedN, fieldCount, lf, bed, as, FALSE, allow1bpOverlap);
 
 	chrom = bed->chrom;
 	start = bed->chromStart;
 	end = bed->chromEnd;
 
 	sameChrom = sameString(chrom, usage->name);
 	}
     else  /* No next line */
 	{
 	atEnd = TRUE;
 	}
 
 
     /* Check conditions that would end block and save block info and advance to next if need be. */
     if (atEnd || !sameChrom || itemIx >= itemsPerSlot)
@@ -810,30 +813,31 @@
 bbFileCreate(inName, chromSizes, blockSize, itemsPerSlot, asText, as, 
 	doCompress, extraIndexList, outName);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 blockSize = optionInt("blockSize", blockSize);
 itemsPerSlot = optionInt("itemsPerSlot", itemsPerSlot);
 asFile = optionVal("as", asFile);
 doCompress = !optionExists("unc");
 sizesIs2Bit = optionExists("sizesIs2Bit");
 extraIndex = optionVal("extraIndex", NULL);
 tabSep = optionExists("tab");
+allow1bpOverlap = optionExists("allow1bpOverlap");
 udcDir = optionVal("udcDir", udcDefaultDir());
 if (argc != 4)
     usage();
 udcSetDefaultDir(udcDir);
 
 if (optionExists("type"))
     {
     // parse type
     char *btype = cloneString(optionVal("type", ""));
     char *plus = strchr(btype, '+');
     if (plus)
 	{
 	*plus++ = 0;
 	if (isdigit(*plus))
 	    bedP = sqlUnsigned(plus);