12d0bf30d4a9fbe7aee60fdf9e7f8827116ba400
markd
  Wed Jan 27 17:49:12 2021 -0800
allow overlap select to ignore non-standard columns in BED as well as genePred and PSL, as often come with bigBed

diff --git src/hg/utils/overlapSelect/overlapSelect.c src/hg/utils/overlapSelect/overlapSelect.c
index 1c15cd8..8e26915 100644
--- src/hg/utils/overlapSelect/overlapSelect.c
+++ src/hg/utils/overlapSelect/overlapSelect.c
@@ -51,30 +51,32 @@
 /* incompatible with aggregate */
 static char *aggIncompatible[] =
 {
     "overlapSimilarity", "overlapSimilarityCeil", "overlapThresholdCeil", "overlapBases", "merge", "mergeOutput", "idMatch", NULL
 };
 
 /* file format constants */
 enum recordFmt {
     UNKNOWN_FMT,
     PSL_FMT,
     PSLQ_FMT,
     CHAIN_FMT,
     CHAINQ_FMT,
     GENEPRED_FMT,
     BED_FMT,
+    BED3P_FMT,
+    BED6P_FMT,
     COORD_COLS_FMT
 };
 
 /* Options parsed from the command line */
 enum recordFmt selectFmt = UNKNOWN_FMT;
 struct coordCols selectCoordCols;
 unsigned selectCaOpts = 0;
 
 unsigned inFmt = UNKNOWN_FMT;
 struct coordCols inCoordCols;
 unsigned inCaOpts = 0;
 
 unsigned selectOpts = 0;
 boolean useAggregate = FALSE;
 boolean nonOverlapping = FALSE;
@@ -88,30 +90,34 @@
 enum recordFmt parseFormatSpec(char *fmt)
 /* parse a format specification */
 {
 if (sameString(fmt, "psl"))
     return PSL_FMT;
 if (sameString(fmt, "pslq"))
     return PSLQ_FMT;
 if (sameString(fmt, "chain"))
     return CHAIN_FMT;
 if (sameString(fmt, "chainq"))
     return CHAINQ_FMT;
 if (sameString(fmt, "genePred"))
     return GENEPRED_FMT;
 if (sameString(fmt, "bed"))
     return BED_FMT;
+if (sameString(fmt, "bed3+"))
+    return BED3P_FMT;
+if (sameString(fmt, "bed6+"))
+    return BED6P_FMT;
 errAbort("invalid file format: %s", fmt);
 return UNKNOWN_FMT;
 }
 
 enum recordFmt getFileFormat(char *path)
 /* determine the file format from the specified file extension */
 {
 char *filePath = path;
 char filePathBuf[PATH_LEN];
 
 if (endsWith(filePath, ".gz") || endsWith(filePath, ".bz2") || endsWith(filePath, ".Z"))
     {
     /* strip .gz/.bz2/.Z extension */
     splitPath(path, NULL, filePathBuf, NULL);
     filePath = filePathBuf;
@@ -133,31 +139,35 @@
                                                     unsigned caOpts,
                                                     struct coordCols *cols)
 /* construct a reader.  The coordCols spec is only used for tab files */
 {
 switch (fmt)
     {
     case PSL_FMT:
     case PSLQ_FMT:
         return chromAnnPslReaderNew(fileName, caOpts);
     case CHAIN_FMT:
     case CHAINQ_FMT:
         return chromAnnChainReaderNew(fileName, caOpts);
     case GENEPRED_FMT:
         return chromAnnGenePredReaderNew(fileName, caOpts);
     case BED_FMT:
-        return chromAnnBedReaderNew(fileName, caOpts);
+        return chromAnnBedReaderNew(fileName, caOpts, 12);
+    case BED3P_FMT:
+        return chromAnnBedReaderNew(fileName, caOpts, 3);
+    case BED6P_FMT:
+        return chromAnnBedReaderNew(fileName, caOpts, 6);
     case COORD_COLS_FMT:
         return chromAnnTabReaderNew(fileName, cols, caOpts);
     case UNKNOWN_FMT:
         break; 
     }
 assert(FALSE);
 return NULL;
 }
 
 static char *getPrintId(struct chromAnn* ca)
 /* get id for output, or <unknown> if not known */
 {
 return (ca->name == NULL) ? "<unknown>" : ca->name;
 }
 
@@ -355,101 +365,120 @@
 /* enable for memory analysis */
 #if 0
 selectTableFree();
 #endif
 }
 
 void usage()
 /* usage message and abort */
 {
 static char *usageMsg =
 #include "usage.msg"
     ;
 errAbort("%s", usageMsg);
 }
 
-/* entry */
+static boolean hasStrand(unsigned fmt, struct coordCols* coordCols)
+/* does the format include strand? */
+{
+return !((selectFmt == BED3P_FMT) ||
+         ((selectFmt == COORD_COLS_FMT) && (coordCols->strandCol == -1)));
+}
+
 int main(int argc, char** argv)
+/* entry */
 {
 char *selectFile, *inFile, *outFile, *dropFile;
 optionInit(&argc, argv, optionSpecs);
 if (argc != 4)
     usage();
 selectFile = argv[1];
 inFile = argv[2];
 outFile = argv[3];
 
 /* select file options */
 if (optionExists("selectFmt") && optionExists("selectCoordCols"))
     errAbort("can't specify both -selectFmt and -selectCoordCols");
 
 if (optionExists("selectFmt"))
     selectFmt = parseFormatSpec(optionVal("selectFmt", NULL));
 else if (optionExists("selectCoordCols"))
     {
     selectCoordCols = coordColsParseSpec("selectCoordCols",
                                          optionVal("selectCoordCols", NULL));
     selectFmt = COORD_COLS_FMT;
     }
 else
     selectFmt = getFileFormat(selectFile);
+if ((selectFmt == BED3P_FMT) || (selectFmt == BED6P_FMT))
+    selectCaOpts |= chromAnnRange; // no blocks
 
 if (optionExists("selectCds"))
     selectCaOpts |= chromAnnCds;
 if (optionExists("selectRange"))
     selectCaOpts |= chromAnnRange;
 if ((selectFmt == PSLQ_FMT) || (selectFmt == CHAINQ_FMT))
     selectCaOpts |= chromAnnUseQSide;
 
 /* in file options */
 if (optionExists("inFmt") && optionExists("inCoordCols"))
     errAbort("can't specify both -inFmt and -inCoordCols");
 if (optionExists("inFmt"))
     inFmt = parseFormatSpec(optionVal("inFmt", NULL));
 else if (optionExists("inCoordCols"))
     {
     inCoordCols = coordColsParseSpec("inCoordCols",
                                      optionVal("inCoordCols", NULL));
     inFmt = COORD_COLS_FMT;
     }
 else
     inFmt = getFileFormat(inFile);
+if ((inFmt == BED3P_FMT) || (inFmt == BED6P_FMT))
+    inCaOpts |= chromAnnRange; // no blocks
 
 inCaOpts = chromAnnSaveLines; // need lines for output
 if (optionExists("inCds"))
     inCaOpts |= chromAnnCds;
 if (optionExists("inRange"))
     inCaOpts |= chromAnnRange;
 if ((inFmt == PSLQ_FMT) || (inFmt == CHAINQ_FMT))
     inCaOpts |= chromAnnUseQSide;
 
 /* select options */
 useAggregate = optionExists("aggregate");
 nonOverlapping = optionExists("nonOverlapping");
 if (optionExists("strand") && optionExists("oppositeStrand"))
     errAbort("can only specify one of -strand and -oppositeStrand");
 if (optionExists("strand"))
     selectOpts |= selStrand;
 if (optionExists("oppositeStrand"))
     selectOpts |= selOppositeStrand;
 if (optionExists("excludeSelf") && (optionExists("idMatch")))
     errAbort("can't specify both -excludeSelf and -idMatch");
 if (optionExists("excludeSelf"))
     selectOpts |= selExcludeSelf;
 if (optionExists("idMatch"))
     selectOpts |= selIdMatch;
 
+/* catch conflicts */
+boolean needStrand = (selectOpts & (selStrand | selOppositeStrand)) != 0;
+    if (!hasStrand(selectFmt, &selectCoordCols) && needStrand)
+    errAbort("request selecting by strand and selectFile doesn't have strand");
+if (!hasStrand(inFmt, &inCoordCols) && needStrand)
+    errAbort("request selecting by strand and inFile doesn't have strand");
+
+
 criteria.threshold = optionFloat("overlapThreshold", 0.0);
 criteria.thresholdCeil = optionFloat("overlapThresholdCeil", 1.1);
 criteria.similarity = optionFloat("overlapSimilarity", 0.0);
 criteria.similarityCeil = optionFloat("overlapSimilarityCeil", 1.1);
 criteria.bases = optionInt("overlapBases", -1);
 
 /* output options */
 mergeOutput = optionExists("mergeOutput");
 idOutput = optionExists("idOutput");
 statsOutput = optionExists("statsOutput") || optionExists("statsOutputAll") || optionExists("statsOutputBoth");
 if ((mergeOutput + idOutput + statsOutput) > 1)
     errAbort("can only specify one of -mergeOutput, -idOutput, -statsOutput, -statsOutputAll, or -statsOutputBoth");
 outputAll = optionExists("statsOutputAll");
 outputBoth = optionExists("statsOutputBoth");
 if (outputBoth)