12d0bf30d4a9fbe7aee60fdf9e7f8827116ba400 markd Wed Jan 27 17:49:12 2021 -0800 allow overlap select to ignore non-standard columns in BED as well as genePred and PSL, as often come with bigBed diff --git src/hg/utils/overlapSelect/overlapSelect.c src/hg/utils/overlapSelect/overlapSelect.c index 1c15cd8..8e26915 100644 --- src/hg/utils/overlapSelect/overlapSelect.c +++ src/hg/utils/overlapSelect/overlapSelect.c @@ -51,30 +51,32 @@ /* incompatible with aggregate */ static char *aggIncompatible[] = { "overlapSimilarity", "overlapSimilarityCeil", "overlapThresholdCeil", "overlapBases", "merge", "mergeOutput", "idMatch", NULL }; /* file format constants */ enum recordFmt { UNKNOWN_FMT, PSL_FMT, PSLQ_FMT, CHAIN_FMT, CHAINQ_FMT, GENEPRED_FMT, BED_FMT, + BED3P_FMT, + BED6P_FMT, COORD_COLS_FMT }; /* Options parsed from the command line */ enum recordFmt selectFmt = UNKNOWN_FMT; struct coordCols selectCoordCols; unsigned selectCaOpts = 0; unsigned inFmt = UNKNOWN_FMT; struct coordCols inCoordCols; unsigned inCaOpts = 0; unsigned selectOpts = 0; boolean useAggregate = FALSE; boolean nonOverlapping = FALSE; @@ -88,30 +90,34 @@ enum recordFmt parseFormatSpec(char *fmt) /* parse a format specification */ { if (sameString(fmt, "psl")) return PSL_FMT; if (sameString(fmt, "pslq")) return PSLQ_FMT; if (sameString(fmt, "chain")) return CHAIN_FMT; if (sameString(fmt, "chainq")) return CHAINQ_FMT; if (sameString(fmt, "genePred")) return GENEPRED_FMT; if (sameString(fmt, "bed")) return BED_FMT; +if (sameString(fmt, "bed3+")) + return BED3P_FMT; +if (sameString(fmt, "bed6+")) + return BED6P_FMT; errAbort("invalid file format: %s", fmt); return UNKNOWN_FMT; } enum recordFmt getFileFormat(char *path) /* determine the file format from the specified file extension */ { char *filePath = path; char filePathBuf[PATH_LEN]; if (endsWith(filePath, ".gz") || endsWith(filePath, ".bz2") || endsWith(filePath, ".Z")) { /* strip .gz/.bz2/.Z extension */ splitPath(path, NULL, filePathBuf, NULL); filePath = filePathBuf; @@ -133,31 +139,35 @@ unsigned caOpts, struct coordCols *cols) /* construct a reader. The coordCols spec is only used for tab files */ { switch (fmt) { case PSL_FMT: case PSLQ_FMT: return chromAnnPslReaderNew(fileName, caOpts); case CHAIN_FMT: case CHAINQ_FMT: return chromAnnChainReaderNew(fileName, caOpts); case GENEPRED_FMT: return chromAnnGenePredReaderNew(fileName, caOpts); case BED_FMT: - return chromAnnBedReaderNew(fileName, caOpts); + return chromAnnBedReaderNew(fileName, caOpts, 12); + case BED3P_FMT: + return chromAnnBedReaderNew(fileName, caOpts, 3); + case BED6P_FMT: + return chromAnnBedReaderNew(fileName, caOpts, 6); case COORD_COLS_FMT: return chromAnnTabReaderNew(fileName, cols, caOpts); case UNKNOWN_FMT: break; } assert(FALSE); return NULL; } static char *getPrintId(struct chromAnn* ca) /* get id for output, or <unknown> if not known */ { return (ca->name == NULL) ? "<unknown>" : ca->name; } @@ -355,101 +365,120 @@ /* enable for memory analysis */ #if 0 selectTableFree(); #endif } void usage() /* usage message and abort */ { static char *usageMsg = #include "usage.msg" ; errAbort("%s", usageMsg); } -/* entry */ +static boolean hasStrand(unsigned fmt, struct coordCols* coordCols) +/* does the format include strand? */ +{ +return !((selectFmt == BED3P_FMT) || + ((selectFmt == COORD_COLS_FMT) && (coordCols->strandCol == -1))); +} + int main(int argc, char** argv) +/* entry */ { char *selectFile, *inFile, *outFile, *dropFile; optionInit(&argc, argv, optionSpecs); if (argc != 4) usage(); selectFile = argv[1]; inFile = argv[2]; outFile = argv[3]; /* select file options */ if (optionExists("selectFmt") && optionExists("selectCoordCols")) errAbort("can't specify both -selectFmt and -selectCoordCols"); if (optionExists("selectFmt")) selectFmt = parseFormatSpec(optionVal("selectFmt", NULL)); else if (optionExists("selectCoordCols")) { selectCoordCols = coordColsParseSpec("selectCoordCols", optionVal("selectCoordCols", NULL)); selectFmt = COORD_COLS_FMT; } else selectFmt = getFileFormat(selectFile); +if ((selectFmt == BED3P_FMT) || (selectFmt == BED6P_FMT)) + selectCaOpts |= chromAnnRange; // no blocks if (optionExists("selectCds")) selectCaOpts |= chromAnnCds; if (optionExists("selectRange")) selectCaOpts |= chromAnnRange; if ((selectFmt == PSLQ_FMT) || (selectFmt == CHAINQ_FMT)) selectCaOpts |= chromAnnUseQSide; /* in file options */ if (optionExists("inFmt") && optionExists("inCoordCols")) errAbort("can't specify both -inFmt and -inCoordCols"); if (optionExists("inFmt")) inFmt = parseFormatSpec(optionVal("inFmt", NULL)); else if (optionExists("inCoordCols")) { inCoordCols = coordColsParseSpec("inCoordCols", optionVal("inCoordCols", NULL)); inFmt = COORD_COLS_FMT; } else inFmt = getFileFormat(inFile); +if ((inFmt == BED3P_FMT) || (inFmt == BED6P_FMT)) + inCaOpts |= chromAnnRange; // no blocks inCaOpts = chromAnnSaveLines; // need lines for output if (optionExists("inCds")) inCaOpts |= chromAnnCds; if (optionExists("inRange")) inCaOpts |= chromAnnRange; if ((inFmt == PSLQ_FMT) || (inFmt == CHAINQ_FMT)) inCaOpts |= chromAnnUseQSide; /* select options */ useAggregate = optionExists("aggregate"); nonOverlapping = optionExists("nonOverlapping"); if (optionExists("strand") && optionExists("oppositeStrand")) errAbort("can only specify one of -strand and -oppositeStrand"); if (optionExists("strand")) selectOpts |= selStrand; if (optionExists("oppositeStrand")) selectOpts |= selOppositeStrand; if (optionExists("excludeSelf") && (optionExists("idMatch"))) errAbort("can't specify both -excludeSelf and -idMatch"); if (optionExists("excludeSelf")) selectOpts |= selExcludeSelf; if (optionExists("idMatch")) selectOpts |= selIdMatch; +/* catch conflicts */ +boolean needStrand = (selectOpts & (selStrand | selOppositeStrand)) != 0; + if (!hasStrand(selectFmt, &selectCoordCols) && needStrand) + errAbort("request selecting by strand and selectFile doesn't have strand"); +if (!hasStrand(inFmt, &inCoordCols) && needStrand) + errAbort("request selecting by strand and inFile doesn't have strand"); + + criteria.threshold = optionFloat("overlapThreshold", 0.0); criteria.thresholdCeil = optionFloat("overlapThresholdCeil", 1.1); criteria.similarity = optionFloat("overlapSimilarity", 0.0); criteria.similarityCeil = optionFloat("overlapSimilarityCeil", 1.1); criteria.bases = optionInt("overlapBases", -1); /* output options */ mergeOutput = optionExists("mergeOutput"); idOutput = optionExists("idOutput"); statsOutput = optionExists("statsOutput") || optionExists("statsOutputAll") || optionExists("statsOutputBoth"); if ((mergeOutput + idOutput + statsOutput) > 1) errAbort("can only specify one of -mergeOutput, -idOutput, -statsOutput, -statsOutputAll, or -statsOutputBoth"); outputAll = optionExists("statsOutputAll"); outputBoth = optionExists("statsOutputBoth"); if (outputBoth)