b95ff3835509b242bd9007ba55c4f60c1022da47
markd
  Thu Dec 30 14:58:16 2010 -0800
moved programs to hg/utils, fixed build of distributed utilities
diff --git src/hg/overlapSelect/overlapSelect.c src/hg/overlapSelect/overlapSelect.c
deleted file mode 100644
index 976e8b8..0000000
--- src/hg/overlapSelect/overlapSelect.c
+++ /dev/null
@@ -1,482 +0,0 @@
-/* overlapSelect - select records based on overlap of chromosome ranges */
-
-#include "common.h"
-#include "selectTable.h"
-#include "coordCols.h"
-#include "chromAnn.h"
-#include "dystring.h"
-#include "options.h"
-
-static char const rcsid[] = "$Id: overlapSelect.c,v 1.42 2009/07/31 18:09:23 markd Exp $";
-
-/* FIXME:
- * - would be nice to be able to specify ranges in the same manner
- *   as featureBits
- * - should keep header lines in files
- * - don't need to save infile records if stats output
- */
-
-/* Command-line options recognized by this program; main() hands this table
- * to optionInit().  Each entry pairs an option name (given on the command
- * line with a leading '-') with the kind of value it takes. */
-static struct optionSpec optionSpecs[] = {
-    {"selectFmt", OPTION_STRING},
-    {"selectCoordCols", OPTION_STRING},
-    {"selectCds", OPTION_BOOLEAN},
-    {"selectRange", OPTION_BOOLEAN},
-    {"inFmt", OPTION_STRING},
-    {"inCoordCols", OPTION_STRING},
-    {"inCds", OPTION_BOOLEAN},
-    {"inRange", OPTION_BOOLEAN},
-    {"nonOverlapping", OPTION_BOOLEAN},
-    {"strand", OPTION_BOOLEAN},
-    {"oppositeStrand", OPTION_BOOLEAN},
-    {"excludeSelf", OPTION_BOOLEAN},
-    {"idMatch", OPTION_BOOLEAN},
-    {"dropped", OPTION_STRING},
-    {"overlapThreshold", OPTION_FLOAT},
-    {"overlapThresholdCeil", OPTION_FLOAT},
-    {"overlapSimilarity", OPTION_FLOAT},
-    {"overlapSimilarityCeil", OPTION_FLOAT},
-    {"overlapBases", OPTION_INT},
-    {"merge", OPTION_BOOLEAN},
-    {"mergeOutput", OPTION_BOOLEAN},
-    {"statsOutput", OPTION_BOOLEAN},
-    {"statsOutputAll", OPTION_BOOLEAN},
-    {"statsOutputBoth", OPTION_BOOLEAN},
-    {"idOutput", OPTION_BOOLEAN},
-    {"aggregate", OPTION_BOOLEAN},
-    {NULL, 0}  /* last entry; presumably the end-of-table marker optionInit() expects */
-};
-
-/* Names of options that may not be combined with -aggregate.  main() walks
- * this NULL-terminated list and aborts if any listed option was given
- * together with -aggregate. */
-static char *aggIncompatible[] =
-{
-    "overlapSimilarity", "overlapSimilarityCeil", "overlapThresholdCeil", "overlapBases", "merge", "mergeOutput", "idMatch", NULL
-};
-
-/* File format constants.  A format is chosen per file from -selectFmt/-inFmt,
- * from -selectCoordCols/-inCoordCols, or from the file name extension. */
-enum recordFmt {
-    UNKNOWN_FMT,     /* not yet determined; initial value of selectFmt and inFmt */
-    PSL_FMT,         /* "psl" or a .psl file */
-    PSLQ_FMT,        /* "pslq"; only selectable by name, main() adds chromAnnUseQSide */
-    CHAIN_FMT,       /* "chain" or a .chain file */
-    CHAINQ_FMT,      /* "chainq"; only selectable by name, main() adds chromAnnUseQSide */
-    GENEPRED_FMT,    /* "genePred" or a .genePred/.gp file */
-    BED_FMT,         /* "bed" or a .bed file */
-    COORD_COLS_FMT   /* tab file whose coordinate columns come from a *CoordCols option */
-};
-
-/* Options parsed from the command line.  All are filled in by main() before
- * overlapSelect() runs. */
-
-/* select file: record format, coordinate columns (only meaningful for
- * COORD_COLS_FMT) and chromAnn reader option flags */
-enum recordFmt selectFmt = UNKNOWN_FMT;
-struct coordCols selectCoordCols;
-unsigned selectCaOpts = 0;
-
-/* in file: same three settings.  inFmt holds a recordFmt constant and is
- * passed to createChromAnnReader() as one, so it is declared with the enum
- * type, matching selectFmt. */
-enum recordFmt inFmt = UNKNOWN_FMT;
-struct coordCols inCoordCols;
-unsigned inCaOpts = 0;
-
-unsigned selectOpts = 0;         /* sel* flags (strand, oppositeStrand, excludeSelf, idMatch) */
-boolean useAggregate = FALSE;    /* -aggregate */
-boolean nonOverlapping = FALSE;  /* -nonOverlapping: invert the overlap test */
-boolean mergeOutput = FALSE;     /* -mergeOutput */
-boolean idOutput = FALSE;        /* -idOutput */
-boolean statsOutput = FALSE;     /* any of -statsOutput, -statsOutputAll, -statsOutputBoth */
-boolean outputAll = FALSE;       /* -statsOutputAll or -statsOutputBoth: output every in record */
-boolean outputBoth = FALSE;      /* -statsOutputBoth: also report select records never used */
-/* same values main() uses as defaults for -overlapThreshold,
- * -overlapThresholdCeil, -overlapSimilarity, -overlapSimilarityCeil and
- * -overlapBases */
-struct overlapCriteria criteria = {0.0, 1.1, 0.0, 1.1, -1};
-
-enum recordFmt parseFormatSpec(char *fmt)
-/* Translate a format name (psl, pslq, chain, chainq, genePred or bed) into
- * its recordFmt constant.  Any other name is reported through errAbort. */
-{
-static struct
-    {
-    char *name;
-    enum recordFmt fmt;
-    } known[] = {
-    {"psl", PSL_FMT},
-    {"pslq", PSLQ_FMT},
-    {"chain", CHAIN_FMT},
-    {"chainq", CHAINQ_FMT},
-    {"genePred", GENEPRED_FMT},
-    {"bed", BED_FMT},
-    {NULL, UNKNOWN_FMT}
-};
-int i;
-
-/* names are tried in table order and the first exact match wins */
-for (i = 0; known[i].name != NULL; i++)
-    {
-    if (sameString(fmt, known[i].name))
-        return known[i].fmt;
-    }
-errAbort("invalid file format: %s", fmt);
-return UNKNOWN_FMT;
-}
-
-enum recordFmt getFileFormat(char *path)
-/* Work out the record format from the file name extension.  A trailing
- * .gz, .bz2 or .Z is ignored.  Reports an error through errAbort when the
- * extension is not one of .psl, .chain, .genePred, .gp or .bed. */
-{
-char baseName[PATH_LEN];
-char *name = path;
-enum recordFmt fmt = UNKNOWN_FMT;
-
-if (endsWith(name, ".gz") || endsWith(name, ".bz2") || endsWith(name, ".Z"))
-    {
-    /* look at the name with the compression extension split off */
-    splitPath(path, NULL, baseName, NULL);
-    name = baseName;
-    }
-
-if (endsWith(name, ".psl"))
-    fmt = PSL_FMT;
-else if (endsWith(name, ".chain"))
-    fmt = CHAIN_FMT;
-else if (endsWith(name, ".genePred") || endsWith(name, ".gp"))
-    fmt = GENEPRED_FMT;
-else if (endsWith(name, ".bed"))
-    fmt = BED_FMT;
-else
-    errAbort("can't determine file format of %s", name);
-return fmt;
-}
-
-static struct  chromAnnReader *createChromAnnReader(char *fileName,
-                                                    enum recordFmt fmt,
-                                                    unsigned caOpts,
-                                                    struct coordCols *cols)
-/* Construct a reader for fileName according to fmt.  caOpts is passed through
- * to the format-specific constructor.  The coordCols spec is only used for
- * tab files (COORD_COLS_FMT). */
-{
-switch (fmt)
-    {
-    /* the query-side variants share a reader with the target-side format;
-     * main() marks them by adding chromAnnUseQSide to caOpts */
-    case PSL_FMT:
-    case PSLQ_FMT:
-        return chromAnnPslReaderNew(fileName, caOpts);
-    case CHAIN_FMT:
-    case CHAINQ_FMT:
-        return chromAnnChainReaderNew(fileName, caOpts);
-    case GENEPRED_FMT:
-        return chromAnnGenePredReaderNew(fileName, caOpts);
-    case BED_FMT:
-        return chromAnnBedReaderNew(fileName, caOpts);
-    case COORD_COLS_FMT:
-        return chromAnnTabReaderNew(fileName, cols, caOpts);
-    case UNKNOWN_FMT:
-        break; 
-    }
-/* only reached for UNKNOWN_FMT, which main() is expected to have resolved
- * before calling; NULL is returned if the assert is compiled out */
-assert(FALSE);
-return NULL;
-}
-
-static char *getPrintId(struct chromAnn* ca)
-/* Id to print for a record: its name, or "<unknown>" when it has none */
-{
-if (ca->name != NULL)
-    return ca->name;
-return "<unknown>";
-}
-
-static void outputMerge(struct chromAnn* inCa, FILE *outFh,
-                        struct chromAnnRef *overlappingRecs)
-/* Output for the -mergeOutput option.  For every overlapping select record,
- * write one line holding the in record, a tab, and the select record. */
-{
-struct chromAnnRef *cur = overlappingRecs;
-while (cur != NULL)
-    {
-    struct chromAnn *selCa = cur->ref;
-    inCa->recWrite(inCa, outFh, '\t');
-    selCa->recWrite(selCa, outFh, '\n');
-    cur = cur->next;
-    }
-}
-
-static void outputIds(struct chromAnn* inCa, FILE *outFh,
-                      struct chromAnnRef *overlappingRecs)
-/* Output for the -idOutput option.  Write one "inId<tab>selectId" line for
- * every overlapping select record. */
-{
-struct chromAnnRef *cur = overlappingRecs;
-while (cur != NULL)
-    {
-    fprintf(outFh, "%s\t%s\n", getPrintId(inCa), getPrintId(cur->ref));
-    cur = cur->next;
-    }
-}
-
-/* Row format for per-pair stats output.  The columns match the header that
- * overlapSelect() writes: inId, selectId, inOverlap, selectOverlap,
- * overBases, similarity, inBases, selectBases. */
-static char *statsFmt = "%s\t%s\t%0.3g\t%0.3g\t%d\t%0.3g\t%d\t%d\n";
-
-static void outputStats(struct chromAnn* inCa, FILE *outFh,
-                        struct chromAnnRef *overlappingRecs)
-/* Output for the -statsOutput options.  Write one stats row per overlapping
- * select record, or a single row with an empty select id and zeroed values
- * when nothing overlaps. */
-{
-struct chromAnnRef *cur;
-
-if (overlappingRecs == NULL)
-    {
-    /* reached with -statsOutputAll when the in record overlaps nothing */
-    fprintf(outFh, statsFmt, getPrintId(inCa), "", 0.0, 0.0, 0, 0.0, inCa->totalSize, 0);
-    return;
-    }
-
-for (cur = overlappingRecs; cur != NULL; cur = cur->next)
-    {
-    struct chromAnn *selCa = cur->ref;
-    unsigned numOver = selectOverlapBases(inCa, selCa);
-    fprintf(outFh, statsFmt,
-            getPrintId(inCa),
-            getPrintId(selCa),
-            selectFracOverlap(inCa, numOver),
-            selectFracOverlap(selCa, numOver),
-            numOver,
-            selectFracSimilarity(inCa, selCa, numOver),
-            inCa->totalSize,
-            selCa->totalSize);
-    }
-}
-
-static void outputStatsSelNotUsed(FILE *outFh)
-/* Write a stats row, with an empty in id and zeroed overlap values, for each
- * select table record whose used flag is not set. */
-{
-struct chromAnnMapIter iter = selectTableFirst();
-struct chromAnn *cur;
-for (cur = chromAnnMapIterNext(&iter); cur != NULL; cur = chromAnnMapIterNext(&iter))
-    {
-    if (cur->used)
-        continue;
-    fprintf(outFh, statsFmt, "", getPrintId(cur), 0.0, 0.0, 0, 0.0, 0, cur->totalSize);
-    }
-}
-
-static void doItemOverlap(struct chromAnn* inCa, FILE *outFh, FILE *dropFh)
-/* Test one in record against the select table using the global criteria.
- * A selected record goes to outFh in the form chosen by the output options;
- * a rejected one goes to dropFh when that file is open. */
-{
-/* the overlapping select records are only collected when an output mode
- * needs them; otherwise NULL is passed */
-boolean collect = (mergeOutput || idOutput || statsOutput);
-struct chromAnnRef *hits = NULL;
-
-boolean overlaps = selectIsOverlapped(selectOpts, inCa, &criteria,
-                                      (collect ? &hits : NULL));
-if (collect)
-    slSort(&hits, chromAnnRefLocCmp);
-
-/* -nonOverlapping inverts the test; outputAll selects every record */
-boolean selected = (nonOverlapping) ? !overlaps : overlaps;
-if (selected || outputAll)
-    {
-    if (mergeOutput)
-        outputMerge(inCa, outFh, hits);
-    else if (idOutput)
-        outputIds(inCa, outFh, hits);
-    else if (statsOutput)
-        outputStats(inCa, outFh, hits);
-    else
-        inCa->recWrite(inCa, outFh, '\n');
-    }
-else if (dropFh != NULL)
-    {
-    if (!idOutput)
-        inCa->recWrite(inCa, dropFh, '\n');
-    else
-        fprintf(dropFh, "%s\n", getPrintId(inCa));
-    }
-
-slFreeList(&hits);
-}
-
-static void doItemOverlaps(struct chromAnnReader* inCar, FILE *outFh, FILE *dropFh)
-/* Read in records until the reader returns NULL, running the per-item
- * overlap test on each and freeing it afterwards */
-{
-struct chromAnn *rec;
-for (rec = inCar->caRead(inCar); rec != NULL; rec = inCar->caRead(inCar))
-    {
-    doItemOverlap(rec, outFh, dropFh);
-    chromAnnFree(&rec);
-    }
-}
-
-
-static void doAggregateOverlap(struct chromAnn* inCa, FILE *outFh, FILE *dropFh)
-/* Do the aggregate overlap test for one in record.  A selected record goes
- * to outFh in the form chosen by the output options; a rejected one goes to
- * dropFh when that file is open. */
-{
-struct overlapAggStats stats = selectAggregateOverlap(selectOpts, inCa);
-
-/* with no positive threshold any overlap counts; otherwise the overlap
- * must reach the threshold */
-boolean overlaps = (criteria.threshold <= 0.0)
-    ? (stats.inOverlap > 0.0)
-    : (stats.inOverlap >= criteria.threshold);
-boolean selected = (nonOverlapping) ? !overlaps : overlaps;
-
-if (selected || outputAll)
-    {
-    if (idOutput)
-        fprintf(outFh, "%s\n", getPrintId(inCa));
-    else if (statsOutput)
-        fprintf(outFh, "%s\t%0.3g\t%d\t%d\n", getPrintId(inCa),
-                stats.inOverlap, stats.inOverBases, stats.inBases);
-    else
-        inCa->recWrite(inCa, outFh, '\n');
-    }
-else if (dropFh != NULL)
-    {
-    if (!idOutput)
-        inCa->recWrite(inCa, dropFh, '\n');
-    else
-        fprintf(dropFh, "%s\n", getPrintId(inCa));
-    }
-}
-
-static void doAggregateOverlaps(struct chromAnnReader* inCar, FILE *outFh, FILE *dropFh)
-/* Read in records until the reader returns NULL, running the aggregate
- * overlap test on each and freeing it afterwards */
-{
-for (;;)
-    {
-    struct chromAnn *rec = inCar->caRead(inCar);
-    if (rec == NULL)
-        break;
-    doAggregateOverlap(rec, outFh, dropFh);
-    chromAnnFree(&rec);
-    }
-}
-
-void loadSelectTable(char *selectFile)
-/* Fill the select table from selectFile, read with the select format and
- * options parsed from the command line; the reader is freed when done */
-{
-struct chromAnnReader *reader;
-
-reader = createChromAnnReader(selectFile, selectFmt, selectCaOpts, &selectCoordCols);
-selectTableAddRecords(reader);
-reader->carFree(&reader);
-}
-
-void overlapSelect(char *selectFile, char *inFile, char *outFile, char *dropFile)
-/* Select records of inFile based on overlap of chromosome ranges with the
- * records of selectFile.  Results go to outFile; records not selected are
- * also written to dropFile when it is not NULL. */
-{
-/* the in reader is created before the select table is loaded */
-struct chromAnnReader *inCar
-    = createChromAnnReader(inFile, inFmt, inCaOpts, &inCoordCols);
-loadSelectTable(selectFile);
-
-FILE *outFh = mustOpen(outFile, "w");
-FILE *dropFh = (dropFile != NULL) ? mustOpen(dropFile, "w") : NULL;
-
-/* the id and stats output modes begin with a '#' column header line */
-if (idOutput)
-    fputs(useAggregate
-          ? "#inId\n"
-          : "#inId\t" "selectId\n", outFh);
-if (statsOutput)
-    fputs(useAggregate
-          ? "#inId\t" "inOverlap\t" "inOverBases\t" "inBases\n"
-          : "#inId\t" "selectId\t" "inOverlap\t" "selectOverlap\t" "overBases\t" "similarity\t" "inBases\t" "selectBases\n", outFh);
-
-if (!useAggregate)
-    doItemOverlaps(inCar, outFh, dropFh);
-else
-    doAggregateOverlaps(inCar, outFh, dropFh);
-inCar->carFree(&inCar);
-
-/* -statsOutputBoth also reports select records that were never used */
-if (statsOutput && outputBoth)
-    outputStatsSelNotUsed(outFh);
-
-carefulClose(&outFh);
-carefulClose(&dropFh);
-/* enable for memory analysis */
-#if 0
-selectTableFree();
-#endif
-}
-
-void usage(char *msg)
-/* Abort through errAbort, printing msg followed by the full usage text */
-{
-/* usage.msg is included in place of the initializer expression, so it must
- * expand to something usable as a char* initializer -- presumably a string
- * literal produced by the build (TODO confirm) */
-static char *usageMsg =
-#include "usage.msg"
-    ;
-errAbort("%s:  %s", msg, usageMsg);
-}
-
-/* Entry point.  Expects three positional arguments after option parsing:
- * selectFile, inFile and outFile.  Parses and cross-checks the options into
- * the file-level globals, then runs overlapSelect().  Invalid usage is
- * reported through usage()/errAbort(); returns 0 otherwise. */
-int main(int argc, char** argv)
-{
-char *selectFile, *inFile, *outFile, *dropFile;
-optionInit(&argc, argv, optionSpecs);
-if (argc != 4)
-    usage("wrong # args");
-selectFile = argv[1];
-inFile = argv[2];
-outFile = argv[3];
-
-/* select file options: an explicit format, explicit coordinate columns,
- * or else a format guessed from the file name */
-if (optionExists("selectFmt") && optionExists("selectCoordCols"))
-    errAbort("can't specify both -selectFmt and -selectCoordCols");
-
-if (optionExists("selectFmt"))
-    selectFmt = parseFormatSpec(optionVal("selectFmt", NULL));
-else if (optionExists("selectCoordCols"))
-    {
-    selectCoordCols = coordColsParseSpec("selectCoordCols",
-                                         optionVal("selectCoordCols", NULL));
-    selectFmt = COORD_COLS_FMT;
-    }
-else
-    selectFmt = getFileFormat(selectFile);
-
-if (optionExists("selectCds"))
-    selectCaOpts |= chromAnnCds;
-if (optionExists("selectRange"))
-    selectCaOpts |= chromAnnRange;
-if ((selectFmt == PSLQ_FMT) || (selectFmt == CHAINQ_FMT))
-    selectCaOpts |= chromAnnUseQSide;
-
-/* in file options, resolved the same way as the select file options */
-if (optionExists("inFmt") && optionExists("inCoordCols"))
-    errAbort("can't specify both -inFmt and -inCoordCols");
-if (optionExists("inFmt"))
-    inFmt = parseFormatSpec(optionVal("inFmt", NULL));
-else if (optionExists("inCoordCols"))
-    {
-    inCoordCols = coordColsParseSpec("inCoordCols",
-                                     optionVal("inCoordCols", NULL));
-    inFmt = COORD_COLS_FMT;
-    }
-else
-    inFmt = getFileFormat(inFile);
-
-inCaOpts = chromAnnSaveLines; // need lines for output
-if (optionExists("inCds"))
-    inCaOpts |= chromAnnCds;
-if (optionExists("inRange"))
-    inCaOpts |= chromAnnRange;
-if ((inFmt == PSLQ_FMT) || (inFmt == CHAINQ_FMT))
-    inCaOpts |= chromAnnUseQSide;
-
-/* select options */
-useAggregate = optionExists("aggregate");
-nonOverlapping = optionExists("nonOverlapping");
-if (optionExists("strand") && optionExists("oppositeStrand"))
-    errAbort("can only specify one of -strand and -oppositeStrand");
-if (optionExists("strand"))
-    selectOpts |= selStrand;
-if (optionExists("oppositeStrand"))
-    selectOpts |= selOppositeStrand;
-if (optionExists("excludeSelf") && (optionExists("idMatch")))
-    errAbort("can't specify both -excludeSelf and -idMatch");
-if (optionExists("excludeSelf"))
-    selectOpts |= selExcludeSelf;
-if (optionExists("idMatch"))
-    selectOpts |= selIdMatch;
-
-/* overlap criteria; the defaults match the initializer of criteria */
-criteria.threshold = optionFloat("overlapThreshold", 0.0);
-criteria.thresholdCeil = optionFloat("overlapThresholdCeil", 1.1);
-criteria.similarity = optionFloat("overlapSimilarity", 0.0);
-criteria.similarityCeil = optionFloat("overlapSimilarityCeil", 1.1);
-criteria.bases = optionInt("overlapBases", -1);
-
-/* output options: at most one of merge, id or stats output */
-mergeOutput = optionExists("mergeOutput");
-idOutput = optionExists("idOutput");
-statsOutput = optionExists("statsOutput") || optionExists("statsOutputAll") || optionExists("statsOutputBoth");
-if ((mergeOutput + idOutput + statsOutput) > 1)
-    errAbort("can only specify one of -mergeOutput, -idOutput, -statsOutput, -statsOutputAll, or -statsOutputBoth");
-outputAll = optionExists("statsOutputAll");
-outputBoth = optionExists("statsOutputBoth");
-if (outputBoth)
-    outputAll = TRUE;
-if (mergeOutput)
-    {
-    if (nonOverlapping)
-        errAbort("can't use -mergeOutput with -nonOverlapping");
-    if (useAggregate)
-        errAbort("can't use -mergeOutput with -aggregate");
-    /* This test used to be followed by a second, stray "if (useAggregate)",
-     * which made the error unreachable because -aggregate has already been
-     * rejected just above. */
-    if ((selectFmt == CHAIN_FMT) || (selectFmt == CHAINQ_FMT)
-        || (inFmt == CHAIN_FMT) || (inFmt == CHAINQ_FMT))
-        errAbort("can't use -mergeOutput with chains");
-    /* merged output writes the select records too, so keep their lines */
-    selectCaOpts |= chromAnnSaveLines;
-    }
-dropFile = optionVal("dropped", NULL);
-
-/* check for options incompatible with aggregate mode */
-if (useAggregate)
-    {
-    int i;
-    for (i = 0; aggIncompatible[i] != NULL; i++)
-        {
-        if (optionExists(aggIncompatible[i]))
-            errAbort("-%s is not allowed with -aggregate", aggIncompatible[i]);
-        }
-    }
-
-overlapSelect(selectFile, inFile, outFile, dropFile);
-return 0;
-}