src/hg/overlapSelect/overlapSelect.c 1.42

1.42 2009/07/31 18:09:23 markd
added rcsids
Index: src/hg/overlapSelect/overlapSelect.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/overlapSelect/overlapSelect.c,v
retrieving revision 1.41
retrieving revision 1.42
diff -b -B -U 1000000 -r1.41 -r1.42
--- src/hg/overlapSelect/overlapSelect.c	22 Jul 2009 07:01:29 -0000	1.41
+++ src/hg/overlapSelect/overlapSelect.c	31 Jul 2009 18:09:23 -0000	1.42
@@ -1,480 +1,482 @@
 /* overlapSelect - select records based on overlap of chromosome ranges */
 
 #include "common.h"
 #include "selectTable.h"
 #include "coordCols.h"
 #include "chromAnn.h"
 #include "dystring.h"
 #include "options.h"
 
+static char const rcsid[] = "$Id$";
+
 /* FIXME:
  * - would be nice to be able to specify ranges in the same manner
  *   as featureBits
  * - should keep header lines in files
  * - don't need to save if infile records if stats output
  */
 
 static struct optionSpec optionSpecs[] = {
     {"selectFmt", OPTION_STRING},
     {"selectCoordCols", OPTION_STRING},
     {"selectCds", OPTION_BOOLEAN},
     {"selectRange", OPTION_BOOLEAN},
     {"inFmt", OPTION_STRING},
     {"inCoordCols", OPTION_STRING},
     {"inCds", OPTION_BOOLEAN},
     {"inRange", OPTION_BOOLEAN},
     {"nonOverlapping", OPTION_BOOLEAN},
     {"strand", OPTION_BOOLEAN},
     {"oppositeStrand", OPTION_BOOLEAN},
     {"excludeSelf", OPTION_BOOLEAN},
     {"idMatch", OPTION_BOOLEAN},
     {"dropped", OPTION_STRING},
     {"overlapThreshold", OPTION_FLOAT},
     {"overlapThresholdCeil", OPTION_FLOAT},
     {"overlapSimilarity", OPTION_FLOAT},
     {"overlapSimilarityCeil", OPTION_FLOAT},
     {"overlapBases", OPTION_INT},
     {"merge", OPTION_BOOLEAN},
     {"mergeOutput", OPTION_BOOLEAN},
     {"statsOutput", OPTION_BOOLEAN},
     {"statsOutputAll", OPTION_BOOLEAN},
     {"statsOutputBoth", OPTION_BOOLEAN},
     {"idOutput", OPTION_BOOLEAN},
     {"aggregate", OPTION_BOOLEAN},
     {NULL, 0}
 };
 
 /* incompatible with aggregate */
 static char *aggIncompatible[] =
 {
     "overlapSimilarity", "overlapSimilarityCeil", "overlapThresholdCeil", "overlapBases", "merge", "mergeOutput", "idMatch", NULL
 };
 
 /* file format constants */
 enum recordFmt {
     UNKNOWN_FMT,
     PSL_FMT,
     PSLQ_FMT,
     CHAIN_FMT,
     CHAINQ_FMT,
     GENEPRED_FMT,
     BED_FMT,
     COORD_COLS_FMT
 };
 
 /* Options parsed from the command line */
 enum recordFmt selectFmt = UNKNOWN_FMT;
 struct coordCols selectCoordCols;
 unsigned selectCaOpts = 0;
 
 unsigned inFmt = UNKNOWN_FMT;
 struct coordCols inCoordCols;
 unsigned inCaOpts = 0;
 
 unsigned selectOpts = 0;
 boolean useAggregate = FALSE;
 boolean nonOverlapping = FALSE;
 boolean mergeOutput = FALSE;
 boolean idOutput = FALSE;
 boolean statsOutput = FALSE;
 boolean outputAll = FALSE;
 boolean outputBoth = FALSE;
 struct overlapCriteria criteria = {0.0, 1.1, 0.0, 1.1, -1};
 
 enum recordFmt parseFormatSpec(char *fmt)
 /* parse a format specification */
 {
 if (sameString(fmt, "psl"))
     return PSL_FMT;
 if (sameString(fmt, "pslq"))
     return PSLQ_FMT;
 if (sameString(fmt, "chain"))
     return CHAIN_FMT;
 if (sameString(fmt, "chainq"))
     return CHAINQ_FMT;
 if (sameString(fmt, "genePred"))
     return GENEPRED_FMT;
 if (sameString(fmt, "bed"))
     return BED_FMT;
 errAbort("invalid file format: %s", fmt);
 return UNKNOWN_FMT;
 }
 
 enum recordFmt getFileFormat(char *path)
 /* determine the file format from the specified file extension */
 {
 char *filePath = path;
 char filePathBuf[PATH_LEN];
 
 if (endsWith(filePath, ".gz") || endsWith(filePath, ".bz2") || endsWith(filePath, ".Z"))
     {
     /* strip .gz/.bz2/.Z extension */
     splitPath(path, NULL, filePathBuf, NULL);
     filePath = filePathBuf;
     }
 if (endsWith(filePath, ".psl"))
     return PSL_FMT;
 if (endsWith(filePath, ".chain"))
     return CHAIN_FMT;
 if (endsWith(filePath, ".genePred") || endsWith(filePath, ".gp"))
     return GENEPRED_FMT;
 if (endsWith(filePath, ".bed"))
     return BED_FMT;
 errAbort("can't determine file format of %s", filePath);
 return UNKNOWN_FMT;
 }
 
 static struct  chromAnnReader *createChromAnnReader(char *fileName,
                                                     enum recordFmt fmt,
                                                     unsigned caOpts,
                                                     struct coordCols *cols)
 /* construct a reader.  The coordCols spec is only used for tab files */
 {
 switch (fmt)
     {
     case PSL_FMT:
     case PSLQ_FMT:
         return chromAnnPslReaderNew(fileName, caOpts);
     case CHAIN_FMT:
     case CHAINQ_FMT:
         return chromAnnChainReaderNew(fileName, caOpts);
     case GENEPRED_FMT:
         return chromAnnGenePredReaderNew(fileName, caOpts);
     case BED_FMT:
         return chromAnnBedReaderNew(fileName, caOpts);
     case COORD_COLS_FMT:
         return chromAnnTabReaderNew(fileName, cols, caOpts);
     case UNKNOWN_FMT:
         break; 
     }
 assert(FALSE);
 return NULL;
 }
 
 static char *getPrintId(struct chromAnn* ca)
 /* get id for output, or <unknown> if not known */
 {
 return (ca->name == NULL) ? "<unknown>" : ca->name;
 }
 
 static void outputMerge(struct chromAnn* inCa, FILE *outFh,
                         struct chromAnnRef *overlappingRecs)
 /* output for the -mergeOutput option; pairs of inRec and overlap */
 {
 struct chromAnnRef *selectCaRef;
 for (selectCaRef = overlappingRecs; selectCaRef != NULL; selectCaRef = selectCaRef->next)
     {
     struct chromAnn *selectCa = selectCaRef->ref;
     inCa->recWrite(inCa, outFh, '\t');
     selectCa->recWrite(selectCa, outFh, '\n');
     }
 }
 
 static void outputIds(struct chromAnn* inCa, FILE *outFh,
                       struct chromAnnRef *overlappingRecs)
 /* output for the -idOutput option; pairs of inRec and overlap ids */
 {
 struct chromAnnRef *selectCaRef;
 for (selectCaRef = overlappingRecs; selectCaRef != NULL; selectCaRef = selectCaRef->next)
     {
     struct chromAnn *selectCa = selectCaRef->ref;
     fprintf(outFh, "%s\t%s\n", getPrintId(inCa), getPrintId(selectCa));
     }
 }
 
 /* format string for stats output */
 static char *statsFmt = "%s\t%s\t%0.3g\t%0.3g\t%d\t%0.3g\t%d\t%d\n";
 
 static void outputStats(struct chromAnn* inCa, FILE *outFh,
                         struct chromAnnRef *overlappingRecs)
 /* output for the -statOutput option; pairs of inRec and overlap ids */
 {
 if (overlappingRecs == NULL)
     {
     // -statsOutputAll and nothing overlapping
     fprintf(outFh, statsFmt, getPrintId(inCa), "", 0.0, 0.0, 0, 0.0, inCa->totalSize, 0);
     }
 struct chromAnnRef *selectCaRef;
 for (selectCaRef = overlappingRecs; selectCaRef != NULL; selectCaRef = selectCaRef->next)
     {
     struct chromAnn *selectCa = selectCaRef->ref;
     unsigned overBases = selectOverlapBases(inCa, selectCa);
     fprintf(outFh, statsFmt, getPrintId(inCa), getPrintId(selectCa),
             selectFracOverlap(inCa, overBases), selectFracOverlap(selectCa, overBases), overBases,
             selectFracSimilarity(inCa, selectCa, overBases),
             inCa->totalSize,  selectCa->totalSize);
     }
 }
 
 static void outputStatsSelNotUsed(FILE *outFh)
 /* output stats for select chromAnns that were not used */
 {
 struct chromAnnMapIter iter = selectTableFirst();
 struct chromAnn *selCa;
 while ((selCa = chromAnnMapIterNext(&iter)) != NULL)
     {
     if (!selCa->used)
         fprintf(outFh, statsFmt, "", getPrintId(selCa), 0.0, 0.0, 0, 0.0, 0, selCa->totalSize);
     }
 }
 
 static void doItemOverlap(struct chromAnn* inCa, FILE *outFh, FILE *dropFh)
 /* Do individual item overlap process of chromAnn object given the criteria,
  * and if so output */
 {
 struct chromAnnRef *overlappingRecs = NULL;
 struct chromAnnRef **overlappingRecsPtr = NULL;  /* used to indicate if recs should be collected */
 if (mergeOutput || idOutput || statsOutput)
     overlappingRecsPtr = &overlappingRecs;
 
 boolean overlaps = selectIsOverlapped(selectOpts, inCa, &criteria, overlappingRecsPtr);
 if (overlappingRecsPtr != NULL)
     slSort(overlappingRecsPtr, chromAnnRefLocCmp);
 if (((nonOverlapping) ? !overlaps : overlaps) || outputAll)
     {
     if (mergeOutput)
         outputMerge(inCa, outFh, overlappingRecs);
     else if (idOutput)
         outputIds(inCa, outFh, overlappingRecs);
     else if (statsOutput)
         outputStats(inCa, outFh, overlappingRecs);
     else
         inCa->recWrite(inCa, outFh, '\n');
     }
 else if (dropFh != NULL)
     {
     if (idOutput)
         fprintf(dropFh, "%s\n", getPrintId(inCa));
     else
         inCa->recWrite(inCa, dropFh, '\n');
     }
 
 slFreeList(&overlappingRecs);
 }
 
 static void doItemOverlaps(struct chromAnnReader* inCar, FILE *outFh, FILE *dropFh)
 /* Do individual item overlap processings */
 {
 struct chromAnn *inCa;
 while ((inCa = inCar->caRead(inCar)) != NULL)
     {
     doItemOverlap(inCa, outFh, dropFh);
     chromAnnFree(&inCa);
     }
 }
 
 
 static void doAggregateOverlap(struct chromAnn* inCa, FILE *outFh, FILE *dropFh)
 /* Do aggreate overlap process of chromAnn object given the criteria,
  * and if so output */
 {
 struct overlapAggStats stats = selectAggregateOverlap(selectOpts, inCa);
 boolean overlaps;
 if (criteria.threshold <= 0.0)
     overlaps = (stats.inOverlap > 0.0); /* any overlap */
 else
     overlaps = (stats.inOverlap >= criteria.threshold);
 if (((nonOverlapping) ? !overlaps : overlaps) || outputAll)
     {
     if (idOutput)
         fprintf(outFh, "%s\n", getPrintId(inCa));
     else if (statsOutput)
         fprintf(outFh, "%s\t%0.3g\t%d\t%d\n", getPrintId(inCa),
                 stats.inOverlap, stats.inOverBases, stats.inBases);
     else
         inCa->recWrite(inCa, outFh, '\n');
     }
 else if (dropFh != NULL)
     {
     if (idOutput)
         fprintf(dropFh, "%s\n", getPrintId(inCa));
     else
         inCa->recWrite(inCa, dropFh, '\n');
     }
 }
 
 static void doAggregateOverlaps(struct chromAnnReader* inCar, FILE *outFh, FILE *dropFh)
 /* Do aggreate overlap processing */
 {
 struct chromAnn *inCa;
 while ((inCa = inCar->caRead(inCar)) != NULL)
     {
     doAggregateOverlap(inCa, outFh, dropFh);
     chromAnnFree(&inCa);
     }
 }
 
 void loadSelectTable(char *selectFile)
 /* load the select table from a file */
 {
 struct chromAnnReader *selCar = createChromAnnReader(selectFile, selectFmt, selectCaOpts, &selectCoordCols);
 selectTableAddRecords(selCar);
 selCar->carFree(&selCar);
 }
 
 void overlapSelect(char *selectFile, char *inFile, char *outFile, char *dropFile)
 /* select records based on overlap of chromosome ranges */
 {
 struct chromAnnReader *inCar
     = createChromAnnReader(inFile, inFmt, inCaOpts, &inCoordCols);
 loadSelectTable(selectFile);
 FILE *outFh = mustOpen(outFile, "w");
 FILE *dropFh = NULL;
 if (dropFile != NULL)
     dropFh = mustOpen(dropFile, "w");
 if (idOutput)
     {
     if (useAggregate)
         fputs("#inId\n", outFh);
     else
         fputs("#inId\t" "selectId\n", outFh);
     }
 if (statsOutput)
     {
     if (useAggregate)
         fputs("#inId\t" "inOverlap\t" "inOverBases\t" "inBases\n", outFh);
     else
         fputs("#inId\t" "selectId\t" "inOverlap\t" "selectOverlap\t" "overBases\t" "similarity\t" "inBases\t" "selectBases\n", outFh);
     }
 
 if (useAggregate)
     doAggregateOverlaps(inCar, outFh, dropFh);
 else
     doItemOverlaps(inCar, outFh, dropFh);
 
 inCar->carFree(&inCar);
 if (statsOutput && outputBoth)
     outputStatsSelNotUsed(outFh);
 
 carefulClose(&outFh);
 carefulClose(&dropFh);
 /* enable for memory analysis */
 #if 0
 selectTableFree();
 #endif
 }
 
 void usage(char *msg)
 /* usage message and abort */
 {
 static char *usageMsg =
 #include "usage.msg"
     ;
 errAbort("%s:  %s", msg, usageMsg);
 }
 
 /* entry */
 int main(int argc, char** argv)
 {
 char *selectFile, *inFile, *outFile, *dropFile;
 optionInit(&argc, argv, optionSpecs);
 if (argc != 4)
     usage("wrong # args");
 selectFile = argv[1];
 inFile = argv[2];
 outFile = argv[3];
 
 /* select file options */
 if (optionExists("selectFmt") && optionExists("selectCoordCols"))
     errAbort("can't specify both -selectFmt and -selectCoordCols");
 
 if (optionExists("selectFmt"))
     selectFmt = parseFormatSpec(optionVal("selectFmt", NULL));
 else if (optionExists("selectCoordCols"))
     {
     selectCoordCols = coordColsParseSpec("selectCoordCols",
                                          optionVal("selectCoordCols", NULL));
     selectFmt = COORD_COLS_FMT;
     }
 else
     selectFmt = getFileFormat(selectFile);
 
 if (optionExists("selectCds"))
     selectCaOpts |= chromAnnCds;
 if (optionExists("selectRange"))
     selectCaOpts |= chromAnnRange;
 if ((selectFmt == PSLQ_FMT) || (selectFmt == CHAINQ_FMT))
     selectCaOpts |= chromAnnUseQSide;
 
 /* in file options */
 if (optionExists("inFmt") && optionExists("inCoordCols"))
     errAbort("can't specify both -inFmt and -inCoordCols");
 if (optionExists("inFmt"))
     inFmt = parseFormatSpec(optionVal("inFmt", NULL));
 else if (optionExists("inCoordCols"))
     {
     inCoordCols = coordColsParseSpec("inCoordCols",
                                      optionVal("inCoordCols", NULL));
     inFmt = COORD_COLS_FMT;
     }
 else
     inFmt = getFileFormat(inFile);
 
 inCaOpts = chromAnnSaveLines; // need lines for output
 if (optionExists("inCds"))
     inCaOpts |= chromAnnCds;
 if (optionExists("inRange"))
     inCaOpts |= chromAnnRange;
 if ((inFmt == PSLQ_FMT) || (inFmt == CHAINQ_FMT))
     inCaOpts |= chromAnnUseQSide;
 
 /* select options */
 useAggregate = optionExists("aggregate");
 nonOverlapping = optionExists("nonOverlapping");
 if (optionExists("strand") && optionExists("oppositeStrand"))
     errAbort("can only specify one of -strand and -oppositeStrand");
 if (optionExists("strand"))
     selectOpts |= selStrand;
 if (optionExists("oppositeStrand"))
     selectOpts |= selOppositeStrand;
 if (optionExists("excludeSelf") && (optionExists("idMatch")))
     errAbort("can't specify both -excludeSelf and -idMatch");
 if (optionExists("excludeSelf"))
     selectOpts |= selExcludeSelf;
 if (optionExists("idMatch"))
     selectOpts |= selIdMatch;
 
 criteria.threshold = optionFloat("overlapThreshold", 0.0);
 criteria.thresholdCeil = optionFloat("overlapThresholdCeil", 1.1);
 criteria.similarity = optionFloat("overlapSimilarity", 0.0);
 criteria.similarityCeil = optionFloat("overlapSimilarityCeil", 1.1);
 criteria.bases = optionInt("overlapBases", -1);
 
 /* output options */
 mergeOutput = optionExists("mergeOutput");
 idOutput = optionExists("idOutput");
 statsOutput = optionExists("statsOutput") || optionExists("statsOutputAll") || optionExists("statsOutputBoth");
 if ((mergeOutput + idOutput + statsOutput) > 1)
     errAbort("can only specify one of -mergeOutput, -idOutput, -statsOutput, -statsOutputAll, or -statsOutputBoth");
 outputAll = optionExists("statsOutputAll");
 outputBoth = optionExists("statsOutputBoth");
 if (outputBoth)
     outputAll = TRUE;
 if (mergeOutput)
     {
     if (nonOverlapping)
         errAbort("can't use -mergeOutput with -nonOverlapping");
     if (useAggregate)
         errAbort("can't use -mergeOutput with -aggregate");
     if ((selectFmt == CHAIN_FMT) || (selectFmt == CHAINQ_FMT)
         || (inFmt == CHAIN_FMT) || (inFmt == CHAINQ_FMT))
     if (useAggregate)
         errAbort("can't use -mergeOutput with chains");
     selectCaOpts |= chromAnnSaveLines;
     }
 dropFile = optionVal("dropped", NULL);
 
 /* check for options incompatible with aggregate mode */
 if (useAggregate)
     {
     int i;
     for (i = 0; aggIncompatible[i] != NULL; i++)
         {
         if (optionExists(aggIncompatible[i]))
             errAbort("-%s is not allowed -aggregate", aggIncompatible[i]);
         }
     }
 
 overlapSelect(selectFile, inFile, outFile, dropFile);
 return 0;
 }