be5f3b3d7f1653df940b1937a5caffe21b3cf24b
hiram
  Fri Jun 24 17:04:09 2016 -0700
now using new trf v4.09 for simple repeats refs #17580

diff --git src/hg/trfBig/trfBig.c src/hg/trfBig/trfBig.c
index 894c8a3..3ff49af 100644
--- src/hg/trfBig/trfBig.c
+++ src/hg/trfBig/trfBig.c
@@ -5,61 +5,69 @@
 #include "common.h"
 #include "linefile.h"
 #include "fa.h"
 #include "nib.h"
 #include "portable.h"
 #include "options.h"
 #include "verbose.h"
 
 
 /* Variables that can be set from command line. */
 char *trfExe = "trf";	/* trf executable name. */
 boolean doBed = FALSE;	/* Output .bed file. */
 char *tempDir = ".";	/* By default use current dir. */
 int maxPeriod = 2000;    /* Maximum size of repeat. */
 bool keep = FALSE;       /* Don't delete tmp files */
+int trf409_l = 0;	/* Value for the trf v4.09 -l option; from the trf usage message:
+-l <n> maximum TR length expected (in millions) (eg, -l 3 or -l=3 for 3 million)
+                  Human genome HG38 would need -l 6
+   A value <= 0 (default 0) means -l is not added to the trf command line. */
 
 /* command line option specifications */
 static struct optionSpec optionSpecs[] =
 {
     {"bed", OPTION_BOOLEAN},
     {"bedAt", OPTION_STRING},
     {"tempDir", OPTION_STRING},
     {"trf", OPTION_STRING},
     {"maxPeriod", OPTION_INT},
     {"keep", OPTION_BOOLEAN},
+    {"l", OPTION_INT},
     {NULL, 0}
 };
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "trfBig - Mask tandem repeats on a big sequence file.\n"
   "usage:\n"
   "   trfBig inFile outFile\n"
   "This will repeatedly run trf to mask tandem repeats in infile\n"
   "and put masked results in outFile.  inFile and outFile can be .fa\n"
   "or .nib format. Outfile can be .bed as well. Sequence output is hard\n"
   "masked, lowercase.\n"
   "\n"
   "   -bed creates a bed file in current dir\n"
   "   -bedAt=path.bed - create a bed file at explicit location\n"
   "   -tempDir=dir Where to put temp files.\n"
   "   -trf=trfExe explicitly specifies trf executable name\n"
   "   -maxPeriod=N  Maximum period size of repeat (default %d)\n"
-  "   -keep  don't delete tmp files\n",
+  "   -keep  don't delete tmp files\n"
+  "   -l=<n> when used here, for new trf v4.09 option:\n"
+  "          maximum TR length expected (in millions)\n"
+  "          (eg, -l=3 for 3 million), Human genome hg38 would need -l=6",
   maxPeriod);
 }
 
 void writeSomeDatToBed(char *inName, FILE *out, char *chromName, int chromOffset,
 	int start, int end)
 /* Read dat file and write bits of it to .bed out file adding offset as necessary. */
 {
 struct lineFile *lf = lineFileOpen(inName, TRUE);
 char *line;
 int lineSize;
 char *row[14];
 boolean gotHead = FALSE;
 int s, e, i;
 
 while (lineFileNext(lf, &line, &lineSize))
@@ -111,30 +119,34 @@
 
 void makeTrfRootName(char trfRootName[512], char *faFile)
 /* Make root name of files trf produces from faFile. */
 {
 sprintf(trfRootName, "%s.2.7.7.80.10.50.%d", faFile, maxPeriod);
 }
 
 void trfSysCall(char *faFile)
 /* Invoke trf program on file. */
 {
 // need to execute in trf directory, as tmp files go to current directory
 char faBase[FILENAME_LEN], faExt[FILENAME_LEN];
 splitPath(faFile, NULL, faBase, faExt);
 
 char command[1024];
+if (trf409_l > 0)
+  safef(command, sizeof(command), "cd %s && %s %s%s 2 7 7 80 10 50 %d -m %s -l %d",
+      tempDir, trfExe, faBase, faExt, maxPeriod, doBed ? "-d" : "", trf409_l);
+else
   safef(command, sizeof(command), "cd %s && %s %s%s 2 7 7 80 10 50 %d -m %s",
       tempDir, trfExe, faBase, faExt, maxPeriod, doBed ? "-d" : "");
 verbose(1, "command %s\n", command);
 fflush(stdout);
 fflush(stderr);
 
 /* Run the system command, expecting a return code of 1, as trf
    returns the number of successfully processed sequences. */
 int status = system(command);
 if (status == -1)
     errnoAbort("error starting command: %s", command);
 else if (WIFSIGNALED(status))
     errAbort("command terminated by signal %d: %s", WTERMSIG(status), command);
 else if (WIFEXITED(status))
     {
@@ -269,23 +281,27 @@
     errAbort("Sorry, both input and output must be in same format.");
     }
 if (!keep)
     {
     sprintf(trfTemp, "%s*", tempFile);
     removeWild(trfTemp);
     }
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, optionSpecs);
 if (argc != 3)
     usage();
+trf409_l = optionInt("l", trf409_l);
+/* -trf names the executable for every trf version, with or without -l.
+ * Do not look up "trf.4.09" here: it is not declared in optionSpecs, so
+ * doing so silently ignored a user-supplied -trf whenever -l was given. */
    trfExe = optionVal("trf", trfExe);
 doBed = optionExists("bed") || optionExists("bedAt");
 tempDir = optionVal("tempDir", tempDir);
 maxPeriod = optionInt("maxPeriod", maxPeriod);
 keep = optionExists("keep");
 trfBig(argv[1], argv[2]);
 return 0;
 }