5c8526806b8283911229cd3e53dafbd001a1f68b markd Tue Feb 1 23:11:59 2011 -0800 Fixed bug with tempDir option not working. Update to current option parsing. Made help more informative diff --git src/hg/trfBig/trfBig.c src/hg/trfBig/trfBig.c index e42a627..5fc5169 100644 --- src/hg/trfBig/trfBig.c +++ src/hg/trfBig/trfBig.c @@ -1,263 +1,289 @@ /* trfBig - Mask tandem repeats on a big sequence file.. */ #include "common.h" #include "linefile.h" #include "fa.h" #include "nib.h" #include "portable.h" -#include "cheapcgi.h" +#include "options.h" +#include "verbose.h" static char const rcsid[] = "$Id: trfBig.c,v 1.20 2009/12/24 05:10:49 markd Exp $"; /* Variables that can be set from command line. */ char *trfExe = "trf"; /* trf executable name. */ boolean doBed = FALSE; /* Output .bed file. */ char *tempDir = "."; /* By default use current dir. */ int maxPeriod = 2000; /* Maximum size of repeat. */ +bool keep = FALSE; /* Don't delete tmp files */ + +/* command line option specifications */ +static struct optionSpec optionSpecs[] = +{ + {"bed", OPTION_BOOLEAN}, + {"bedAt", OPTION_STRING}, + {"tempDir", OPTION_STRING}, + {"trf", OPTION_STRING}, + {"maxPeriod", OPTION_INT}, + {"keep", OPTION_BOOLEAN}, + {NULL, 0} +}; void usage() /* Explain usage and exit. */ { errAbort( "trfBig - Mask tandem repeats on a big sequence file.\n" "usage:\n" " trfBig inFile outFile\n" "This will repeatedly run trf to mask tandem repeats in infile\n" "and put masked results in outFile. inFile and outFile can be .fa\n" - "or .nib format. Outfile can be .bed as well\n" + "or .nib format. Outfile can be .bed as well. Sequence output is hard\n" + "masked, lowercase.\n" "\n" " -bed creates a bed file in current dir\n" " -bedAt=path.bed - create a bed file at explicit location\n" " -tempDir=dir Where to put temp files.\n" " -trf=trfExe explicitly specifies trf executable name\n" - " -maxPeriod=N Maximum period size of repeat (default %d)\n", + " -maxPeriod=N Maximum period size of repeat (default %d)\n" + " -keep don't delete tmp files\n", maxPeriod); } void writeSomeDatToBed(char *inName, FILE *out, char *chromName, int chromOffset, int start, int end) /* Read dat file and write bits of it to .bed out file adding offset as necessary. */ { struct lineFile *lf = lineFileOpen(inName, TRUE); char *line; int lineSize; char *row[14]; boolean gotHead = FALSE; int s, e, i; while (lineFileNext(lf, &line, &lineSize)) { if (startsWith("Parameters:", line)) { gotHead = TRUE; break; } } if (!gotHead) errAbort("%s isn't a recognized trf .dat file\n", inName); while(lineFileRow(lf, row)) { s = atoi(row[0])-1; e = atoi(row[1]); if (s < start) s = start; if (e > end) e = end; if (s < e) { fprintf(out, "%s\t%d\t%d\ttrf", chromName, s+chromOffset, e+chromOffset); for (i=2; i<14; ++i) fprintf(out, "\t%s", row[i]); fprintf(out, "\n"); } } lineFileClose(&lf); } void removeWild(char *pat) /* Remove all files matching wildcard. */ { char dir[256], fn[128], ext[64]; char wild[256]; struct fileInfo *list, *el; splitPath(pat, dir, fn, ext); sprintf(wild, "%s%s", fn, ext); if (dir[0] == 0) strcpy(dir, "."); list = listDirX(tempDir, wild, TRUE); for (el = list; el != NULL; el = el->next) { remove(el->name); - uglyf("Removed %s\n", el->name); + verbose(1, "Removed %s\n", el->name); } slFreeList(&list); } void makeTrfRootName(char trfRootName[512], char *faFile) /* Make root name of files trf produces from faFile. */ { sprintf(trfRootName, "%s.2.7.7.80.10.50.%d", faFile, maxPeriod); } void trfSysCall(char *faFile) /* Invoke trf program on file. */ { +// need to execute in trf directory, as tmp files go to current directory +char faBase[FILENAME_LEN], faExt[FILENAME_LEN]; +splitPath(faFile, NULL, faBase, faExt); + char command[1024]; -safef(command, sizeof(command), "cd %s; %s %s 2 7 7 80 10 50 %d -m %s", - tempDir, trfExe, faFile, maxPeriod, doBed ? "-d" : ""); -uglyf("faFile %s, command %s\n", faFile, command); +safef(command, sizeof(command), "cd %s && %s %s%s 2 7 7 80 10 50 %d -m %s", + tempDir, trfExe, faBase, faExt, maxPeriod, doBed ? "-d" : ""); +verbose(1, "command %s\n", command); +fflush(stdout); +fflush(stderr); /* Run the system command, expecting a return code of 1, as trf returns the number of successfully processed sequences. */ int status = system(command); if (status == -1) errnoAbort("error starting command: %s", command); else if (WIFSIGNALED(status)) errAbort("command terminated by signal %d: %s", WTERMSIG(status), command); else if (WIFEXITED(status)) { if (WEXITSTATUS(status) != 1) errAbort("command exited with status %d (expected 1): %s", WEXITSTATUS(status), command); } else errAbort("unexpected exit for command: %s", command); } void outputWithBreaks(FILE *out, char *s, int size, int lineSize) /* Print s of given size to file, adding line feeds every now and then. */ { int i, oneSize; for (i=0; i lineSize) oneSize = lineSize; mustWrite(out, s+i, oneSize); fputc('\n', out); } } void trfBig(char *input, char *output) /* trfBig - Mask tandem repeats on a big sequence file.. */ { int maxSize = 5000000; int overlapSize = 10000; int start, end, s, e; int halfOverlapSize = overlapSize/2; char tempFile[512], trfRootName[512], trfTemp[512], bedFileName[512]; char dir[256], seqName[128], ext[64]; FILE *bedFile = NULL; struct dnaSeq *maskedSeq = NULL; if (doBed) { - if (cgiVarExists("bedAt")) - strcpy(bedFileName, cgiString("bedAt")); + if (optionExists("bedAt")) + strcpy(bedFileName, optionVal("bedAt", NULL)); else { splitPath(output, dir, seqName, ext); sprintf(bedFileName, "%s%s.bed", dir, seqName); } bedFile = mustOpen(bedFileName, "w"); } splitPath(input, dir, seqName, ext); if (sameString("stdin", seqName)) safef(tempFile, sizeof(tempFile), "%s", rTempName(tempDir, seqName, ".tf")); else safef(tempFile, sizeof(tempFile), "%s/%s.tf", tempDir, seqName); if (endsWith(input, ".nib") && (endsWith(output, ".nib") || sameString(output, "/dev/null"))) { int nibSize; FILE *in; struct nibStream *ns = nibStreamOpen(output); struct dnaSeq *seq; nibOpenVerify(input, &in, &nibSize); for (start = 0; start < nibSize; start = end) { end = start + maxSize; if (end > nibSize) end = nibSize; seq = nibLdPart(input, in, nibSize, start, end - start); faWrite(tempFile, seq->name, seq->dna, seq->size); freeDnaSeq(&seq); trfSysCall(tempFile); makeTrfRootName(trfRootName, tempFile); sprintf(trfTemp, "%s.mask", trfRootName); maskedSeq = faReadDna(trfTemp); s = (start == 0 ? 0 : halfOverlapSize); if (end == nibSize) e = end - start; else { e = end - halfOverlapSize - start; end -= overlapSize; } nibStreamMany(ns, maskedSeq->dna + s, e-s); freeDnaSeq(&maskedSeq); if (doBed) { sprintf(trfTemp, "%s.dat", trfRootName); writeSomeDatToBed(trfTemp, bedFile, seqName, start, s, e); } } nibStreamClose(&ns); } else if (!endsWith(input, ".nib") && !endsWith(output, ".nib")) { struct lineFile *lf = lineFileOpen(input, TRUE); struct dnaSeq seq; FILE *out = mustOpen(output, "w"); ZeroVar(&seq); while (faSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name)) { fprintf(out, ">%s\n", seq.name); for (start = 0; start < seq.size; start = end) { end = start + maxSize; if (end > seq.size) end = seq.size; faWrite(tempFile, seq.name, seq.dna+start, end - start); trfSysCall(tempFile); makeTrfRootName(trfRootName, tempFile); sprintf(trfTemp, "%s.mask", trfRootName); maskedSeq = faReadDna(trfTemp); s = (start == 0 ? 0 : halfOverlapSize); if (end == seq.size) e = end - start; else { e = end - halfOverlapSize - start; end -= overlapSize; } outputWithBreaks(out, maskedSeq->dna+s, e-s, 50); freeDnaSeq(&maskedSeq); if (doBed) { sprintf(trfTemp, "%s.dat", trfRootName); writeSomeDatToBed(trfTemp, bedFile, seq.name, start, s, e); } } } lineFileClose(&lf); carefulClose(&out); } else { errAbort("Sorry, both input and output must be in same format."); } +if (!keep) + { sprintf(trfTemp, "%s*", tempFile); removeWild(trfTemp); } +} int main(int argc, char *argv[]) /* Process command line. */ { -cgiSpoof(&argc, argv); +optionInit(&argc, argv, optionSpecs); if (argc != 3) usage(); -trfExe = cgiUsualString("trf", trfExe); -doBed = cgiBoolean("bed") || cgiVarExists("bedAt"); -tempDir = cgiUsualString("tempDir", tempDir); -maxPeriod = cgiUsualInt("maxPeriod", maxPeriod); +trfExe = optionVal("trf", trfExe); +doBed = optionExists("bed") || optionExists("bedAt"); +tempDir = optionVal("tempDir", tempDir); +maxPeriod = optionInt("maxPeriod", maxPeriod); +keep = optionExists("keep"); trfBig(argv[1], argv[2]); return 0; }