175c45a32bf909202f672f1a9c972ffecc70d97a max Tue Apr 2 11:03:32 2013 -0700 option name was documented but actually didnt exist, fixed it diff --git src/utils/faCat/faCat.c src/utils/faCat/faCat.c index 52f8e6e..8c3264e 100644 --- src/utils/faCat/faCat.c +++ src/utils/faCat/faCat.c @@ -1,209 +1,210 @@ /* faCat - concatenate fa records and add gaps between */ #include "common.h" #include "linefile.h" #include "options.h" #include "fa.h" #include "portable.h" struct liftSpec /* How to lift coordinates. */ { struct liftSpec *next; /* Next in list. */ long int offset; /* Offset to add. */ char *oldName; /* Name in source file. */ int oldSize; /* Size of old sequence. */ char *newName; /* Name in dest file. */ int newSize; /* Size of new sequence. */ char strand; /* Strand of contig relative to chromosome. */ }; void usage() /* Explain usage and exit. */ { errAbort("faCat - concatenate fa records and add gaps between each sequence.\n" "usage:\n" " faCat [options] in.fa outRoot liftFileRoot\n" "\n" "Options:\n" " -gapSize=N - size of gap to insert between each sequence. (default 25).\n" " -maxOutputSize=N - Split file into multiple file if output exceeds N (default 2000000).\n" " -name=chrUn - name of output sequence\n" "\n" "Note: .fa and .lft will be appended to the outRoot and liftFile names resp.\n" ); } static struct optionSpec options[] = { {"gapSize", OPTION_INT}, {"maxOutputSize", OPTION_INT}, + {"name", OPTION_STRING}, {NULL, 0}, }; /* command line options */ char *name = "chrUn"; char *namePat = NULL; boolean vOption = FALSE; int minSize = -1; int maxSize = -1; int gapSize = 0; long maxOutputSize = 2000000; /* max size of output file */ boolean matchName(char *seqHeader) /* see if the sequence name matches */ { /* find end of name */ char *nameSep = skipToSpaces(seqHeader); char sepChr = '\0'; boolean isMatch = FALSE; if (nameSep != NULL) { sepChr = *nameSep; /* terminate name */ *nameSep = '\0'; } isMatch = wildMatch(namePat, seqHeader); if (nameSep != NULL) *nameSep = sepChr; return isMatch; } boolean recMatches(DNA *seq, int seqSize, char *seqHeader) /* check if a fasta record matches the sequence constraints */ { if ((minSize >= 0) && (seqSize < minSize)) return FALSE; if ((maxSize >= 0) && (seqSize > maxSize)) return FALSE; if ((namePat != NULL) && !matchName(seqHeader)) return FALSE; return TRUE; } struct liftSpec *readLifts(char *fileName) /* Read in lift file. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); int wordCount; char *words[16]; struct liftSpec *list = NULL, *el; while ((wordCount = lineFileChop(lf, words)) != 0) { char *offs; if (wordCount < 5) errAbort("Need at least 5 words line %d of %s", lf->lineIx, lf->fileName); offs = words[0]; if (!isdigit(offs[0]) && !(offs[0] == '-' && isdigit(offs[1]))) errAbort("Expecting number in first field line %d of %s", lf->lineIx, lf->fileName); if (!isdigit(words[4][0])) errAbort("Expecting number in fifth field line %d of %s", lf->lineIx, lf->fileName); AllocVar(el); el->offset = atol(offs); el->oldName = cloneString(words[1]); el->oldSize = atoi(words[2]); el->newName = cloneString(words[3]); el->newSize = atoi(words[4]); if (wordCount >= 6) { char c = words[5][0]; if (c == '+' || c == '-') el->strand = c; else errAbort("Expecting + or - field 6, line %d of %s", lf->lineIx, lf->fileName); } else el->strand = '+'; slAddHead(&list, el); } slReverse(&list); lineFileClose(&lf); if (list == NULL) errAbort("Empty liftSpec file %s", fileName); return list; } void fixNewLength(char *inFile, char *liftFile, long int size) { FILE *liftFh = mustOpen(liftFile, "w"); struct liftSpec *el, *lifts = readLifts(inFile); for (el = lifts; el != NULL; el = el->next) { fprintf(liftFh, "%ld\t%s\t%ld\t%s\t%d\n",el->offset, el->oldName, size, el->newName, el->newSize); } carefulClose(&liftFh); } void faCat(char *inFile, char *outFile, char *liftFile) /* faCat - Filter out fa records that don't match expression. */ { char *tempFile = rTempName("/tmp", "lift", ".lft"); struct lineFile *inLf = lineFileOpen(inFile, TRUE); FILE *outFh = NULL; FILE *tempFh = mustOpen(tempFile, "w"); DNA *seq; int seqSize; char *seqHeader; long int offset = 0; char *gap = NULL; int i, fileIndex = 1; char nameNew[512]; char outFileName[512]; char liftFileName[512]; char fastaHeader[512]; safef(nameNew,sizeof(nameNew), "%s.%d",name, fileIndex); safef(fastaHeader,sizeof(fastaHeader),">%s\n",nameNew); safef(outFileName, sizeof(outFileName), "%s.%d.fa",outFile, fileIndex); safef(liftFileName, sizeof(liftFileName), "%s.%d.lft",liftFile, fileIndex++); outFh = mustOpen(outFileName, "w"); gap = needMem(gapSize+1); for (i = 0 ; i < gapSize ; i++) { gap[i] = 'N'; } gap[i] = '\0'; mustWrite(outFh, fastaHeader, strlen(fastaHeader)); while (faMixedSpeedReadNext(inLf, &seq, &seqSize, &seqHeader)) { // if (vOption ^ recMatches(seq, seqSize, seqHeader)) // faWriteNext(outFh, seqHeader, seq, seqSize); /* output lift record: offset oldName oldSize newName newSize */ fprintf(tempFh, "%ld\t%s\t%d\t%s\t%d\n",offset, nameNew, 0, seqHeader, seqSize); offset += (seqSize + gapSize); writeSeqWithBreaks(outFh, seq, seqSize, 50); writeSeqWithBreaks(outFh, gap, gapSize, 50); if (offset > maxOutputSize) { carefulClose(&tempFh); carefulClose(&outFh); fixNewLength(tempFile, liftFileName, offset); tempFh = mustOpen(tempFile, "w"); safef(nameNew,sizeof(nameNew), "%s.%d",name, fileIndex); safef(fastaHeader,sizeof(fastaHeader),">%s\n",nameNew); safef(liftFileName, sizeof(liftFileName), "%s.%d.lft",liftFile, fileIndex); safef(outFileName, sizeof(outFileName), "%s.%d.fa",outFile, fileIndex++); outFh = mustOpen(outFileName, "w"); mustWrite(outFh, fastaHeader, strlen(fastaHeader)); offset = 0; } } carefulClose(&tempFh); fixNewLength(tempFile, liftFileName, offset); lineFileClose(&inLf); carefulClose(&outFh); unlink(tempFile); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 4) usage(); name = optionVal("name", "chrUn"); //vOption = optionExists("v"); //minSize = optionInt("minSize", -1); maxOutputSize = optionInt("maxOutputSize", maxOutputSize); gapSize = optionInt("gapSize", 25); faCat(argv[1],argv[2], argv[3]); return 0; }