daf9260d23a25ea5e0a30b065bfe7a74d686384d
ceisenhart
  Wed Apr 30 16:45:26 2014 -0700
initial commit for mott trimming
diff --git src/utils/fastqMottTrim/fastqMottTrim.c src/utils/fastqMottTrim/fastqMottTrim.c
new file mode 100644
index 0000000..1e2e802
--- /dev/null
+++ src/utils/fastqMottTrim/fastqMottTrim.c
@@ -0,0 +1,196 @@
+/* fastqMottTrim - mott. */
+/* Applies Richard Mott's trimming algorithm to the */
+/* three prime end of each sequence. */
+/* Cuttoff is the lowest desired quality score in phred scores, */
+/* set by default to 1/100 chance of mismatch. */
+/* Base corresponds to any additions to the ASCII quality scheme. */
+/* Minlength specifies the minimum sequence length for the output. */
+
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "dystring.h"
+
+int clMinLength = 20;
+boolean clIsIllumina = FALSE;
+int clCutoff = 20;
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+  "fastqMottTrim - applies Mott's trimming algorithm to a fastq file\n"
+  " trims the 3 prime end based on cumulative quality \n"
+  "usage:\n"
+  "   fastqMottTrim input.fq output.fq\n"
+  "options:\n"
+  " -minLength=N the minimum length allowed for a trimmed sequence default 20  \n"
+  " -isIllumina TRUE for illumina, FALSE for Sanger \n"
+  " -cutoff=N  the lowest desired phred score, default 30 \n"
+  );
+}
+
+/* Command line validation table. */
+static struct optionSpec options[] = {
+   {"minLength",OPTION_INT},
+   {"isIllumina",OPTION_BOOLEAN},
+   {"cutoff",OPTION_INT},
+   {NULL,0},
+};
+
+struct fastqSeq
+/* holds a single fastq sequence */
+    {
+    struct fastqSeq *next;
+    int size;       /* Size of the sequence. */
+    char *header;   /* Sequence header, begins with '@' */
+    char *del;      /* Fastq deliminator '+' */
+    char *dna;      /* DNA sequence */
+    char *quality;  /* DNA quality, in ASCII format */
+    };
+
+#ifdef OLD
+struct fastqSeq  *fastqReadNext(struct lineFile *lf)
+/*reads the next fastq sequence and returns a fastq struct */
+{
+struct fastqSeq *seq=needMem(sizeof(struct fastqSeq));
+/* dyamically allocate memory */
+int size;
+lineFileNext(lf,&seq->header,&size);
+lineFileNext(lf,&seq->dna,&size);
+lineFileNext(lf,&seq->del,&size);
+lineFileNext(lf,&seq->quality,&size);
+return(seq);
+}
+#endif /* OLD */
+
+char *nextLineIntoString(struct lineFile *lf)
+/* Return next line cloned into persistent memory.  Abort if can't get next line. */
+{
+char *line;
+lineFileNeedNext(lf, &line, NULL);
+return cloneString(line);
+}
+
+struct fastqSeq  *fastqReadNext(struct lineFile *lf)
+/* Reads the next fastq sequence and returns a fastq struct.
+ * Returns NULL at end of file. */
+{
+char *line;
+if (!lineFileNext(lf, &line, NULL))
+    return NULL;   // Pass back end of file
+
+struct fastqSeq *seq;
+AllocVar(seq);
+seq->header = cloneString(line);
+seq->dna = nextLineIntoString(lf);
+seq->del = nextLineIntoString(lf);
+seq->quality = nextLineIntoString(lf);
+seq->size = strlen(seq->dna);
+if (seq->size != strlen(seq->quality))
+    errAbort("dna and quality sizes don't match at record ending line %d of %s", lf->lineIx, lf->fileName);
+return(seq);
+}
+
+
+void fastqWriteNext(struct fastqSeq *input, FILE *f)
+/* a function for writing a single fastq struct to file */
+{
+    fprintf(f,"%s\n",input->header);
+    fprintf(f,"%s\n",input->dna);
+    fprintf(f,"%s\n",input->del);
+    fprintf(f,"%s\n",input->quality);
+}
+
+void freeFastqSeq(struct fastqSeq **pInput)
+/* frees the memory allocated to a fastq struct */
+{
+struct fastqSeq *input = *pInput;
+if (input != NULL)
+    {
+    freeMem(input->header);
+    freeMem(input->dna);
+    freeMem(input->del);
+    freeMem(input->quality);
+    freez(pInput);
+    }
+}
+
+boolean  mottTrim(struct fastqSeq *input, int minLength, boolean isIllumina, int cutoff)
+/* applies mott's trimming algorithm to the fastq input*/
+/* trims from the 3 prime end based on */
+/* the sum of (quality score - cutoff value ) */
+/* returns true if the trimmed sequence is larger than the minimum sequene length */
+{
+int base = 33;
+int index = -1;
+long minValue = 1000000;
+long scoreValue = 0;
+int qualScore;
+if(isIllumina)
+    base = 64;
+int i = strlen(input->dna)-1;
+/*determining the trim length */
+for(; i >= 0; --i)
+    {
+/*convert the quality scores to their ascii values */
+    qualScore = input->quality[i];
+    qualScore = qualScore - base;
+/*calculate the sum of the (quality score - cutoff value)'s*/
+    scoreValue = scoreValue + (qualScore - cutoff);
+    if(scoreValue < minValue)
+        {
+        minValue=scoreValue;
+        ++index;
+    }
+/*modifying the fastq fields to be the trimmed length*/
+}
+if(minValue < cutoff)
+    {
+    input->dna[strlen(input->dna)-index] = '\0';
+    input->quality[strlen(input->quality)-index] = '\0';
+    }  
+if(strlen(input->dna) > minLength)
+    {
+    return(TRUE);
+    }
+return(FALSE);
+
+
+
+}
+
+void parseFastq(char *input, char *output, int minLength, boolean isIllumina, int cutoff )
+/* goes through fastq sequences in a fastq file*/
+/* parses, stores, mottTrims,  prints, then frees*/
+{
+FILE *f = mustOpen(output, "w");
+struct lineFile *lf = lineFileOpen(input, TRUE);
+struct fastqSeq *fq;
+while ((fq = fastqReadNext(lf)) != NULL)
+    {
+    if( mottTrim(fq, minLength, isIllumina, cutoff))
+        {
+	fastqWriteNext(fq, f);
+        }
+    freeFastqSeq(&fq);
+    }
+carefulClose(&f);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+clMinLength = optionInt("minLength", clMinLength);
+clIsIllumina = optionExists("isIllumina");
+clCutoff = optionInt("cutoff", clCutoff);
+
+if(argc != 3)
+   {
+   usage();
+   }
+parseFastq(argv[1], argv[2], clMinLength, clIsIllumina, clCutoff);
+return 0;
+}