a86ab2ecb0564663acfb2bfdc20d4585af293f98
ceisenhart
  Wed May 14 16:48:30 2014 -0700
updated comments changed a < to <=
diff --git src/utils/fastqMottTrim/fastqMottTrim.c src/utils/fastqMottTrim/fastqMottTrim.c
index 1e2e802..570a247 100644
--- src/utils/fastqMottTrim/fastqMottTrim.c
+++ src/utils/fastqMottTrim/fastqMottTrim.c
@@ -1,196 +1,230 @@
 /* fastqMottTrim - mott. */
 /* Applies Richard Mott's trimming algorithm to the */
 /* three prime end of each sequence. */
 /* Cuttoff is the lowest desired quality score in phred scores, */
-/* set by default to 1/100 chance of mismatch. */
+/* set by default to a 50%  chance of mismatch. */
 /* Base corresponds to any additions to the ASCII quality scheme. */
 /* Minlength specifies the minimum sequence length for the output. */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "dystring.h"
 
-int clMinLength = 20;
+int clMinLength = 30;
 boolean clIsIllumina = FALSE;
-int clCutoff = 20;
+int clCutoff = 3;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "fastqMottTrim - applies Mott's trimming algorithm to a fastq file\n"
   " trims the 3 prime end based on cumulative quality \n"
   "usage:\n"
   "   fastqMottTrim input.fq output.fq\n"
+  "   for paired end use \n"
+  "   fastqMottTrim pair1.fq pair2.fq output1.fq output2.fq \n"
   "options:\n"
   " -minLength=N the minimum length allowed for a trimmed sequence default 20  \n"
   " -isIllumina TRUE for illumina, FALSE for Sanger \n"
   " -cutoff=N  the lowest desired phred score, default 30 \n"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"minLength",OPTION_INT},
    {"isIllumina",OPTION_BOOLEAN},
    {"cutoff",OPTION_INT},
    {NULL,0},
 };
 
 struct fastqSeq
 /* holds a single fastq sequence */
     {
     struct fastqSeq *next;
     int size;       /* Size of the sequence. */
     char *header;   /* Sequence header, begins with '@' */
     char *del;      /* Fastq deliminator '+' */
     char *dna;      /* DNA sequence */
-    char *quality;  /* DNA quality, in ASCII format */
+    unsigned char *quality;  /* DNA quality, in ASCII format */
     };
 
 #ifdef OLD
 struct fastqSeq  *fastqReadNext(struct lineFile *lf)
 /*reads the next fastq sequence and returns a fastq struct */
 {
 struct fastqSeq *seq=needMem(sizeof(struct fastqSeq));
 /* dyamically allocate memory */
 int size;
 lineFileNext(lf,&seq->header,&size);
 lineFileNext(lf,&seq->dna,&size);
 lineFileNext(lf,&seq->del,&size);
 lineFileNext(lf,&seq->quality,&size);
 return(seq);
 }
 #endif /* OLD */
 
 char *nextLineIntoString(struct lineFile *lf)
 /* Return next line cloned into persistent memory.  Abort if can't get next line. */
 {
 char *line;
 lineFileNeedNext(lf, &line, NULL);
 return cloneString(line);
 }
 
 struct fastqSeq  *fastqReadNext(struct lineFile *lf)
 /* Reads the next fastq sequence and returns a fastq struct.
  * Returns NULL at end of file. */
 {
 char *line;
 if (!lineFileNext(lf, &line, NULL))
     return NULL;   // Pass back end of file
 
 struct fastqSeq *seq;
 AllocVar(seq);
 seq->header = cloneString(line);
 seq->dna = nextLineIntoString(lf);
 seq->del = nextLineIntoString(lf);
-seq->quality = nextLineIntoString(lf);
+seq->quality = (unsigned char *)nextLineIntoString(lf);
 seq->size = strlen(seq->dna);
-if (seq->size != strlen(seq->quality))
+if (seq->size != strlen((char *)seq->quality))
     errAbort("dna and quality sizes don't match at record ending line %d of %s", lf->lineIx, lf->fileName);
 return(seq);
 }
 
 
 void fastqWriteNext(struct fastqSeq *input, FILE *f)
 /* a function for writing a single fastq struct to file */
 {
     fprintf(f,"%s\n",input->header);
     fprintf(f,"%s\n",input->dna);
     fprintf(f,"%s\n",input->del);
     fprintf(f,"%s\n",input->quality);
 }
 
 void freeFastqSeq(struct fastqSeq **pInput)
 /* frees the memory allocated to a fastq struct */
 {
 struct fastqSeq *input = *pInput;
 if (input != NULL)
     {
     freeMem(input->header);
     freeMem(input->dna);
     freeMem(input->del);
     freeMem(input->quality);
     freez(pInput);
     }
 }
 
 boolean  mottTrim(struct fastqSeq *input, int minLength, boolean isIllumina, int cutoff)
 /* applies mott's trimming algorithm to the fastq input*/
 /* trims from the 3 prime end based on */
 /* the sum of (quality score - cutoff value ) */
 /* returns true if the trimmed sequence is larger than the minimum sequene length */
 {
 int base = 33;
 int index = -1;
-long minValue = 1000000;
+long minValue = 100000000;
 long scoreValue = 0;
-int qualScore;
 if(isIllumina)
     base = 64;
-int i = strlen(input->dna)-1;
+int len = strlen(input->dna);
+int i = len - 1;
 /*determining the trim length */
 for(; i >= 0; --i)
     {
 /*convert the quality scores to their ascii values */
-    qualScore = input->quality[i];
-    qualScore = qualScore - base;
+    int qualScore = input->quality[i]-base;
 /*calculate the sum of the (quality score - cutoff value)'s*/
-    scoreValue = scoreValue + (qualScore - cutoff);
+    scoreValue += (qualScore - cutoff);
     if(scoreValue < minValue)
         {
         minValue = scoreValue;
-        ++index;
+        index = i;
         }
 /*modifying the fastq fields to be the trimmed length*/
 }
-if(minValue < cutoff)
+if(minValue <= cutoff)
     {
-    input->dna[strlen(input->dna)-index] = '\0';
-    input->quality[strlen(input->quality)-index] = '\0';
+    input->dna[index] = '\0';
+    input->quality[index] = '\0';
     }  
-if(strlen(input->dna) > minLength)
+if(strlen(input->dna) >= minLength)
     {
     return(TRUE);
     }
 return(FALSE);
+}
 
+void parseFastqPairs(char *input, char *output,
+            char *input2, char *output2, int minLength, boolean isIllumina, int cutoff )
+/* goes through fastq sequences in a fastq file*/
+/* parses, stores, mottTrims,  prints, then frees*/
+{
+FILE *f = mustOpen(output, "w");
+FILE *f2 = mustOpen(output2, "w");
+struct lineFile *lf = lineFileOpen(input, TRUE);
+struct lineFile *lf2 = lineFileOpen(input2, TRUE);
+struct fastqSeq *fq;
+struct fastqSeq *fq2;
+while ((fq = fastqReadNext(lf)) != NULL && (fq2 = fastqReadNext(lf2)) != NULL)
+    {
+    if( mottTrim(fq, minLength, isIllumina, cutoff) && mottTrim(fq2, minLength, isIllumina, cutoff))
+        {
+	fastqWriteNext(fq, f);
+        fastqWriteNext(fq2, f2);
+	}
+    freeFastqSeq(&fq);
+    freeFastqSeq(&fq2);
+    }
+carefulClose(&f);
+carefulClose(&f2);
+}
 
 
-}
 
 void parseFastq(char *input, char *output, int minLength, boolean isIllumina, int cutoff )
 /* goes through fastq sequences in a fastq file*/
 /* parses, stores, mottTrims,  prints, then frees*/
 {
 FILE *f = mustOpen(output, "w");
 struct lineFile *lf = lineFileOpen(input, TRUE);
 struct fastqSeq *fq;
 while ((fq = fastqReadNext(lf)) != NULL)
     {
     if( mottTrim(fq, minLength, isIllumina, cutoff))
         {
 	fastqWriteNext(fq, f);
         }
     freeFastqSeq(&fq);
     }
 carefulClose(&f);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 clMinLength = optionInt("minLength", clMinLength);
 clIsIllumina = optionExists("isIllumina");
 clCutoff = optionInt("cutoff", clCutoff);
 
-if(argc != 3)
+if(argc != 3 && argc != 5)
     {
     usage();
     }
+if(argc==3)
+    {
     parseFastq(argv[1], argv[2], clMinLength, clIsIllumina, clCutoff);
+    }
+    
+if(argc==5)
+    {
+    parseFastqPairs(argv[1], argv[3], argv[2], argv[4], clMinLength, clIsIllumina, clCutoff);
+    }
+    
 return 0;
 }