e70152e44cc66cc599ff6b699eb8adc07f3e656a
kent
  Sat May 24 21:09:34 2014 -0700
Adding Copyright NNNN Regents of the University of California to all files I believe with reasonable certainty were developed under UCSC employ or as part of Genome Browser copyright assignment.
diff --git src/utils/fastqToFa/fastqToFa.c src/utils/fastqToFa/fastqToFa.c
index 350e12c..bd3bf95 100644
--- src/utils/fastqToFa/fastqToFa.c
+++ src/utils/fastqToFa/fastqToFa.c
@@ -1,249 +1,252 @@
 /* fastqToFa - Convert from fastq to fasta format.. */
+
+/* Copyright (C) 2011 The Regents of the University of California 
+ * See README in this or parent directory for licensing information. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "fastqToFa - Convert from fastq to fasta format.\n"
   "usage:\n"
   "   fastqToFa [options] in.fastq out.fa\n"
   "options:\n"
   "   -nameVerify='string' - for multi-line fastq files, 'string' must\n"
   "	match somewhere in the sequence names in order to correctly\n"
   "	identify the next sequence block (e.g.: -nameVerify='Supercontig_')\n"
   "   -qual=file.qual.fa - output quality scores to specifed file\n\t(default: quality scores are ignored)\n"
   "   -qualSizes=qual.sizes - write sizes file for the quality scores\n"
   "   -noErrors - warn only on problems, do not error out\n"
   "              (specify -verbose=3 to see warnings\n"
   "   -solexa - use Solexa/Illumina quality score algorithm\n\t(instead of Phread quality)\n"
   "   -verbose=2 - set warning level to get some stats output during processing"
   );
 }
 
 static char *qualFile = NULL;
 static char *qualSizes = NULL;
 static char *nameVerify = NULL;
 static boolean solexa = FALSE;
 static boolean noErrors = FALSE;
 
 static struct optionSpec options[] = {
    {"qual", OPTION_STRING},
    {"qualSizes", OPTION_STRING},
    {"nameVerify", OPTION_STRING},
    {"solexa", OPTION_BOOLEAN},
    {"noErrors", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 void fastqToFa(char *inFastq, char *outFa)
 /* fastqToFa - Convert from fastq to fasta format.. */
 {
 struct lineFile *lf = lineFileOpen(inFastq, TRUE);
 FILE *f = mustOpen(outFa, "w");
 char *line;
 boolean inSequence = FALSE;
 long long seqCount = 0;
 long long totalSize = 0;
 boolean readingOk = lineFileNextReal(lf, &line);
 char seqNameNote[128];
 FILE *qFH = NULL;
 FILE *qSZ = NULL;
 
 if (qualFile)
     qFH = mustOpen(qualFile, "w");
 if (qualSizes)
     qSZ = mustOpen(qualSizes, "w");
 
 // the first line has already been read upon entry here, and thus
 // each time coming back to the top of this while() loop, the sequence
 // name line has been read and is ready to process.
 
 while (readingOk)
     {
     line = skipLeadingSpaces(line);
     if (line[0] != '@')
         errAbort("ERROR: %s doesn't seem to be fastq format.  "
 		"Expecting '@' start of line %d, got %c.",
 		lf->fileName, lf->lineIx, line[0]);
     if (nameVerify && (! stringIn(nameVerify, line+1)))
 	errAbort("ERROR: expecting to find '%s' in sequence identifier\n\t"
 	    "\tfound '%s' at line %d of %s\n",
 		nameVerify, line+1, lf->lineIx, lf->fileName);
     ++seqCount;
     fprintf(f, ">%s\n", line+1);	// starting a sequence
     safef(seqNameNote, ArraySize(seqNameNote), "%s", line+1); // save name
     inSequence = TRUE;
     long long dnaSize = 0;	// counting the size of this one sequence
     while (inSequence)
 	{
 	readingOk = lineFileNextReal(lf, &line);
 	if (!readingOk)
 	   lineFileUnexpectedEnd(lf); 
 	switch (line[0])	// check first character of line
 	    {
 	    case '+':		// seeing + enters quality scores
 		inSequence = FALSE;
 		break;
 	    case '@':		// seeing next sequence already is an error
 		errAbort("ERROR: expecting '+', got '@' at line %d of %s",
 		    lf->lineIx, lf->fileName);
 		break;
 	    default:		// anything else is assumed to be sequence
 		dnaSize += strlen(line);
 		fprintf(f, "%s\n", line);
 		break;
 	    }
 	}
     if (dnaSize < 1)
 	errAbort("ERROR: expecting DNA sequence, got nothing at line %d of %s",
 		lf->lineIx, lf->fileName);
     if (inSequence)	// I don't think this can actually happen here
 	errAbort("ERROR: expecting '+', found end of file at line %d of %s",
 	    lf->lineIx, lf->fileName);
     totalSize += dnaSize;
     long long qualitySize = 0;
     int retSize;
     readingOk = lineFileNext(lf, &line, &retSize);
     if (qFH)
 	fprintf(qFH, ">%s\n", seqNameNote);
     boolean onNewline = TRUE;
     while (readingOk)
 	{
 	if ('@' == line[0])
 	    {
 	    if (!nameVerify)
 		break;		// found next sequence
 	    if (stringIn(nameVerify, line+1))
 		break;		// found next sequence
 	    }
 	// anything else is considered to be quality score
 	int lineLength = strlen(line);
 	if (qFH)
 	    {
 	    int i;
 	    int qualValue = ((int)(line[0])) - 33;
 	    if (onNewline)
 		fprintf(qFH, "%d", qualValue);	// first score only
 	    else
 		fprintf(qFH, " %d", qualValue);	// all subsequent scores
 	    onNewline = FALSE;
 	    qualitySize += 1;
 	    for (i = 1; i < lineLength; ++i)
 		{
 		qualitySize += 1;
 		int qualValue = ((int)(line[i])) - 33;
 		if (solexa)
 		    {
 		    qualValue =
 			10*log(1+pow(10,(((int)(line[i]))-64)/10.0))/log(10);
 		    }
 if (qualValue > 127)
     errAbort("ERROR: illegal quality score at character %d on "
 	"line %d of %s\n\tfor sequence '%s'",
 		i, lf->lineIx, lf->fileName, seqNameNote);
 #define COUNTS_PER_LINE	26
 		if (0 == qualitySize % COUNTS_PER_LINE)
 		    fprintf(qFH, "\n%d", qualValue);
 		else
 		    fprintf(qFH, " %d", qualValue);  // next ones have space
 		}
 	    }
 	    else
 	    {
 	    qualitySize += lineLength;
 	    }
 	readingOk = lineFileNext(lf, &line, &retSize);
 	}
     if (qFH)	// the above loop *always* finishes needing a new line
 	{
 	fprintf(qFH, "\n");
 	onNewline = TRUE;
     }
     if (qualitySize < 1)
 	{
 	if (noErrors)
 	    verbose(3,"ERROR: expecting quality scores, got nothing at "
 		"line %d of %s\n\tfor sequence '%s'",
 		lf->lineIx, lf->fileName, seqNameNote);
 	else
 	    errAbort("ERROR: expecting quality scores, got nothing at "
 		"line %d of %s\n\tfor sequence '%s'",
 		lf->lineIx, lf->fileName, seqNameNote);
 	}
     if (dnaSize != qualitySize)
 	{
 	if (noErrors)
 	    verbose(3,"ERROR: number of quality scores %lld does not match "
 		"sequence length %lld\n\tat line %d of %s\n"
 		"\tfor sequence '%s'",
 		qualitySize, dnaSize, lf->lineIx, lf->fileName, seqNameNote);
 	else
 	    errAbort("ERROR: number of quality scores %lld does not match "
 		"sequence length %lld\n\tat line %d of %s"
 		"\tfor sequence '%s'\n",
 		qualitySize, dnaSize, lf->lineIx, lf->fileName, seqNameNote);
 	}
     if (qSZ)
 	{
 	fprintf(qSZ, "%s\t%lld\n", seqNameNote, qualitySize);
 	if (0 == seqCount % 1000)
 	    fflush(qSZ);
 	}
     }
 carefulClose(&f);
 if (qFH)
     carefulClose(&qFH);
 if (qSZ)
     carefulClose(&qSZ);
 
 verbose(2, "#\tprocessed %lld sequences with total length of %lld bases\n",
 	seqCount, totalSize);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 qualFile = optionVal("qual", NULL);
 qualSizes = optionVal("qualSizes", NULL);
 nameVerify = optionVal("nameVerify", NULL);
 solexa = optionExists("solexa");
 noErrors = optionExists("noErrors");
 
 if (nameVerify)
     verbose(2,"#\tverify sequence names contain the string '%s'\n", nameVerify);
 else
     verbose(2,"#\tno name checks will be made on lines beginning with @\n");
 
 if (qualSizes)
     verbose(2,"#\toutput sequence name and quality scores to '%s'\n",qualSizes);
 
 if (qualFile)
     verbose(2,"#\toutput quality scores to '%s'\n", qualFile);
 else
     {
     verbose(2,"#\tignore quality scores\n");
     if (solexa)
 	errAbort("ERROR: specified solexa quality score algorithm,\n"
 		"\tbut no quality file output");
     }
 if (solexa)
     verbose(2,"#\tusing solexa quality score algorithm\n");
 else
     verbose(2,"#\tusing default Phread quality score algorithm\n");
 if (noErrors)
     verbose(2,"#\tissue warnings on errors and continue\n#\t(specify -verbose=3 to see warnings)\n");
 else
     verbose(2,"#\tall errors will cause exit\n");
 
 if (argc != 3)
     usage();
 
 fastqToFa(argv[1], argv[2]);
 return 0;
 }