3dd6cd1eed6c6f52a3506dd53d1f4a81a6e4532b kent Tue Aug 6 12:09:34 2013 -0700 Added basesInSample output. diff --git src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c index 1f3e7a7..4b23fdd 100644 --- src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c +++ src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c @@ -12,48 +12,47 @@ int sampleSize = 100000; int seed = 0; boolean smallOk = FALSE; void usage() /* Explain usage and exit. */ { errAbort( "fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting statistics\n" "and also producing a smaller fastq out of a sample of the data. The fastq input may be\n" "compressed with gzip or bzip2. Unfortunately the fastq input can't be in a pipe\n" "usage:\n" " fastqStatsAndSubsample in.fastq out.stats out.fastq\n" "options:\n" " -sampleSize=N - default %d\n" - " -seed=N - use given seed for random number generator. Default %d.\n" + " -seed=N - Use given seed for random number generator. Default %d.\n" " -smallOk - Not an error if less than sampleSize reads. out.fastq will be entire in.fastq\n" , sampleSize, seed ); } /* Command line validation table. */ static struct optionSpec options[] = { {"sampleSize", OPTION_INT}, {"seed", OPTION_INT}, {"smallOk", OPTION_BOOLEAN}, {NULL, 0}, }; /* Estimate base count from file size based on this. */ #define ZIPPED_BYTES_PER_BASE 0.80 #define UNZIPPED_BYTES_PER_BASE 2.5 -#define READ_SIZE 100 static boolean nextLineMustMatchChar(struct lineFile *lf, char match, boolean noEof) /* Get next line and make sure, other than whitespace, it matches 'match'. * Return FALSE on EOF, unless noEof is set, in which case abort */ { char *line; if (!lineFileNext(lf, &line, NULL)) { if (noEof) errAbort("Expecting %c got end of file in %s", match, lf->fileName); else return FALSE; } if (line[0] != match) errAbort("Expecting %c got %s line %d of %s", match, line, lf->lineIx, lf->fileName); @@ -105,56 +104,78 @@ estimatedBases = initialSize/ZIPPED_BYTES_PER_BASE; else estimatedBases = initialSize/UNZIPPED_BYTES_PER_BASE; long long estimatedReads = estimatedBases/readSize; double estimatedReduction = (double)estimatedReads/desiredReadCount; double conservativeReduction = estimatedReduction * 0.3; if (conservativeReduction < 1) conservativeReduction = 1; return round(conservativeReduction); } /* A bunch of statistics gathering variables set by oneFastqRecord below. */ #define MAX_READ_SIZE 100000 /* This is fastq, right now only get 160 base reads max. */ int maxReadBases, minReadBases, readCount; -double sumReadBases, sumSquaredReadBases; +long long sumReadBases; +double sumSquaredReadBases; int aCount[MAX_READ_SIZE], cCount[MAX_READ_SIZE], gCount[MAX_READ_SIZE], tCount[MAX_READ_SIZE]; int nCount[MAX_READ_SIZE]; double sumQuals[MAX_READ_SIZE], sumSquaredQuals[MAX_READ_SIZE]; int maxQual, minQual; double sumDoubleArray(double *array, int arraySize) /* Return sum of all items in array */ { double total = 0; int i; for (i=0; ilineIx, lf->fileName); if (copy) @@ -202,30 +223,31 @@ break; case 't': tCount[i] += 1; break; case 'n': case '.': nCount[i] += 1; break; default: errAbort("Unrecognized nucleotide character %c line %d of %s", c, lf->lineIx, lf->fileName); break; } } + if (copy) mustWrite(f, line, lineSize); /* Deal with line containing just '+' that separates sequence from quality. */ nextLineMustMatchChar(lf, '+', TRUE); if (copy) fprintf(f, "+\n"); /* Deal with quality score line. */ if (!lineFileNext(lf, &line, &lineSize)) errAbort("%s truncated in middle of record", lf->fileName); int qualSize = lineSize-1; /* Make sure it is same size */ if (seqSize != qualSize) @@ -247,31 +269,32 @@ minQual = qual; sumQuals[i] += qual; sumSquaredQuals[i] += qual*qual; } if (copy) mustWrite(f, line, lineSize); return TRUE; } boolean maybeCopyFastqRecord(struct lineFile *lf, FILE *f, boolean copy, int *retSeqSize) /* Read next fastq record from LF, and optionally copy it to f. Return FALSE at end of file * Do a _little_ error checking on record while we're at it. The format has already been - * validated on the client side fairly thoroughly. */ + * validated on the client side fairly thoroughly. Similar to oneFastq record but with + * fewer side effects. */ { char *line; int lineSize; /* Deal with initial line starting with '@' */ if (!lineFileNext(lf, &line, &lineSize)) return FALSE; if (line[0] != '@') errAbort("Expecting line starting with '@' got %s line %d of %s", line, lf->lineIx, lf->fileName); if (copy) mustWrite(f, line, lineSize); /* Deal with line containing sequence. */ @@ -290,72 +313,78 @@ /* Deal with quality score line. */ if (!lineFileNext(lf, &line, &lineSize)) errAbort("%s truncated in middle of record", lf->fileName); if (copy) mustWrite(f, line, lineSize); int qualSize = lineSize-1; if (seqSize != qualSize) errAbort("Sequence and quality size differ line %d and %d of %s", lf->lineIx-2, lf->lineIx, lf->fileName); *retSeqSize = seqSize; return TRUE; } -void reduceFastqSample(char *source, FILE *f, int oldSize, int newSize) +long long reduceFastqSample(char *source, FILE *f, int oldSize, int newSize) /* Copy newSize samples from source into open output f. */ { +long long basesInSample = 0; + /* Make up an array that tells us which random parts of the source file to use. */ assert(oldSize >= newSize); char *randomizer = needMem(oldSize); memset(randomizer, TRUE, newSize); shuffleArrayOfChars(randomizer, oldSize); struct lineFile *lf = lineFileOpen(source, FALSE); int i; for (i=0; i