7f8427c9acfb1c2094ea7bdfe3a2fbf2e0c83ba4 kent Thu Jul 25 13:18:46 2013 -0700 Seems to work. diff --git src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c new file mode 100644 index 0000000..262e69b --- /dev/null +++ src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c @@ -0,0 +1,412 @@ +/* fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting + * statistics, and also producing a smaller fastq out of a sample of the data. */ + +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" +#include "portable.h" +#include "obscure.h" +#include "hmmstats.h" + +int sampleSize = 100000; +int seed = 0; + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting statistics\n" + "and also producing a smaller fastq out of a sample of the data. The fastq input may be\n" + "compressed with gzip or bzip2. Unfortunately the fastq input can't be in a pipe\n" + "usage:\n" + " fastqStatsAndSubsample in.fastq out.stats out.fastq\n" + "options:\n" + " -sampleSize=N - default %d\n" + " -seed=N - use given seed for random number generator. Default %d\n" + , sampleSize, seed + ); +} + +/* Command line validation table. */ +static struct optionSpec options[] = { + {"sampleSize", OPTION_INT}, + {"seed", OPTION_INT}, + {NULL, 0}, +}; + +/* Estimate base count from file size based on this. */ +#define ZIPPED_BYTES_PER_BASE 0.80 +#define UNZIPPED_BYTES_PER_BASE 2.5 +#define READ_SIZE 100 + +static boolean nextLineMustMatchChar(struct lineFile *lf, char match, boolean noEof) +/* Get next line and make sure, other than whitespace, it matches 'match'. + * Return FALSE on EOF, unless noEof is set, in which case abort */ +{ +char *line; +if (!lineFileNext(lf, &line, NULL)) + { + if (noEof) + errAbort("Expecting %c got end of file in %s", match, lf->fileName); + else + return FALSE; + } +if (line[0] != match) + errAbort("Expecting %c got %s line %d of %s", match, line, lf->lineIx, lf->fileName); +return TRUE; +} + +static int averageReadSize(char *fileName, int maxReads) +/* Read up to maxReads from fastq file and return average # of reads. */ +{ +struct lineFile *lf = lineFileOpen(fileName, FALSE); +int i; +long total = 0; +int count = 0; + +for (i=0; ifileName); + + /* Get size and add it to stats */ + int seqSize = lineSize-1; + total += seqSize; + count += 1; + + /* Deal with next two lines '+' and quality lines. */ + nextLineMustMatchChar(lf, '+', TRUE); + lineFileNeedNext(lf, &line, &lineSize); + } +lineFileClose(&lf); +if (count < 1) + errAbort("No data in %s", fileName); +return (total + (count>>1))/count; +} + +int calcInitialReduction(char *fileName, int desiredReadCount) +/* Using file name and size figure out how much to reduce it to get ~2x the subsample we want. */ +{ +size_t initialSize = fileSize(fileName); +int readSize = averageReadSize(fileName, 100); +long long estimatedBases; +if (endsWith(fileName, ".gz") || endsWith(fileName, ".bz2")) + estimatedBases = initialSize/ZIPPED_BYTES_PER_BASE; +else + estimatedBases = initialSize/UNZIPPED_BYTES_PER_BASE; +long long estimatedReads = estimatedBases/readSize; +double estimatedReduction = (double)estimatedReads/desiredReadCount; +double conservativeReduction = estimatedReduction * 0.3; +if (conservativeReduction < 1) + conservativeReduction = 1; +return round(conservativeReduction); +} + + +/* A bunch of statistics gathering variables set by oneFastqRecord below. */ +#define MAX_READ_SIZE 100000 /* This is fastq, right now only get 160 base reads max. */ +int maxReadBases, minReadBases, readCount; +double sumReadBases, sumSquaredReadBases; +int aCount[MAX_READ_SIZE], cCount[MAX_READ_SIZE], gCount[MAX_READ_SIZE], tCount[MAX_READ_SIZE]; +int nCount[MAX_READ_SIZE]; +double sumQuals[MAX_READ_SIZE], sumSquaredQuals[MAX_READ_SIZE]; +int maxQual, minQual; + +double sumDoubleArray(double *array, int arraySize) +/* Return sum of all items in array */ +{ +double total = 0; +int i; +for (i=0; ifileName); + +/* Get size and add it to stats */ +int seqSize = lineSize-1; +if (seqSize > MAX_READ_SIZE) + errAbort("Sequence size %d too long line %d of %s. Max is %d", seqSize, + lf->lineIx, lf->fileName, MAX_READ_SIZE); +if (firstTime) + { + maxReadBases = minReadBases = seqSize; + } +else + { + if (maxReadBases < seqSize) + maxReadBases = seqSize; + if (minReadBases > seqSize) + minReadBases = seqSize; + } +sumReadBases += seqSize; +sumSquaredReadBases += seqSize*seqSize; +++readCount; + +/* Save up nucleotide stats and abort on bogus nucleotides. */ +int i; +for (i=0; ilineIx, lf->fileName); + break; + } + } + +if (copy) + mustWrite(f, line, lineSize); + +/* Deal with line containing just '+' that separates sequence from quality. */ +nextLineMustMatchChar(lf, '+', TRUE); +if (copy) + fprintf(f, "+\n"); + +/* Deal with quality score line. */ +if (!lineFileNext(lf, &line, &lineSize)) + errAbort("%s truncated in middle of record", lf->fileName); +int qualSize = lineSize-1; + +/* Make sure it is same size */ +if (seqSize != qualSize) + errAbort("Sequence and quality size differ line %d and %d of %s", + lf->lineIx-2, lf->lineIx, lf->fileName); + +if (firstTime) + { + minQual = maxQual = line[0]; + } + +/* Do stats */ +for (i=0; i qual) + minQual = qual; + sumQuals[i] += qual; + sumSquaredQuals[i] += qual*qual; + } + +if (copy) + mustWrite(f, line, lineSize); + + +return TRUE; +} + +boolean maybeCopyFastqRecord(struct lineFile *lf, FILE *f, boolean copy, int *retSeqSize) +/* Read next fastq record from LF, and optionally copy it to f. Return FALSE at end of file + * Do a _little_ error checking on record while we're at it. The format has already been + * validated on the client side fairly thoroughly. */ +{ +char *line; +int lineSize; + +/* Deal with initial line starting with '@' */ +if (!nextLineMustMatchChar(lf, '@', FALSE)) + return FALSE; +if (copy) + fprintf(f, "@\n"); + +/* Deal with line containing sequence. */ +if (!lineFileNext(lf, &line, &lineSize)) + errAbort("%s truncated in middle of record", lf->fileName); +if (copy) + mustWrite(f, line, lineSize); +int seqSize = lineSize-1; + +/* Deal with line containing just '+' that separates sequence from quality. */ +/* Deal with line containing just '+' that separates sequence from quality. */ +nextLineMustMatchChar(lf, '+', TRUE); +if (copy) + fprintf(f, "+\n"); + +/* Deal with quality score line. */ +if (!lineFileNext(lf, &line, &lineSize)) + errAbort("%s truncated in middle of record", lf->fileName); +if (copy) + mustWrite(f, line, lineSize); +int qualSize = lineSize-1; + +if (seqSize != qualSize) + errAbort("Sequence and quality size differ line %d and %d of %s", + lf->lineIx-2, lf->lineIx, lf->fileName); + +*retSeqSize = seqSize; +return TRUE; +} + +void reduceFastqSample(char *source, FILE *f, int oldSize, int newSize) +/* Copy newSize samples from source into open output f. */ +{ +/* Make up an array that tells us which random parts of the source file to use. */ +assert(oldSize >= newSize); +char *randomizer = needMem(oldSize); +memset(randomizer, TRUE, newSize); +shuffleArrayOfChars(randomizer, oldSize); + +struct lineFile *lf = lineFileOpen(source, FALSE); +int i; +for (i=0; i totalReads) + errAbort("SampleSize is set to %d reads, but there are only %d reads in %s", + sampleSize, totalReads, inFastq); + else + errAbort("Internal error: bad estimate %d for downStep on %s", downStep, inFastq); + } + +char *qualType = "solexa"; +int qualZero = 64; +if (minQual <= 58) + { + qualType = "sanger"; + qualZero = 33; + } +FILE *f = mustOpen(outStats, "w"); +fprintf(f, "readCount %d\n", totalReads); +fprintf(f, "baseCount %g\n", sumReadBases); +fprintf(f, "readSizeMean %g\n", sumReadBases/totalReads); +fprintf(f, "readSizeStd %g\n", calcStdFromSums(sumReadBases, sumSquaredReadBases, totalReads)); +fprintf(f, "readSizeMin %d\n", minReadBases); +fprintf(f, "readSizeMax %d\n", maxReadBases); +double qSum = sumDoubleArray(sumQuals, maxReadBases); +double qSumSquared = sumDoubleArray(sumSquaredQuals, maxReadBases); +fprintf(f, "qualMean %g\n", qSum/sumReadBases - qualZero); +fprintf(f, "qualStd %g\n", calcStdFromSums(qSum, qSumSquared, sumReadBases)); +fprintf(f, "qualMin %d\n", minQual - qualZero); +fprintf(f, "qualMax %d\n", maxQual - qualZero); +fprintf(f, "qualType %s\n", qualType); +fprintf(f, "qualZero %d\n", qualZero); +carefulClose(&f); + +f = mustOpen(outFastq, "w"); +reduceFastqSample(smallFastqName, f, readsCopied, sampleSize); +carefulClose(&f); +remove(smallFastqName); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 4) + usage(); +sampleSize = optionInt("sampleSize", sampleSize); +seed = optionInt("seed", seed); +fastqStatsAndSubsample(argv[1], argv[2], argv[3]); +return 0; +}