7f8427c9acfb1c2094ea7bdfe3a2fbf2e0c83ba4 kent Thu Jul 25 13:18:46 2013 -0700 Seems to work. diff --git src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c new file mode 100644 index 0000000..262e69b --- /dev/null +++ src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c @@ -0,0 +1,412 @@ +/* fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting + * statistics, and also producing a smaller fastq out of a sample of the data. */ + +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" +#include "portable.h" +#include "obscure.h" +#include "hmmstats.h" + +int sampleSize = 100000; +int seed = 0; + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting statistics\n" + "and also producing a smaller fastq out of a sample of the data. The fastq input may be\n" + "compressed with gzip or bzip2. Unfortunately the fastq input can't be in a pipe\n" + "usage:\n" + " fastqStatsAndSubsample in.fastq out.stats out.fastq\n" + "options:\n" + " -sampleSize=N - default %d\n" + " -seed=N - use given seed for random number generator. Default %d\n" + , sampleSize, seed + ); +} + +/* Command line validation table. */ +static struct optionSpec options[] = { + {"sampleSize", OPTION_INT}, + {"seed", OPTION_INT}, + {NULL, 0}, +}; + +/* Estimate base count from file size based on this. */ +#define ZIPPED_BYTES_PER_BASE 0.80 +#define UNZIPPED_BYTES_PER_BASE 2.5 +#define READ_SIZE 100 + +static boolean nextLineMustMatchChar(struct lineFile *lf, char match, boolean noEof) +/* Get next line and make sure, other than whitespace, it matches 'match'. + * Return FALSE on EOF, unless noEof is set, in which case abort */ +{ +char *line; +if (!lineFileNext(lf, &line, NULL)) + { + if (noEof) + errAbort("Expecting %c got end of file in %s", match, lf->fileName); + else + return FALSE; + } +if (line[0] != match) + errAbort("Expecting %c got %s line %d of %s", match, line, lf->lineIx, lf->fileName); +return TRUE; +} + +static int averageReadSize(char *fileName, int maxReads) +/* Read up to maxReads from fastq file and return average # of reads. */ +{ +struct lineFile *lf = lineFileOpen(fileName, FALSE); +int i; +long total = 0; +int count = 0; + +for (i=0; i<maxReads; ++i) + { + /* Deal with initial line starting with '@' */ + if (!nextLineMustMatchChar(lf, '@', FALSE)) + break; + + /* Deal with line containing sequence. */ + char *line; + int lineSize = 0; + if (!lineFileNext(lf, &line, &lineSize)) + errAbort("%s truncated in middle of record", lf->fileName); + + /* Get size and add it to stats */ + int seqSize = lineSize-1; + total += seqSize; + count += 1; + + /* Deal with next two lines '+' and quality lines. */ + nextLineMustMatchChar(lf, '+', TRUE); + lineFileNeedNext(lf, &line, &lineSize); + } +lineFileClose(&lf); +if (count < 1) + errAbort("No data in %s", fileName); +return (total + (count>>1))/count; +} + +int calcInitialReduction(char *fileName, int desiredReadCount) +/* Using file name and size figure out how much to reduce it to get ~2x the subsample we want. */ +{ +size_t initialSize = fileSize(fileName); +int readSize = averageReadSize(fileName, 100); +long long estimatedBases; +if (endsWith(fileName, ".gz") || endsWith(fileName, ".bz2")) + estimatedBases = initialSize/ZIPPED_BYTES_PER_BASE; +else + estimatedBases = initialSize/UNZIPPED_BYTES_PER_BASE; +long long estimatedReads = estimatedBases/readSize; +double estimatedReduction = (double)estimatedReads/desiredReadCount; +double conservativeReduction = estimatedReduction * 0.3; +if (conservativeReduction < 1) + conservativeReduction = 1; +return round(conservativeReduction); +} + + +/* A bunch of statistics gathering variables set by oneFastqRecord below. */ +#define MAX_READ_SIZE 100000 /* This is fastq, right now only get 160 base reads max. */ +int maxReadBases, minReadBases, readCount; +double sumReadBases, sumSquaredReadBases; +int aCount[MAX_READ_SIZE], cCount[MAX_READ_SIZE], gCount[MAX_READ_SIZE], tCount[MAX_READ_SIZE]; +int nCount[MAX_READ_SIZE]; +double sumQuals[MAX_READ_SIZE], sumSquaredQuals[MAX_READ_SIZE]; +int maxQual, minQual; + +double sumDoubleArray(double *array, int arraySize) +/* Return sum of all items in array */ +{ +double total = 0; +int i; +for (i=0; i<arraySize; ++i) + total += array[i]; +return total; +} + +long long sumIntArray(int *array, int arraySize) +/* Return sum of all items in array */ +{ +long long total = 0; +int i; +for (i=0; i<arraySize; ++i) + total += array[i]; +return total; +} + +boolean oneFastqRecord(struct lineFile *lf, FILE *f, boolean copy, boolean firstTime) +/* Read next fastq record from LF, and optionally copy it to f. Return FALSE at end of file + * Do a _little_ error checking on record while we're at it. The format has already been + * validated on the client side fairly thoroughly. */ +{ +char *line; +int lineSize; + +/* Deal with initial line starting with '@' */ +if (!nextLineMustMatchChar(lf, '@', FALSE)) + return FALSE; +if (copy) + fprintf(f, "@\n"); + +/* Deal with line containing sequence. */ +if (!lineFileNext(lf, &line, &lineSize)) + errAbort("%s truncated in middle of record", lf->fileName); + +/* Get size and add it to stats */ +int seqSize = lineSize-1; +if (seqSize > MAX_READ_SIZE) + errAbort("Sequence size %d too long line %d of %s. Max is %d", seqSize, + lf->lineIx, lf->fileName, MAX_READ_SIZE); +if (firstTime) + { + maxReadBases = minReadBases = seqSize; + } +else + { + if (maxReadBases < seqSize) + maxReadBases = seqSize; + if (minReadBases > seqSize) + minReadBases = seqSize; + } +sumReadBases += seqSize; +sumSquaredReadBases += seqSize*seqSize; +++readCount; + +/* Save up nucleotide stats and abort on bogus nucleotides. */ +int i; +for (i=0; i<seqSize; ++i) + { + char c = tolower(line[i]); + switch (c) + { + case 'a': + aCount[i] += 1; + break; + case 'c': + cCount[i] += 1; + break; + case 'g': + gCount[i] += 1; + break; + case 't': + tCount[i] += 1; + break; + case 'n': + nCount[i] += 1; + break; + default: + errAbort("Unrecognized nucleotide character %c line %d of %s", c, + lf->lineIx, lf->fileName); + break; + } + } + +if (copy) + mustWrite(f, line, lineSize); + +/* Deal with line containing just '+' that separates sequence from quality. */ +nextLineMustMatchChar(lf, '+', TRUE); +if (copy) + fprintf(f, "+\n"); + +/* Deal with quality score line. */ +if (!lineFileNext(lf, &line, &lineSize)) + errAbort("%s truncated in middle of record", lf->fileName); +int qualSize = lineSize-1; + +/* Make sure it is same size */ +if (seqSize != qualSize) + errAbort("Sequence and quality size differ line %d and %d of %s", + lf->lineIx-2, lf->lineIx, lf->fileName); + +if (firstTime) + { + minQual = maxQual = line[0]; + } + +/* Do stats */ +for (i=0; i<seqSize; ++i) + { + int qual = line[i]; + if (maxQual < qual) + maxQual = qual; + if (minQual > qual) + minQual = qual; + sumQuals[i] += qual; + sumSquaredQuals[i] += qual*qual; + } + +if (copy) + mustWrite(f, line, lineSize); + + +return TRUE; +} + +boolean maybeCopyFastqRecord(struct lineFile *lf, FILE *f, boolean copy, int *retSeqSize) +/* Read next fastq record from LF, and optionally copy it to f. Return FALSE at end of file + * Do a _little_ error checking on record while we're at it. The format has already been + * validated on the client side fairly thoroughly. */ +{ +char *line; +int lineSize; + +/* Deal with initial line starting with '@' */ +if (!nextLineMustMatchChar(lf, '@', FALSE)) + return FALSE; +if (copy) + fprintf(f, "@\n"); + +/* Deal with line containing sequence. */ +if (!lineFileNext(lf, &line, &lineSize)) + errAbort("%s truncated in middle of record", lf->fileName); +if (copy) + mustWrite(f, line, lineSize); +int seqSize = lineSize-1; + +/* Deal with line containing just '+' that separates sequence from quality. */ +/* Deal with line containing just '+' that separates sequence from quality. */ +nextLineMustMatchChar(lf, '+', TRUE); +if (copy) + fprintf(f, "+\n"); + +/* Deal with quality score line. */ +if (!lineFileNext(lf, &line, &lineSize)) + errAbort("%s truncated in middle of record", lf->fileName); +if (copy) + mustWrite(f, line, lineSize); +int qualSize = lineSize-1; + +if (seqSize != qualSize) + errAbort("Sequence and quality size differ line %d and %d of %s", + lf->lineIx-2, lf->lineIx, lf->fileName); + +*retSeqSize = seqSize; +return TRUE; +} + +void reduceFastqSample(char *source, FILE *f, int oldSize, int newSize) +/* Copy newSize samples from source into open output f. */ +{ +/* Make up an array that tells us which random parts of the source file to use. */ +assert(oldSize >= newSize); +char *randomizer = needMem(oldSize); +memset(randomizer, TRUE, newSize); +shuffleArrayOfChars(randomizer, oldSize); + +struct lineFile *lf = lineFileOpen(source, FALSE); +int i; +for (i=0; i<oldSize; ++i) + { + int seqSize; + boolean doIt = randomizer[i]; + if (!maybeCopyFastqRecord(lf, f, doIt, &seqSize)) + internalErr(); + } +freez(&randomizer); +lineFileClose(&lf); +} + +void fastqStatsAndSubsample(char *inFastq, char *outStats, char *outFastq) +/* fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting + * statistics, and also producing a smaller fastq out of a sample of the data. */ +{ +/* Split up outFastq path, so we can make a temp file in the same dir. */ +char outDir[PATH_LEN]; +splitPath(outFastq, outDir, NULL, NULL); + +/* Open up temp output file. This one will be for the initial scaling. We'll do + * a second round of scaling as well. */ +char smallFastqName[PATH_LEN]; +safef(smallFastqName, PATH_LEN, "%sfastqSubsampleXXXXXX", outDir); +int smallFd = mkstemp(smallFastqName); +FILE *smallF = fdopen(smallFd, "w"); + +/* Scan through input, collecting stats, validating, and creating a subset file. */ +int downStep = calcInitialReduction(inFastq, sampleSize); +struct lineFile *lf = lineFileOpen(inFastq, FALSE); +boolean done = FALSE; +int readsCopied = 0, totalReads = 0; +boolean firstTime = TRUE; +while (!done) + { + int hotPosInCycle = rand()%downStep; + int cycle; + for (cycle=0; cycle<downStep; ++cycle) + { + boolean hotPos = (cycle == hotPosInCycle); + if (!oneFastqRecord(lf, smallF, hotPos, firstTime)) + { + done = TRUE; + break; + } + if (hotPos) + ++readsCopied; + firstTime = FALSE; + ++totalReads; + } + } +lineFileClose(&lf); +carefulClose(&smallF); + +if (readsCopied < sampleSize) + { + remove(smallFastqName); + /* Our sample isn't big enough. This could have two causes - a bug in + * our estimation, maybe caused by read sizes growing a bunch in the time + * since this code was written - or a file that is actually smaller smaller + * than sampleSize. */ + if (sampleSize > totalReads) + errAbort("SampleSize is set to %d reads, but there are only %d reads in %s", + sampleSize, totalReads, inFastq); + else + errAbort("Internal error: bad estimate %d for downStep on %s", downStep, inFastq); + } + +char *qualType = "solexa"; +int qualZero = 64; +if (minQual <= 58) + { + qualType = "sanger"; + qualZero = 33; + } +FILE *f = mustOpen(outStats, "w"); +fprintf(f, "readCount %d\n", totalReads); +fprintf(f, "baseCount %g\n", sumReadBases); +fprintf(f, "readSizeMean %g\n", sumReadBases/totalReads); +fprintf(f, "readSizeStd %g\n", calcStdFromSums(sumReadBases, sumSquaredReadBases, totalReads)); +fprintf(f, "readSizeMin %d\n", minReadBases); +fprintf(f, "readSizeMax %d\n", maxReadBases); +double qSum = sumDoubleArray(sumQuals, maxReadBases); +double qSumSquared = sumDoubleArray(sumSquaredQuals, maxReadBases); +fprintf(f, "qualMean %g\n", qSum/sumReadBases - qualZero); +fprintf(f, "qualStd %g\n", calcStdFromSums(qSum, qSumSquared, sumReadBases)); +fprintf(f, "qualMin %d\n", minQual - qualZero); +fprintf(f, "qualMax %d\n", maxQual - qualZero); +fprintf(f, "qualType %s\n", qualType); +fprintf(f, "qualZero %d\n", qualZero); +carefulClose(&f); + +f = mustOpen(outFastq, "w"); +reduceFastqSample(smallFastqName, f, readsCopied, sampleSize); +carefulClose(&f); +remove(smallFastqName); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 4) + usage(); +sampleSize = optionInt("sampleSize", sampleSize); +seed = optionInt("seed", seed); +fastqStatsAndSubsample(argv[1], argv[2], argv[3]); +return 0; +}