e8934cf354e5696886de36315e7fc6dd602949dd kent Thu Dec 5 10:51:17 2013 -0800 Simplifying initial step down to make it give consistent result across read pairs. diff --git src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c index afcda73..0c25689 100644 --- src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c +++ src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c @@ -1,130 +1,88 @@ /* fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting * statistics, and also producing a smaller fastq out of a sample of the data. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "portable.h" #include "obscure.h" #include "hmmstats.h" +/* A note on randomness: This program is used on paired end data. This data is represented + * as two separate fastq files where the forward reads are in one file and the reverse in + * the other. The files are in the same order, which is how we know which forward read goes + * with which reverse read. As a result it is very important that this program sample + * the same records from files that have the same number of records. + * + * This is implemented in two passes - the first pass calculates the statistics and + * produces a file with 1/10 the number of reads in it. The second pass produces the + * final output by downsampling the 1/10 size file if it is big enough, or the original + * file if not. + * + * Earlier versions of this program estimated the amount to reduce in the first pass + * and were more efficient, but the estimates were based on the file sizes, and thus + * sometimes varied when dealing with compressed input files, and this would break the + * correspondence between read pairs, so now the estimate is always 1/10. */ + int sampleSize = 100000; int seed = 0; boolean smallOk = FALSE; void usage() /* Explain usage and exit. */ { errAbort( "fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting statistics\n" "and also producing a smaller fastq out of a sample of the data. The fastq input may be\n" "compressed with gzip or bzip2. Unfortunately the fastq input can't be in a pipe\n" "usage:\n" " fastqStatsAndSubsample in.fastq out.stats out.fastq\n" "options:\n" " -sampleSize=N - default %d\n" " -seed=N - Use given seed for random number generator. Default %d.\n" " -smallOk - Not an error if less than sampleSize reads. out.fastq will be entire in.fastq\n" , sampleSize, seed ); } /* Command line validation table. */ static struct optionSpec options[] = { {"sampleSize", OPTION_INT}, {"seed", OPTION_INT}, {"smallOk", OPTION_BOOLEAN}, {NULL, 0}, }; -/* Estimate base count from file size based on this. */ -#define ZIPPED_BYTES_PER_BASE 0.80 -#define UNZIPPED_BYTES_PER_BASE 2.5 - static boolean nextLineMustMatchChar(struct lineFile *lf, char match, boolean noEof) /* Get next line and make sure, other than whitespace, it matches 'match'. * Return FALSE on EOF, unless noEof is set, in which case abort */ { char *line; if (!lineFileNextReal(lf, &line)) { if (noEof) errAbort("Expecting %c got end of file in %s", match, lf->fileName); else return FALSE; } if (line[0] != match) errAbort("Expecting %c got %s line %d of %s", match, line, lf->lineIx, lf->fileName); return TRUE; } -static int averageReadSize(char *fileName, int maxReads) -/* Read up to maxReads from fastq file and return average # of reads. */ -{ -struct lineFile *lf = lineFileOpen(fileName, FALSE); -int i; -long total = 0; -int count = 0; - -for (i=0; ifileName); - - /* Get size and add it to stats */ - int seqSize = lineSize-1; - total += seqSize; - count += 1; - - /* Deal with next two lines '+' and quality lines. */ - nextLineMustMatchChar(lf, '+', TRUE); - lineFileNeedNext(lf, &line, &lineSize); - } -lineFileClose(&lf); -if (count < 1) - errAbort("No data in %s", fileName); -return (total + (count>>1))/count; -} - -int calcInitialReduction(char *fileName, int desiredReadCount) -/* Using file name and size figure out how much to reduce it to get ~2x the subsample we want. */ -{ -size_t initialSize = fileSize(fileName); -int readSize = averageReadSize(fileName, 100); -long long estimatedBases; -if (endsWith(fileName, ".gz") || endsWith(fileName, ".bz2")) - estimatedBases = initialSize/ZIPPED_BYTES_PER_BASE; -else - estimatedBases = initialSize/UNZIPPED_BYTES_PER_BASE; -long long estimatedReads = estimatedBases/readSize; -double estimatedReduction = (double)estimatedReads/desiredReadCount; -double conservativeReduction = estimatedReduction * 0.3; -if (conservativeReduction < 1) - conservativeReduction = 1; -return round(conservativeReduction); -} - - /* A bunch of statistics gathering variables set by oneFastqRecord below. */ #define MAX_READ_SIZE 100000 /* This is fastq, right now only get 160 base reads max. */ int maxReadBases, minReadBases, readCount; long long sumReadBases; double sumSquaredReadBases; int aCount[MAX_READ_SIZE], cCount[MAX_READ_SIZE], gCount[MAX_READ_SIZE], tCount[MAX_READ_SIZE]; int nCount[MAX_READ_SIZE]; double sumQuals[MAX_READ_SIZE], sumSquaredQuals[MAX_READ_SIZE]; int maxQual, minQual; double sumDoubleArray(double *array, int arraySize) /* Return sum of all items in array */ { double total = 0; int i; @@ -388,123 +346,111 @@ void fastqStatsAndSubsample(char *inFastq, char *outStats, char *outFastq) /* fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting * statistics, and also producing a smaller fastq out of a sample of the data. */ { /* Temporary file if any */ FILE *smallF = NULL; /* Make this work without making input. */ if (sameString("/dev/null", outFastq)) outFastq = NULL; /* Open up temp output file. This one will be for the initial scaling. We'll do * a second round of scaling as well. */ char smallFastqName[PATH_LEN] = ""; +char *smallishName = smallFastqName; if (outFastq != NULL) { /* Split up outFastq path, so we can make a temp file in the same dir. */ char outDir[PATH_LEN]; if (outFastq) splitPath(outFastq, outDir, NULL, NULL); safef(smallFastqName, PATH_LEN, "%sfastqSubsampleXXXXXX", outDir); int smallFd = mkstemp(smallFastqName); smallF = fdopen(smallFd, "w"); } /* Scan through input, collecting stats, validating, and creating a subset file. */ -int downStep = calcInitialReduction(inFastq, sampleSize); +int downStep = 10; struct lineFile *lf = lineFileOpen(inFastq, FALSE); boolean done = FALSE; int readsCopied = 0, totalReads = 0; long long basesInSample = 0; boolean firstTime = TRUE; while (!done) { int hotPosInCycle = rand()%downStep; int cycle; for (cycle=0; cycle totalReads) { if (smallOk) { - justUseSmall = TRUE; warn("%d reads total in %s, so sample is less than %d", totalReads, inFastq, sampleSize); } else { remove(smallFastqName); errAbort("SampleSize is set to %d reads, but there are only %d reads in %s", sampleSize, totalReads, inFastq); } + sampleSize = totalReads; } } char *qualType = "solexa"; int qualZero = 64; if (minQual <= 58) { qualType = "sanger"; qualZero = 33; } if (outFastq != NULL) { - if (justUseSmall) - { - mustRename(smallFastqName, outFastq); - sampleSize = readsCopied; - basesInSample= sumReadBases; - } - else - { FILE *f = mustOpen(outFastq, "w"); - basesInSample = reduceFastqSample(smallFastqName, f, readsCopied, sampleSize); + basesInSample = reduceFastqSample(smallishName, f, readsCopied, sampleSize); carefulClose(&f); remove(smallFastqName); } - } FILE *f = mustOpen(outStats, "w"); int posCount = maxReadBases; fprintf(f, "readCount %d\n", totalReads); fprintf(f, "baseCount %lld\n", sumReadBases); fprintf(f, "sampleCount %d\n", sampleSize); fprintf(f, "basesInSample %lld\n", basesInSample); fprintf(f, "readSizeMean %g\n", (double)sumReadBases/totalReads); if (minReadBases != maxReadBases) fprintf(f, "readSizeStd %g\n", calcStdFromSums(sumReadBases, sumSquaredReadBases, totalReads)); else fprintf(f, "readSizeStd 0\n"); fprintf(f, "readSizeMin %d\n", minReadBases); fprintf(f, "readSizeMax %d\n", maxReadBases); double qSum = sumDoubleArray(sumQuals, maxReadBases); @@ -530,32 +476,30 @@ fprintf(f, "cRatio %g\n", (double)cSum/sumReadBases); fprintf(f, "gRatio %g\n", (double)gSum/sumReadBases); fprintf(f, "tRatio %g\n", (double)tSum/sumReadBases); fprintf(f, "nRatio %g\n", (double)nSum/sumReadBases); /* Now deal with array outputs. First make up count of all bases we've seen. */ fprintf(f, "posCount %d\n", posCount); long long totalAtPos[posCount]; int pos; for (pos=0; pos