1f4756ac08db3d4371556857dcf00ae2f1d0a8f4 kent Sat Jul 27 12:20:56 2013 -0700 Adding smallOk option. diff --git src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c index f992ce2..1f3e7a7 100644 --- src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c +++ src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c @@ -1,49 +1,52 @@ /* fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting * statistics, and also producing a smaller fastq out of a sample of the data. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "portable.h" #include "obscure.h" #include "hmmstats.h" int sampleSize = 100000; int seed = 0; +boolean smallOk = FALSE; void usage() /* Explain usage and exit. */ { errAbort( "fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting statistics\n" "and also producing a smaller fastq out of a sample of the data. The fastq input may be\n" "compressed with gzip or bzip2. Unfortunately the fastq input can't be in a pipe\n" "usage:\n" " fastqStatsAndSubsample in.fastq out.stats out.fastq\n" "options:\n" " -sampleSize=N - default %d\n" - " -seed=N - use given seed for random number generator. Default %d\n" + " -seed=N - use given seed for random number generator. Default %d.\n" + " -smallOk - Not an error if less than sampleSize reads. out.fastq will be entire in.fastq\n" , sampleSize, seed ); } /* Command line validation table. */ static struct optionSpec options[] = { {"sampleSize", OPTION_INT}, {"seed", OPTION_INT}, + {"smallOk", OPTION_BOOLEAN}, {NULL, 0}, }; /* Estimate base count from file size based on this. */ #define ZIPPED_BYTES_PER_BASE 0.80 #define UNZIPPED_BYTES_PER_BASE 2.5 #define READ_SIZE 100 static boolean nextLineMustMatchChar(struct lineFile *lf, char match, boolean noEof) /* Get next line and make sure, other than whitespace, it matches 'match'. * Return FALSE on EOF, unless noEof is set, in which case abort */ { char *line; if (!lineFileNext(lf, &line, NULL)) { @@ -189,30 +192,31 @@ switch (c) { case 'a': aCount[i] += 1; break; case 'c': cCount[i] += 1; break; case 'g': gCount[i] += 1; break; case 't': tCount[i] += 1; break; case 'n': + case '.': nCount[i] += 1; break; default: errAbort("Unrecognized nucleotide character %c line %d of %s", c, lf->lineIx, lf->fileName); break; } } if (copy) mustWrite(f, line, lineSize); /* Deal with line containing just '+' that separates sequence from quality. */ nextLineMustMatchChar(lf, '+', TRUE); if (copy) @@ -350,70 +354,91 @@ boolean hotPos = (cycle == hotPosInCycle); if (!oneFastqRecord(lf, smallF, hotPos, firstTime)) { done = TRUE; break; } if (hotPos) ++readsCopied; firstTime = FALSE; ++totalReads; } } lineFileClose(&lf); carefulClose(&smallF); +boolean justUseSmall = FALSE; if (readsCopied < sampleSize) { - remove(smallFastqName); /* Our sample isn't big enough. This could have two causes - a bug in * our estimation, maybe caused by read sizes growing a bunch in the time * since this code was written - or a file that is actually smaller smaller * than sampleSize. */ - if (sampleSize > totalReads) + if (sampleSize <= totalReads) + { + remove(smallFastqName); + errAbort("Internal error: bad estimate %d for downStep on %s", downStep, inFastq); + } + else + { + if (smallOk) + { + justUseSmall = TRUE; + warn("%d reads total in %s, so sample is less than %d", + totalReads, inFastq, sampleSize); + } + else + { + remove(smallFastqName); errAbort("SampleSize is set to %d reads, but there are only %d reads in %s", sampleSize, totalReads, inFastq); - else - errAbort("Internal error: bad estimate %d for downStep on %s", downStep, inFastq); + } + } } char *qualType = "solexa"; int qualZero = 64; if (minQual <= 58) { qualType = "sanger"; qualZero = 33; } FILE *f = mustOpen(outStats, "w"); fprintf(f, "readCount %d\n", totalReads); fprintf(f, "baseCount %g\n", sumReadBases); fprintf(f, "readSizeMean %g\n", sumReadBases/totalReads); fprintf(f, "readSizeStd %g\n", calcStdFromSums(sumReadBases, sumSquaredReadBases, totalReads)); fprintf(f, "readSizeMin %d\n", minReadBases); fprintf(f, "readSizeMax %d\n", maxReadBases); double qSum = sumDoubleArray(sumQuals, maxReadBases); double qSumSquared = sumDoubleArray(sumSquaredQuals, maxReadBases); fprintf(f, "qualMean %g\n", qSum/sumReadBases - qualZero); fprintf(f, "qualStd %g\n", calcStdFromSums(qSum, qSumSquared, sumReadBases)); fprintf(f, "qualMin %d\n", minQual - qualZero); fprintf(f, "qualMax %d\n", maxQual - qualZero); fprintf(f, "qualType %s\n", qualType); fprintf(f, "qualZero %d\n", qualZero); carefulClose(&f); +if (justUseSmall) + mustRename(smallFastqName, outFastq); +else + { f = mustOpen(outFastq, "w"); reduceFastqSample(smallFastqName, f, readsCopied, sampleSize); carefulClose(&f); remove(smallFastqName); } +} int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 4) usage(); sampleSize = optionInt("sampleSize", sampleSize); seed = optionInt("seed", seed); +smallOk = optionExists("smallOk"); fastqStatsAndSubsample(argv[1], argv[2], argv[3]); return 0; }