be4311c07e14feb728abc6425ee606ffaa611a58 markd Fri Jan 22 06:46:58 2021 -0800 merge with master diff --git src/utils/faSize/faSize.c src/utils/faSize/faSize.c index e4e4dbf..16edaef 100644 --- src/utils/faSize/faSize.c +++ src/utils/faSize/faSize.c @@ -1,265 +1,273 @@ /* faSize - print total size and total N count of FA file. */ #include "common.h" #include "fa.h" #include "dnautil.h" #include "options.h" /* command line options */ static struct optionSpec optionSpecs[] = { {"detailed", OPTION_BOOLEAN}, {"tab", OPTION_BOOLEAN}, + {"veryDetailed", OPTION_BOOLEAN}, {NULL, 0} }; void usage() /* Print usage info and exit. */ { errAbort("faSize - print total base count in fa files.\n" "usage:\n" " faSize file(s).fa\n" "Command flags\n" " -detailed outputs name and size of each record\n" " has the side effect of printing nothing else\n" - " -tab output statistics in a tab separated format\n"); + " -tab output statistics in a tab separated format\n" + " -veryDetailed outputs name, size, #Ns, #real, #upper, #lower of each record\n" + ); } struct faInfo /* Summary info on one fa. */ { struct faInfo *next; /* Next in list. */ char *name; /* First word after >. The name of seq. */ int size; /* Size, including N's. */ int nCount; /* Number of N's. */ int lCount; /* Number of Upper-case chars. */ int uCount; /* Number of Lower-case chars. */ }; int cmpFaInfo(const void *va, const void *vb) /* Compare two faInfo. */ { const struct faInfo *a = *((struct faInfo **)va); const struct faInfo *b = *((struct faInfo **)vb); return a->size - b->size; } void printStats(struct faInfo **pFiList, boolean tabFmt) /* Print mean, standard deviation, min, max, and median. * As a side effect this sorts list by size.*/ { struct faInfo *fi, *minFi, *maxFi; int elCount = slCount(*pFiList); double total = 0, nTotal = 0, uTotal = 0, lTotal = 0; double mean, nMean, uMean, lMean; double sd, nSd, uSd, lSd; double variance = 0, nVariance = 0, uVariance = 0, lVariance = 0; double x; int median; if (elCount <= 1) return; for (fi = *pFiList; fi != NULL; fi = fi->next) { total += fi->size; nTotal += fi->nCount; uTotal += fi->uCount; lTotal += fi->lCount; } mean = total/elCount; nMean = nTotal/elCount; uMean = uTotal/elCount; lMean = lTotal/elCount; for (fi = *pFiList; fi != NULL; fi = fi->next) { x = (fi->size - mean); variance += x*x; x = (fi->nCount - nMean); nVariance += x*x; x = (fi->uCount - uMean); uVariance += x*x; x = (fi->lCount - lMean); lVariance += x*x; } if (elCount > 1) { variance /= (elCount-1); nVariance /= (elCount-1); uVariance /= (elCount-1); lVariance /= (elCount-1); } sd = sqrt(variance); nSd = sqrt(nVariance); uSd = sqrt(uVariance); lSd = sqrt(lVariance); slSort(pFiList, cmpFaInfo); fi = slElementFromIx(*pFiList, elCount/2); median = fi->size; minFi = *pFiList; maxFi = slLastEl(*pFiList); if (tabFmt) { printf("meanSize\t%2.1f\n" ,mean); printf("SdSize\t%2.1f\n", sd); printf("minSize\t%d\n", minFi->size); printf("minSeqSize\t%s\n", minFi->name); printf("maxSize\t%d\n", maxFi->size); printf("maxSeqSize\t%s\n", maxFi->name); printf("medianSize\t%d\n", median); printf("nCountMean\t%2.1f\n", nMean); printf("nCountSd\t%2.1f\n", nSd); printf("upperCountMean\t%2.1f\n", uMean); printf("upperCountSd\t%2.1f\n", uSd); printf("lowerCountMean\t%2.1f\n", lMean); printf("lowerCountSd\t%2.1f\n", lSd); } else { printf("Total size: mean %2.1f sd %2.1f min %d (%s) max %d (%s) median %d\n", mean, sd, minFi->size, minFi->name, maxFi->size, maxFi->name, median); printf("N count: mean %2.1f sd %2.1f\n", nMean, nSd); printf("U count: mean %2.1f sd %2.1f\n", uMean, uSd); printf("L count: mean %2.1f sd %2.1f\n", lMean, lSd); } } void faToDnaPC(char *poly, int size) /* Turn any strange characters to 'n'. Does not change size of sequence. Preserve case. */ { int i; dnaUtilOpen(); for (i=0; iname = cloneString(seq.name); fi->size = seq.size; fi->nCount = ns; fi->uCount = us; fi->lCount = ls; - if (detailed) + if (veryDetailed) + { + printf("%s\t%d\t%d\t%d\t%d\t%d\n", seq.name, seq.size, ns, seq.size-ns, us, ls); + } + else if (detailed) { printf("%s\t%d\n", seq.name, seq.size); } slAddHead(&fiList, fi); } lineFileClose(&lf); } -if (!detailed) +if (!(detailed || veryDetailed)) { double perCentMasked = 100.0; double perCentRealMasked = 100.0; if (baseCount > 0) perCentMasked = 100.0*(double)lCount/(double)baseCount; if ((baseCount - nCount) > 0) perCentRealMasked = 100.0*(double)lCount/(double)(baseCount - nCount); if (tabFmt) { printf("baseCount\t%llu\n", baseCount); printf("nBaseCount\t%llu\n", nCount); printf("realBaseCount\t%llu\n", baseCount - nCount); printf("upperBaseCount\t%llu\n", uCount); printf("lowerBaseCount\t%llu\n", lCount); printf("seqCount\t%d\n", seqCount); printf("fileCount\t%d\n", fileCount); printStats(&fiList, tabFmt); printf("fracMasked\t%.2f\n",perCentMasked/100.0); printf("fracRealMasked\t%.2f\n",perCentRealMasked/100.0); } else { printf("%llu bases (%llu N's %llu real %llu upper %llu lower) in %d sequences in %d files\n", baseCount, nCount, baseCount - nCount, uCount, lCount, seqCount, fileCount); printStats(&fiList, tabFmt); printf("%%%.2f masked total, %%%.2f masked real\n", perCentMasked, perCentRealMasked); } } } int main(int argc, char *argv[]) /* Process command line . */ { optionInit(&argc, argv, optionSpecs); if (argc < 2) usage(); faSize(argv+1, argc-1); return 0; }