be4311c07e14feb728abc6425ee606ffaa611a58 markd Fri Jan 22 06:46:58 2021 -0800 merge with master diff --git src/utils/faSize/faSize.c src/utils/faSize/faSize.c index e4e4dbf..16edaef 100644 --- src/utils/faSize/faSize.c +++ src/utils/faSize/faSize.c @@ -1,40 +1,43 @@ /* faSize - print total size and total N count of FA file. */ #include "common.h" #include "fa.h" #include "dnautil.h" #include "options.h" /* command line options */ static struct optionSpec optionSpecs[] = { {"detailed", OPTION_BOOLEAN}, {"tab", OPTION_BOOLEAN}, + {"veryDetailed", OPTION_BOOLEAN}, {NULL, 0} }; void usage() /* Print usage info and exit. */ { errAbort("faSize - print total base count in fa files.\n" "usage:\n" " faSize file(s).fa\n" "Command flags\n" " -detailed outputs name and size of each record\n" " has the side effect of printing nothing else\n" - " -tab output statistics in a tab separated format\n"); + " -tab output statistics in a tab separated format\n" + " -veryDetailed outputs name, size, #Ns, #real, #upper, #lower of each record\n" + ); } struct faInfo /* Summary info on one fa. */ { struct faInfo *next; /* Next in list. */ char *name; /* First word after >. The name of seq. */ int size; /* Size, including N's. */ int nCount; /* Number of N's. */ int lCount; /* Number of Lower-case chars. */ int uCount; /* Number of Upper-case chars. */ }; int cmpFaInfo(const void *va, const void *vb) /* Compare two faInfo. */ @@ -158,30 +161,31 @@ void faSize(char *faFiles[], int faCount) /* faSize - print total size and total N count of FA files. 
*/ { char *fileName; int i; struct dnaSeq seq; int fileCount = 0; int seqCount = 0; unsigned long long baseCount = 0; unsigned long long nCount = 0; unsigned long long uCount = 0; unsigned long long lCount = 0; struct lineFile *lf; struct faInfo *fiList = NULL, *fi; +boolean veryDetailed = optionExists("veryDetailed"); boolean detailed = optionExists("detailed"); boolean tabFmt = optionExists("tab"); ZeroVar(&seq); dnaUtilOpen(); for (i = 0; i<faCount; ++i) { fileName = faFiles[i]; lf = lineFileOpen(fileName, FALSE); ++fileCount; while (faSpeedReadNextPC(lf, &seq.dna, &seq.size, &seq.name)) { int j; int ns = 0; @@ -201,39 +205,43 @@ ++us; if (islower(d)) ++ls; } } baseCount += seq.size; nCount += ns; uCount += us; lCount += ls; AllocVar(fi); fi->name = cloneString(seq.name); fi->size = seq.size; fi->nCount = ns; fi->uCount = us; fi->lCount = ls; - if (detailed) + if (veryDetailed) + { + printf("%s\t%d\t%d\t%d\t%d\t%d\n", seq.name, seq.size, ns, seq.size-ns, us, ls); + } + else if (detailed) { printf("%s\t%d\n", seq.name, seq.size); } slAddHead(&fiList, fi); } lineFileClose(&lf); } -if (!detailed) +if (!(detailed || veryDetailed)) { double perCentMasked = 100.0; double perCentRealMasked = 100.0; if (baseCount > 0) perCentMasked = 100.0*(double)lCount/(double)baseCount; if ((baseCount - nCount) > 0) perCentRealMasked = 100.0*(double)lCount/(double)(baseCount - nCount); if (tabFmt) { printf("baseCount\t%llu\n", baseCount); printf("nBaseCount\t%llu\n", nCount); printf("realBaseCount\t%llu\n", baseCount - nCount); printf("upperBaseCount\t%llu\n", uCount); printf("lowerBaseCount\t%llu\n", lCount);