1dc0d205786891dc35074276f3a64ca436b09221
angie
  Mon Jan 11 16:36:27 2021 -0800
As for -detailed, don't print summary at end for -veryDetailed output so it's more pipeline-friendly.

diff --git src/utils/faSize/faSize.c src/utils/faSize/faSize.c
index 632ff9d..16edaef 100644
--- src/utils/faSize/faSize.c
+++ src/utils/faSize/faSize.c
@@ -1,273 +1,273 @@
 /* faSize - print total size and total N count of FA file. */
 #include "common.h"
 #include "fa.h"
 #include "dnautil.h"
 #include "options.h"
 
 
 /* command line options */
 static struct optionSpec optionSpecs[] =
 {
     {"detailed", OPTION_BOOLEAN},
     {"tab", OPTION_BOOLEAN},
     {"veryDetailed", OPTION_BOOLEAN},
     {NULL, 0}
 };
 
 void usage()
 /* Print usage info and exit. */
 {
 errAbort("faSize - print total base count in fa files.\n"
 	 "usage:\n"
 	 "   faSize file(s).fa\n"
 	 "Command flags\n"
 	 "   -detailed        outputs name and size of each record\n"
          "                    has the side effect of printing nothing else\n"
          "   -tab             output statistics in a tab separated format\n"
          "   -veryDetailed    outputs name, size, #Ns, #real, #upper, #lower of each record\n"
          );
 }
 
 struct faInfo
 /* Summary info on one fa. */
    {
    struct faInfo *next;	/* Next in list. */
    char *name;		/* First word after >.  The name of seq. */
    int size;            /* Size, including N's. */
    int nCount;          /* Number of N's. */
    int lCount;          /* Number of Upper-case chars. */
    int uCount;          /* Number of Lower-case chars. */
    };
 
 int cmpFaInfo(const void *va, const void *vb)
 /* Compare two faInfo. */
 {
 const struct faInfo *a = *((struct faInfo **)va);
 const struct faInfo *b = *((struct faInfo **)vb);
 return a->size - b->size;
 }
 
 
 void printStats(struct faInfo **pFiList, boolean tabFmt)
 /* Print mean, standard deviation, min, max, and median. 
  * As a side effect this sorts list by size.*/
 {
 struct faInfo *fi, *minFi, *maxFi;
 int elCount = slCount(*pFiList);
 double total = 0, nTotal = 0, uTotal = 0, lTotal = 0;
 double mean, nMean, uMean, lMean;
 double sd, nSd, uSd, lSd;
 double variance = 0, nVariance = 0, uVariance = 0, lVariance = 0;
 double x;
 int median;
 
 if (elCount <= 1)
     return;
 for (fi = *pFiList; fi != NULL; fi = fi->next)
     {
     total += fi->size;
     nTotal += fi->nCount;
     uTotal += fi->uCount;
     lTotal += fi->lCount;
     }
 mean = total/elCount;
 nMean = nTotal/elCount;
 uMean = uTotal/elCount;
 lMean = lTotal/elCount;
 
 for (fi = *pFiList; fi != NULL; fi = fi->next)
     {
     x = (fi->size - mean);
     variance += x*x;
     x = (fi->nCount - nMean);
     nVariance  += x*x;
     x = (fi->uCount - uMean);
     uVariance  += x*x;
     x = (fi->lCount - lMean);
     lVariance  += x*x;
     }
 if (elCount > 1)
     {
     variance /= (elCount-1);
     nVariance /= (elCount-1);
     uVariance /= (elCount-1);
     lVariance /= (elCount-1);
     }
 sd = sqrt(variance);
 nSd = sqrt(nVariance);
 uSd = sqrt(uVariance);
 lSd = sqrt(lVariance);
 slSort(pFiList, cmpFaInfo);
 fi = slElementFromIx(*pFiList, elCount/2);
 median = fi->size;
 minFi = *pFiList;
 maxFi = slLastEl(*pFiList);
 
 if (tabFmt)
     {
     printf("meanSize\t%2.1f\n" ,mean);
     printf("SdSize\t%2.1f\n", sd);
     printf("minSize\t%d\n", minFi->size);
     printf("minSeqSize\t%s\n", minFi->name);
     printf("maxSize\t%d\n", maxFi->size);
     printf("maxSeqSize\t%s\n", maxFi->name);
     printf("medianSize\t%d\n", median);
     printf("nCountMean\t%2.1f\n", nMean);
     printf("nCountSd\t%2.1f\n", nSd);
     printf("upperCountMean\t%2.1f\n", uMean);
     printf("upperCountSd\t%2.1f\n", uSd);
     printf("lowerCountMean\t%2.1f\n", lMean);
     printf("lowerCountSd\t%2.1f\n", lSd);
     }
 else
     {
     printf("Total size: mean %2.1f sd %2.1f min %d (%s) max %d (%s) median %d\n",
 	mean, sd, minFi->size, minFi->name, maxFi->size, maxFi->name, median);
     printf("N count: mean %2.1f sd %2.1f\n", nMean, nSd);
     printf("U count: mean %2.1f sd %2.1f\n", uMean, uSd);
     printf("L count: mean %2.1f sd %2.1f\n", lMean, lSd);
     }
 }
 
 void faToDnaPC(char *poly, int size)
 /* Turn any strange characters to 'n'.  Does not change size of sequence.
    Preserve case.
 */
 {
 int i;
 dnaUtilOpen();
 for (i=0; i<size; ++i)
     {
     if (ntChars[(int)poly[i]] == 0)
     	poly[i] = 'n';
     }
 }
 
 
 boolean faSpeedReadNextPC(struct lineFile *lf, DNA **retDna, int *retSize, char **retName)
 /* Read in DNA record. */
 {
 char *poly;
 int size;
 
 if (!faMixedSpeedReadNext(lf, retDna, retSize, retName))
     return FALSE;
 size = *retSize;
 poly = *retDna;
 faToDnaPC(poly, size);
 return TRUE;
 }
 
 
 void faSize(char *faFiles[], int faCount)
 /* faSize - print total size and total N count of FA files. */
 {
 char *fileName;
 int i;
 struct dnaSeq seq;
 int fileCount = 0;
 int seqCount = 0;
 unsigned long long baseCount = 0;
 unsigned long long nCount = 0;
 unsigned long long uCount = 0;
 unsigned long long lCount = 0;
 struct lineFile *lf;
 struct faInfo *fiList = NULL, *fi;
 boolean veryDetailed = optionExists("veryDetailed");
 boolean detailed = optionExists("detailed");
 boolean tabFmt = optionExists("tab");
 
 ZeroVar(&seq);
 
 dnaUtilOpen();
 for (i = 0; i<faCount; ++i)
     {
     fileName = faFiles[i];
     lf = lineFileOpen(fileName, FALSE);
     ++fileCount;
     while (faSpeedReadNextPC(lf, &seq.dna, &seq.size, &seq.name))
 	{
 	int j;
 	int ns = 0;
 	int us = 0;
 	int ls = 0;
 	++seqCount;
 	for (j=0; j<seq.size; ++j)
 	    {
 	    DNA d = seq.dna[j];
 	    if (d == 'n' || d == 'N')
 		{
 		++ns;
 		}
 	    else
 		{
 		if (isupper(d)) 
 		    ++us;
 		if (islower(d)) 
 		    ++ls;
 		}
 	    }
 	baseCount += seq.size;
 	nCount += ns;
 	uCount += us;
 	lCount += ls;
 	AllocVar(fi);
 	fi->name = cloneString(seq.name);
 	fi->size = seq.size;
 	fi->nCount = ns;
 	fi->uCount = us;
 	fi->lCount = ls;
         if (veryDetailed)
             {
 	    printf("%s\t%d\t%d\t%d\t%d\t%d\n", seq.name, seq.size, ns, seq.size-ns, us, ls);
             }
 	else if (detailed)
 	    {
 	    printf("%s\t%d\n", seq.name, seq.size);
 	    }
 	slAddHead(&fiList, fi);
 	}
     lineFileClose(&lf);
     }
-if (!detailed)
+if (!(detailed || veryDetailed))
     {
     double perCentMasked = 100.0;
     double perCentRealMasked = 100.0;
     if (baseCount > 0)
 	perCentMasked = 100.0*(double)lCount/(double)baseCount;
     if ((baseCount - nCount) > 0)
 	perCentRealMasked = 100.0*(double)lCount/(double)(baseCount - nCount);
 
     if (tabFmt)
         {
         printf("baseCount\t%llu\n", baseCount);
         printf("nBaseCount\t%llu\n", nCount);
         printf("realBaseCount\t%llu\n", baseCount - nCount);
         printf("upperBaseCount\t%llu\n", uCount);
         printf("lowerBaseCount\t%llu\n", lCount);
         printf("seqCount\t%d\n", seqCount);
         printf("fileCount\t%d\n", fileCount);
         printStats(&fiList, tabFmt);
         printf("fracMasked\t%.2f\n",perCentMasked/100.0);
         printf("fracRealMasked\t%.2f\n",perCentRealMasked/100.0);
         }
     else
         {
         printf("%llu bases (%llu N's %llu real %llu upper %llu lower) in %d sequences in %d files\n",
                baseCount, nCount, baseCount - nCount, uCount, lCount, seqCount, fileCount);
         printStats(&fiList, tabFmt);
         printf("%%%.2f masked total, %%%.2f masked real\n",
                perCentMasked, perCentRealMasked);
         }
     }
 }
 
 int main(int argc, char *argv[])
 /* Process command line . */
 {
 optionInit(&argc, argv, optionSpecs);
 if (argc < 2)
     usage();
 faSize(argv+1, argc-1);
 return 0;
 }