e89a47bff299bdd888d5b7f0a470b20aed221e01 hiram Fri Mar 13 12:57:40 2026 -0700 improved faSize inputs to allow 2bit files and added tests - code by claude diff --git src/utils/faSize/faSize.c src/utils/faSize/faSize.c index 16edaef5816..652592b0740 100644 --- src/utils/faSize/faSize.c +++ src/utils/faSize/faSize.c @@ -1,37 +1,38 @@ /* faSize - print total size and total N count of FA file. */ #include "common.h" #include "fa.h" #include "dnautil.h" +#include "dnaLoad.h" #include "options.h" /* command line options */ static struct optionSpec optionSpecs[] = { {"detailed", OPTION_BOOLEAN}, {"tab", OPTION_BOOLEAN}, {"veryDetailed", OPTION_BOOLEAN}, {NULL, 0} }; void usage() /* Print usage info and exit. */ { -errAbort("faSize - print total base count in fa files.\n" +errAbort("faSize - print total base count in fa or 2bit files.\n" "usage:\n" - " faSize file(s).fa\n" + " faSize file(s).fa|file(s).2bit\n" "Command flags\n" " -detailed outputs name and size of each record\n" " has the side effect of printing nothing else\n" " -tab output statistics in a tab separated format\n" " -veryDetailed outputs name, size, #Ns, #real, #upper, #lower of each record\n" ); } struct faInfo /* Summary info on one fa. */ { struct faInfo *next; /* Next in list. */ char *name; /* First word after >. The name of seq. */ int size; /* Size, including N's. */ int nCount; /* Number of N's. */ @@ -148,98 +149,98 @@ /* Read in DNA record. */ { char *poly; int size; if (!faMixedSpeedReadNext(lf, retDna, retSize, retName)) return FALSE; size = *retSize; poly = *retDna; faToDnaPC(poly, size); return TRUE; } void faSize(char *faFiles[], int faCount) -/* faSize - print total size and total N count of FA files. */ +/* faSize - print total size and total N count of FA or 2bit files. */ { char *fileName; int i; -struct dnaSeq seq; +struct dnaSeq *seq; +struct dnaLoad *dl; int fileCount = 0; int seqCount = 0; unsigned long long baseCount = 0; unsigned long long nCount = 0; unsigned long long uCount = 0; unsigned long long lCount = 0; -struct lineFile *lf; struct faInfo *fiList = NULL, *fi; boolean veryDetailed = optionExists("veryDetailed"); boolean detailed = optionExists("detailed"); boolean tabFmt = optionExists("tab"); -ZeroVar(&seq); - dnaUtilOpen(); for (i = 0; idna, seq->size); ++seqCount; - for (j=0; jsize; ++j) { - DNA d = seq.dna[j]; + DNA d = seq->dna[j]; if (d == 'n' || d == 'N') { ++ns; } else { if (isupper(d)) ++us; if (islower(d)) ++ls; } } - baseCount += seq.size; + baseCount += seq->size; nCount += ns; uCount += us; lCount += ls; AllocVar(fi); - fi->name = cloneString(seq.name); - fi->size = seq.size; + fi->name = cloneString(seq->name); + fi->size = seq->size; fi->nCount = ns; fi->uCount = us; fi->lCount = ls; if (veryDetailed) { - printf("%s\t%d\t%d\t%d\t%d\t%d\n", seq.name, seq.size, ns, seq.size-ns, us, ls); + printf("%s\t%d\t%d\t%d\t%d\t%d\n", seq->name, seq->size, ns, seq->size-ns, us, ls); } else if (detailed) { - printf("%s\t%d\n", seq.name, seq.size); + printf("%s\t%d\n", seq->name, seq->size); } slAddHead(&fiList, fi); + dnaSeqFree(&seq); } - lineFileClose(&lf); + dnaLoadClose(&dl); } if (!(detailed || veryDetailed)) { double perCentMasked = 100.0; double perCentRealMasked = 100.0; if (baseCount > 0) perCentMasked = 100.0*(double)lCount/(double)baseCount; if ((baseCount - nCount) > 0) perCentRealMasked = 100.0*(double)lCount/(double)(baseCount - nCount); if (tabFmt) { printf("baseCount\t%llu\n", baseCount); printf("nBaseCount\t%llu\n", nCount); printf("realBaseCount\t%llu\n", baseCount - nCount);