b1bf46da53916a9741f163ea439ac459b3957b0a hiram Mon Mar 16 14:39:59 2026 -0700 enhanced command to allow 2bit inputs - code by claude assistance - no redmine diff --git src/utils/faCount/faCount.c src/utils/faCount/faCount.c index 49c8127135f..990138d2652 100644 --- src/utils/faCount/faCount.c +++ src/utils/faCount/faCount.c @@ -1,89 +1,89 @@ /* faCount - count base statistics and CpGs in FA files */ #include "common.h" #include "fa.h" #include "dnautil.h" +#include "dnaLoad.h" #include "options.h" static struct optionSpec optionSpecs[] = { {"summary", OPTION_BOOLEAN}, {"dinuc", OPTION_BOOLEAN}, {"strands", OPTION_BOOLEAN}, {NULL, 0}, }; bool summary = FALSE; bool dinuc = FALSE; bool strands = FALSE; void usage() /* Print usage info and exit. */ { -errAbort("faCount - count base statistics and CpGs in FA files.\n" +errAbort("faCount - count base statistics and CpGs in FA or 2bit files.\n" "usage:\n" - " faCount file(s).fa\n" + " faCount file(s).fa|file(s).2bit\n" " -summary show only summary statistics\n" " -dinuc include statistics on dinucletoide frequencies\n" " -strands count bases on both strands\n" ); } void faCount(char *faFiles[], int faCount) /* faCount - count bases. */ { int f, i, j, k; -struct dnaSeq seq; +struct dnaSeq *seq; +struct dnaLoad *dl; unsigned long long totalLength = 0; unsigned long long totalBaseCount[5]; unsigned long long totalDinucleotideCount[5][5]; unsigned long long totalCpgCount = 0; -struct lineFile *lf; -ZeroVar(&seq); for (i = 0; i < ArraySize(totalBaseCount); i++) totalBaseCount[i] = 0; for (i = 0; i < ArraySize(totalDinucleotideCount); i++) for (j = 0; j < ArraySize(totalDinucleotideCount[i]); j++) totalDinucleotideCount[i][j] = 0; printf("#seq\tlen\tA\tC\tG\tT\tN\tcpg"); if (dinuc) printf("\tAA\tAC\tAG\tAT\tCA\tCC\tCG\tCT\tGA\tGC\tGG\tGT\tTA\tTC\tTG\tTT"); printf("\n"); dnaUtilOpen(); for (f = 0; fsize; ++j) { - int baseVal = ntVal5[(int)(seq.dna[j])]; + int baseVal = ntVal5[(int)(seq->dna[j])]; int rcBaseVal; assert(baseVal != -1); assert(baseVal <= 4); length++; switch(baseVal) { case A_BASE_VAL: rcBaseVal = T_BASE_VAL; break; case C_BASE_VAL: rcBaseVal = G_BASE_VAL; break; case G_BASE_VAL: rcBaseVal = C_BASE_VAL; break; case T_BASE_VAL: rcBaseVal = A_BASE_VAL; break; default: rcBaseVal = N_BASE_VAL; break; } baseCount[baseVal]++; if ((prevBase == C_BASE_VAL) && (baseVal == G_BASE_VAL)) cpgCount++; @@ -92,55 +92,56 @@ if (strands) { length++; baseCount[rcBaseVal]++; if ((prevRcBase == G_BASE_VAL) && (rcBaseVal == C_BASE_VAL)) cpgCount++; if (prevRcBase != -1) dinucleotideCount[rcBaseVal][prevRcBase]++; } prevBase = baseVal; prevRcBase = rcBaseVal; } if (!summary) { printf("%s\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu", - seq.name, length, + seq->name, length, baseCount[A_BASE_VAL], baseCount[C_BASE_VAL], baseCount[G_BASE_VAL], baseCount[T_BASE_VAL], baseCount[N_BASE_VAL], cpgCount); if (dinuc) printf("\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu", dinucleotideCount[A_BASE_VAL][A_BASE_VAL], dinucleotideCount[A_BASE_VAL][C_BASE_VAL], dinucleotideCount[A_BASE_VAL][G_BASE_VAL], dinucleotideCount[A_BASE_VAL][T_BASE_VAL], dinucleotideCount[C_BASE_VAL][A_BASE_VAL], dinucleotideCount[C_BASE_VAL][C_BASE_VAL], dinucleotideCount[C_BASE_VAL][G_BASE_VAL], dinucleotideCount[C_BASE_VAL][T_BASE_VAL], dinucleotideCount[G_BASE_VAL][A_BASE_VAL], dinucleotideCount[G_BASE_VAL][C_BASE_VAL], dinucleotideCount[G_BASE_VAL][G_BASE_VAL], dinucleotideCount[G_BASE_VAL][T_BASE_VAL], dinucleotideCount[T_BASE_VAL][A_BASE_VAL], dinucleotideCount[T_BASE_VAL][C_BASE_VAL], dinucleotideCount[T_BASE_VAL][G_BASE_VAL], dinucleotideCount[T_BASE_VAL][T_BASE_VAL]); printf("\n"); } totalLength += length; totalCpgCount += cpgCount; for (i = 0; i < ArraySize(baseCount); i++) totalBaseCount[i] += baseCount[i]; for (i = 0; i < ArraySize(dinucleotideCount); i++) for (k = 0; k < ArraySize(dinucleotideCount[i]); k++) totalDinucleotideCount[i][k] += dinucleotideCount[i][k]; + dnaSeqFree(&seq); } - lineFileClose(&lf); + dnaLoadClose(&dl); } printf("total\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu", totalLength, totalBaseCount[A_BASE_VAL], totalBaseCount[C_BASE_VAL], totalBaseCount[G_BASE_VAL], totalBaseCount[T_BASE_VAL], totalBaseCount[N_BASE_VAL], totalCpgCount); if (dinuc) printf("\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu", totalDinucleotideCount[A_BASE_VAL][A_BASE_VAL], totalDinucleotideCount[A_BASE_VAL][C_BASE_VAL], totalDinucleotideCount[A_BASE_VAL][G_BASE_VAL], totalDinucleotideCount[A_BASE_VAL][T_BASE_VAL], totalDinucleotideCount[C_BASE_VAL][A_BASE_VAL], totalDinucleotideCount[C_BASE_VAL][C_BASE_VAL], totalDinucleotideCount[C_BASE_VAL][G_BASE_VAL], totalDinucleotideCount[C_BASE_VAL][T_BASE_VAL], totalDinucleotideCount[G_BASE_VAL][A_BASE_VAL], totalDinucleotideCount[G_BASE_VAL][C_BASE_VAL],