src/utils/faCount/faCount.c 1.8
1.8 2009/08/02 19:51:20 markd
Add options for counting di-nucleotide frequencies and looking at both strands of the input sequences. (from stanford)
Index: src/utils/faCount/faCount.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/utils/faCount/faCount.c,v
retrieving revision 1.7
retrieving revision 1.8
diff -b -B -U 4 -r1.7 -r1.8
--- src/utils/faCount/faCount.c 18 Mar 2006 01:51:36 -0000 1.7
+++ src/utils/faCount/faCount.c 2 Aug 2009 19:51:20 -0000 1.8
@@ -5,90 +5,181 @@
#include "options.h"
static char const rcsid[] = "$Id$";
+static struct optionSpec optionSpecs[] =
+{
+ {"summary", OPTION_BOOLEAN},
+ {"dinuc", OPTION_BOOLEAN},
+ {"strands", OPTION_BOOLEAN},
+ {NULL, 0},
+};
+
bool summary = FALSE;
+bool dinuc = FALSE;
+bool strands = FALSE;
void usage()
/* Print usage info and exit. */
{
errAbort("faCount - count base statistics and CpGs in FA files.\n"
"usage:\n"
" faCount file(s).fa\n"
" -summary show only summary statistics\n"
+ " -dinuc include statistics on dinucletoide frequencies\n"
+ " -strands count bases on both strands\n"
);
}
void faCount(char *faFiles[], int faCount)
/* faCount - count bases. */
{
-int f, i, j;
+int f, i, j, k;
struct dnaSeq seq;
unsigned long long totalLength = 0;
unsigned long long totalBaseCount[5];
+unsigned long long totalDinucleotideCount[5][5];
unsigned long long totalCpgCount = 0;
struct lineFile *lf;
ZeroVar(&seq);
for (i = 0; i < ArraySize(totalBaseCount); i++)
totalBaseCount[i] = 0;
-printf("#seq\tlen\tA\tC\tG\tT\tN\tcpg\n");
+for (i = 0; i < ArraySize(totalDinucleotideCount); i++)
+ for (j = 0; j < ArraySize(totalDinucleotideCount[i]); j++)
+ totalDinucleotideCount[i][j] = 0;
+
+printf("#seq\tlen\tA\tC\tG\tT\tN\tcpg");
+if (dinuc)
+ printf("\tAA\tAC\tAG\tAT\tCA\tCC\tCG\tCT\tGA\tGC\tGG\tGT\tTA\tTC\tTG\tTT");
+printf("\n");
dnaUtilOpen();
for (f = 0; f<faCount; ++f)
{
lf = lineFileOpen(faFiles[f], FALSE);
while (faSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name))
{
int prevBase = -1;
+ int prevRcBase = -1;
unsigned long long length = 0;
unsigned long long baseCount[5];
+ unsigned long long dinucleotideCount[5][5];
unsigned long long cpgCount = 0;
for (i = 0; i < ArraySize(baseCount); i++)
baseCount[i] = 0;
+ for (i = 0; i < ArraySize(dinucleotideCount); i++)
+ for (j = 0; j < ArraySize(dinucleotideCount[i]); j++)
+ dinucleotideCount[i][j] = 0;
for (j=0; j<seq.size; ++j)
{
int baseVal = ntVal5[(int)(seq.dna[j])];
+ int rcBaseVal;
assert(baseVal != -1);
assert(baseVal <= 4);
length++;
+ switch(baseVal)
+ {
+ case A_BASE_VAL: rcBaseVal = T_BASE_VAL; break;
+ case C_BASE_VAL: rcBaseVal = G_BASE_VAL; break;
+ case G_BASE_VAL: rcBaseVal = C_BASE_VAL; break;
+ case T_BASE_VAL: rcBaseVal = A_BASE_VAL; break;
+ default: rcBaseVal = N_BASE_VAL; break;
+ }
baseCount[baseVal]++;
if ((prevBase == C_BASE_VAL) && (baseVal == G_BASE_VAL))
cpgCount++;
+ if (prevBase != -1)
+ dinucleotideCount[prevBase][baseVal]++;
+ if (strands)
+ {
+ length++;
+ baseCount[rcBaseVal]++;
+ if ((prevRcBase == G_BASE_VAL) && (rcBaseVal == C_BASE_VAL))
+ cpgCount++;
+ if (prevRcBase != -1)
+ dinucleotideCount[rcBaseVal][prevRcBase]++;
+ }
prevBase = baseVal;
+ prevRcBase = rcBaseVal;
}
if (!summary)
- printf("%s\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\n",
+ {
+ printf("%s\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu",
seq.name, length,
baseCount[A_BASE_VAL], baseCount[C_BASE_VAL],
baseCount[G_BASE_VAL], baseCount[T_BASE_VAL],
baseCount[N_BASE_VAL], cpgCount);
+ if (dinuc)
+ printf("\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu",
+ dinucleotideCount[A_BASE_VAL][A_BASE_VAL], dinucleotideCount[A_BASE_VAL][C_BASE_VAL],
+ dinucleotideCount[A_BASE_VAL][G_BASE_VAL], dinucleotideCount[A_BASE_VAL][T_BASE_VAL],
+ dinucleotideCount[C_BASE_VAL][A_BASE_VAL], dinucleotideCount[C_BASE_VAL][C_BASE_VAL],
+ dinucleotideCount[C_BASE_VAL][G_BASE_VAL], dinucleotideCount[C_BASE_VAL][T_BASE_VAL],
+ dinucleotideCount[G_BASE_VAL][A_BASE_VAL], dinucleotideCount[G_BASE_VAL][C_BASE_VAL],
+ dinucleotideCount[G_BASE_VAL][G_BASE_VAL], dinucleotideCount[G_BASE_VAL][T_BASE_VAL],
+ dinucleotideCount[T_BASE_VAL][A_BASE_VAL], dinucleotideCount[T_BASE_VAL][C_BASE_VAL],
+ dinucleotideCount[T_BASE_VAL][G_BASE_VAL], dinucleotideCount[T_BASE_VAL][T_BASE_VAL]);
+ printf("\n");
+ }
totalLength += length;
totalCpgCount += cpgCount;
for (i = 0; i < ArraySize(baseCount); i++)
totalBaseCount[i] += baseCount[i];
+ for (i = 0; i < ArraySize(dinucleotideCount); i++)
+ for (k = 0; k < ArraySize(dinucleotideCount[i]); k++)
+ totalDinucleotideCount[i][k] += dinucleotideCount[i][k];
}
lineFileClose(&lf);
}
-printf("total\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\n",
+
+
+printf("total\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu",
totalLength,
totalBaseCount[A_BASE_VAL], totalBaseCount[C_BASE_VAL],
totalBaseCount[G_BASE_VAL], totalBaseCount[T_BASE_VAL],
totalBaseCount[N_BASE_VAL], totalCpgCount);
+if (dinuc)
+ printf("\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu",
+ totalDinucleotideCount[A_BASE_VAL][A_BASE_VAL], totalDinucleotideCount[A_BASE_VAL][C_BASE_VAL],
+ totalDinucleotideCount[A_BASE_VAL][G_BASE_VAL], totalDinucleotideCount[A_BASE_VAL][T_BASE_VAL],
+ totalDinucleotideCount[C_BASE_VAL][A_BASE_VAL], totalDinucleotideCount[C_BASE_VAL][C_BASE_VAL],
+ totalDinucleotideCount[C_BASE_VAL][G_BASE_VAL], totalDinucleotideCount[C_BASE_VAL][T_BASE_VAL],
+ totalDinucleotideCount[G_BASE_VAL][A_BASE_VAL], totalDinucleotideCount[G_BASE_VAL][C_BASE_VAL],
+ totalDinucleotideCount[G_BASE_VAL][G_BASE_VAL], totalDinucleotideCount[G_BASE_VAL][T_BASE_VAL],
+ totalDinucleotideCount[T_BASE_VAL][A_BASE_VAL], totalDinucleotideCount[T_BASE_VAL][C_BASE_VAL],
+ totalDinucleotideCount[T_BASE_VAL][G_BASE_VAL], totalDinucleotideCount[T_BASE_VAL][T_BASE_VAL]);
+printf("\n");
+
if (summary)
- printf("percent\t%10.1f\t%10.4f\t%10.4f\t%10.4f\t%10.4f\t%10.6f\t%10.4f\n",
+ {
+ printf("prcnt\t%-5.1f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f",
(float)totalLength/totalLength,
((float)totalBaseCount[A_BASE_VAL])/(float)totalLength, ((float)totalBaseCount[C_BASE_VAL])/(float)totalLength,
((float)totalBaseCount[G_BASE_VAL])/(float)totalLength, ((float)totalBaseCount[T_BASE_VAL])/(float)totalLength,
((float)totalBaseCount[N_BASE_VAL])/(float)totalLength, (float)totalCpgCount/(float)totalLength);
+ if (dinuc)
+ printf("\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f",
+ (float)totalDinucleotideCount[A_BASE_VAL][A_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[A_BASE_VAL][C_BASE_VAL]/(float)totalLength,
+ (float)totalDinucleotideCount[A_BASE_VAL][G_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[A_BASE_VAL][T_BASE_VAL]/(float)totalLength,
+ (float)totalDinucleotideCount[C_BASE_VAL][A_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[C_BASE_VAL][C_BASE_VAL]/(float)totalLength,
+ (float)totalDinucleotideCount[C_BASE_VAL][G_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[C_BASE_VAL][T_BASE_VAL]/(float)totalLength,
+ (float)totalDinucleotideCount[G_BASE_VAL][A_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[G_BASE_VAL][C_BASE_VAL]/(float)totalLength,
+ (float)totalDinucleotideCount[G_BASE_VAL][G_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[G_BASE_VAL][T_BASE_VAL]/(float)totalLength,
+ (float)totalDinucleotideCount[T_BASE_VAL][A_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[T_BASE_VAL][C_BASE_VAL]/(float)totalLength,
+ (float)totalDinucleotideCount[T_BASE_VAL][G_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[T_BASE_VAL][T_BASE_VAL]/(float)totalLength);
+ printf("\n");
+ }
}
int main(int argc, char *argv[])
/* Process command line . */
{
-optionHash(&argc, argv);
-summary = optionExists("summary");
+optionInit(&argc, argv, optionSpecs);
if (argc < 2)
usage();
+summary = optionExists("summary");
+dinuc = optionExists("dinuc");
+strands = optionExists("strands");
faCount(argv+1, argc-1);
return 0;
}