0eb7bb9aa4577cfa8dbce54d835fd39ced613785 braney Wed May 18 11:03:21 2022 -0700 first cut of mafCounts, a utility to help build a sequence logo track from a MAF file diff --git src/hg/ratStuff/mafCounts/mafCounts.c src/hg/ratStuff/mafCounts/mafCounts.c new file mode 100644 index 0000000..bb2b084 --- /dev/null +++ src/hg/ratStuff/mafCounts/mafCounts.c @@ -0,0 +1,136 @@ +/* mafCounts - Filter out maf files. */ + +/* Copyright (C) 2011 The Regents of the University of California + * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ +#include "common.h" +#include "errAbort.h" +#include "linefile.h" +#include "hash.h" +#include "chain.h" +#include "options.h" +#include "maf.h" +#include "bed.h" +#include "twoBit.h" +#include "binRange.h" + + +char *masterSpecies; +char *masterChrom; +struct hash *speciesHash; +struct subSpecies *speciesList; +struct strandHead *strandHeads; + +boolean addN = FALSE; +boolean addDash = FALSE; + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "mafCounts - count the number of A,T,C,G,N in a maf and output four wiggles\n" + "usage:\n" + " mafCounts mafIn wigPrefix\n" + "WARNING: requires a maf with only a single target sequence\n" + "options:\n" + ); +} + +static struct optionSpec options[] = { + {NULL, 0}, +}; + +unsigned letterBox[256]; +unsigned ourBufSize = 16 * 1024; + +void mafCounts(char *mafIn, char *wigPrefix) +/* mafCounts - Filter out maf files. */ +{ +int jj; +char buffer[4096]; +FILE *f[6]; +safef(buffer, sizeof buffer, "%sA.wig", wigPrefix); +f[0] = mustOpen(buffer, "w"); +safef(buffer, sizeof buffer, "%sC.wig", wigPrefix); +f[1] = mustOpen(buffer, "w"); +safef(buffer, sizeof buffer, "%sT.wig", wigPrefix); +f[2] = mustOpen(buffer, "w"); +safef(buffer, sizeof buffer, "%sG.wig", wigPrefix); +f[3] = mustOpen(buffer, "w"); +safef(buffer, sizeof buffer, "%sD.wig", wigPrefix); +f[4] = mustOpen(buffer, "w"); +safef(buffer, sizeof buffer, "%sN.wig", wigPrefix); +f[5] = mustOpen(buffer, "w"); +struct mafAli *maf; +struct mafFile *mf = mafOpen(mafIn); + +unsigned *counts = needMem(6 * ourBufSize * sizeof(unsigned)); +int last = -1; + +for(; (maf = mafNext(mf)) != NULL;) + { + struct mafComp *comp = maf->components; + int seqLen = strlen(comp->text); + if (seqLen > ourBufSize) + errAbort("ourBufSize not big enough for %d\n", seqLen); + + if (comp->start != last) + { + char *dot = strchr(comp->src, '.'); + *dot++ = 0; + for(jj = 0; jj < 6; jj++) + fprintf(f[jj], "fixedStep chrom=%s start=%d step=1\n", dot, comp->start + 1); + } + last = comp->start + seqLen; + memset(counts, 0, 6 * ourBufSize * sizeof(unsigned)); + int ii; + for(; comp; comp = comp->next) + { + char *str = comp->text; + char *end = &str[seqLen]; + + for(ii=0; str < end; str++, ii++) + counts[letterBox[tolower(*str)] * ourBufSize + ii ]++; + } + char *str = maf->components->text; + for(ii = 0; ii < seqLen; ii++, str++) + { + if (*str == '-') + continue; + + + double total = 0; + for(jj = 0; jj < 4; jj++) + { + unsigned offset = jj * ourBufSize + ii; + total += counts[offset]; + } + + for(jj = 0; jj < 4; jj++) + //for(jj = 0; jj < 6; jj++) + { + unsigned offset = jj * ourBufSize + ii; + fprintf(f[jj],"%g\n",counts[offset]/total); + counts[offset] = 0; + } + } + + mafAliFree(&maf); + } +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 3) + usage(); + +letterBox['a'] = 0; +letterBox['c'] = 1; +letterBox['t'] = 2; +letterBox['g'] = 3; +letterBox['-'] = 4; +letterBox['n'] = 5; +mafCounts(argv[1], argv[2]); +return 0; +}