b187b81900601cb093a96e2159a30d8a94427de9 braney Thu Jun 16 16:15:39 2022 -0700 tweaks to support sequence logo diff --git src/hg/ratStuff/mafCounts/mafCounts.c src/hg/ratStuff/mafCounts/mafCounts.c index bb2b084..dcb8cd4 100644 --- src/hg/ratStuff/mafCounts/mafCounts.c +++ src/hg/ratStuff/mafCounts/mafCounts.c @@ -1,136 +1,165 @@ -/* mafCounts - Filter out maf files. */ +/* mafCounts - count A,C,T,G in maf files and make four wiggles. */ -/* Copyright (C) 2011 The Regents of the University of California +/* Copyright (C) 2022 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "errAbort.h" #include "linefile.h" #include "hash.h" #include "chain.h" #include "options.h" #include "maf.h" #include "bed.h" #include "twoBit.h" #include "binRange.h" - +#include "bigWig.h" char *masterSpecies; char *masterChrom; struct hash *speciesHash; struct subSpecies *speciesList; struct strandHead *strandHeads; +struct bbiFile *scaleBbi ; +struct bigWigValsOnChrom *scaleVals; +double dataMin, dataRange; -boolean addN = FALSE; -boolean addDash = FALSE; void usage() /* Explain usage and exit. */ { errAbort( - "mafCounts - count the number of A,T,C,G,N in a maf and output four wiggles\n" + "mafCounts - count the number of A,T,C,G in a maf and output four wiggles\n" "usage:\n" " mafCounts mafIn wigPrefix\n" "WARNING: requires a maf with only a single target sequence\n" "options:\n" + " -scale=file.bw scale logo by bigWig value\n" ); } static struct optionSpec options[] = { + {"scale", OPTION_STRING}, {NULL, 0}, }; -unsigned letterBox[256]; -unsigned ourBufSize = 16 * 1024; +unsigned char letterBox[256]; +unsigned ourBufSize = 256 *1024; void mafCounts(char *mafIn, char *wigPrefix) -/* mafCounts - Filter out maf files. */ +/* mafCounts - count A,C,T,G */ { int jj; char buffer[4096]; -FILE *f[6]; -safef(buffer, sizeof buffer, "%sA.wig", wigPrefix); +FILE *f[4]; +safef(buffer, sizeof buffer, "%s.A.wig", wigPrefix); f[0] = mustOpen(buffer, "w"); -safef(buffer, sizeof buffer, "%sC.wig", wigPrefix); +safef(buffer, sizeof buffer, "%s.C.wig", wigPrefix); f[1] = mustOpen(buffer, "w"); -safef(buffer, sizeof buffer, "%sT.wig", wigPrefix); +safef(buffer, sizeof buffer, "%s.T.wig", wigPrefix); f[2] = mustOpen(buffer, "w"); -safef(buffer, sizeof buffer, "%sG.wig", wigPrefix); +safef(buffer, sizeof buffer, "%s.G.wig", wigPrefix); f[3] = mustOpen(buffer, "w"); -safef(buffer, sizeof buffer, "%sD.wig", wigPrefix); -f[4] = mustOpen(buffer, "w"); -safef(buffer, sizeof buffer, "%sN.wig", wigPrefix); -f[5] = mustOpen(buffer, "w"); struct mafAli *maf; struct mafFile *mf = mafOpen(mafIn); -unsigned *counts = needMem(6 * ourBufSize * sizeof(unsigned)); +unsigned *counts = needLargeMem(4 * ourBufSize * sizeof(unsigned)); int last = -1; +double *values = NULL; +char *lastChrom = NULL; +double scaleValue = 0.0; for(; (maf = mafNext(mf)) != NULL;) { struct mafComp *comp = maf->components; int seqLen = strlen(comp->text); if (seqLen > ourBufSize) errAbort("ourBufSize not big enough for %d\n", seqLen); + unsigned blockStart = comp->start; - if (comp->start != last) + if (blockStart != last) + { + char *chrom = strchr(comp->src, '.'); + *chrom++ = 0; + for(jj = 0; jj < 4; jj++) + fprintf(f[jj], "fixedStep chrom=%s start=%d step=1\n", chrom, blockStart + 1); + if (scaleBbi && !sameOk(lastChrom, chrom)) { - char *dot = strchr(comp->src, '.'); - *dot++ = 0; - for(jj = 0; jj < 6; jj++) - fprintf(f[jj], "fixedStep chrom=%s start=%d step=1\n", dot, comp->start + 1); + bigWigValsOnChromFetchData(scaleVals, chrom, scaleBbi); + values = scaleVals->valBuf; + lastChrom = cloneString(chrom); } - last = comp->start + seqLen; - memset(counts, 0, 6 * ourBufSize * sizeof(unsigned)); + } + last = blockStart + seqLen; int ii; + memset(counts, 0, 4 * ourBufSize * sizeof(unsigned)); for(; comp; comp = comp->next) { char *str = comp->text; char *end = &str[seqLen]; for(ii=0; str < end; str++, ii++) - counts[letterBox[tolower(*str)] * ourBufSize + ii ]++; + { + unsigned char nucToIndex = letterBox[tolower(*str)]; + if (nucToIndex > 0) + counts[(nucToIndex - 1) * ourBufSize + ii]++; + } } char *str = maf->components->text; + unsigned scaleOffset = 0; for(ii = 0; ii < seqLen; ii++, str++) { if (*str == '-') continue; + if (scaleBbi && (values != NULL)) + scaleValue = (values[blockStart + scaleOffset]); + scaleOffset++; double total = 0; for(jj = 0; jj < 4; jj++) { unsigned offset = jj * ourBufSize + ii; total += counts[offset]; } for(jj = 0; jj < 4; jj++) - //for(jj = 0; jj < 6; jj++) { unsigned offset = jj * ourBufSize + ii; - fprintf(f[jj],"%g\n",counts[offset]/total); - counts[offset] = 0; + double value = counts[offset]/total; + + if (scaleBbi) + value *= scaleValue; + + fprintf(f[jj],"%g\n",value); } } mafAliFree(&maf); } } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); -letterBox['a'] = 0; -letterBox['c'] = 1; -letterBox['t'] = 2; -letterBox['g'] = 3; -letterBox['-'] = 4; -letterBox['n'] = 5; +char *scaleBw = optionVal("scale", NULL); +if (scaleBw) + { + scaleBbi = bigWigFileOpen(scaleBw); + struct bbiSummaryElement sum = bbiTotalSummary(scaleBbi); + dataMin = sum.minVal; + dataRange = sum.maxVal - dataMin; + scaleVals = bigWigValsOnChromNew(); + + } + +letterBox['a'] = 1; +letterBox['c'] = 2; +letterBox['t'] = 3; +letterBox['g'] = 4; mafCounts(argv[1], argv[2]); return 0; }