4d7b22546a7eee42f93719f2cfda6fbe0c853807 braney Wed Aug 31 13:48:03 2022 -0700 ignore blocks with no sequence diff --git src/hg/ratStuff/mafCounts/mafCounts.c src/hg/ratStuff/mafCounts/mafCounts.c index ff991b6..3253214 100644 --- src/hg/ratStuff/mafCounts/mafCounts.c +++ src/hg/ratStuff/mafCounts/mafCounts.c @@ -1,178 +1,181 @@ /* mafCounts - count A,C,T,G in maf files and make four wiggles. */ /* Copyright (C) 2022 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "errAbort.h" #include "linefile.h" #include "hash.h" #include "chain.h" #include "options.h" #include "maf.h" #include "bed.h" #include "twoBit.h" #include "binRange.h" #include "bigWig.h" char *masterSpecies; char *masterChrom; struct hash *speciesHash; struct subSpecies *speciesList; struct strandHead *strandHeads; struct bbiFile *scaleBbi ; struct bigWigValsOnChrom *scaleVals; double dataMin, dataRange; void usage() /* Explain usage and exit. */ { errAbort( "mafCounts - count the number of A,T,C,G in a maf and output four wiggles with values that sum to one\n" "usage:\n" " mafCounts mafIn wigPrefix\n" "WARNING: requires a maf with only a single target sequence\n" "options:\n" " -scale=file.bw scale each wiggle by the bigWig value at that base. This might be something like a phyloP score.\n" ); } static struct optionSpec options[] = { {"scale", OPTION_STRING}, {NULL, 0}, }; unsigned char letterBox[256]; // to make ASCII values to indices unsigned ourBufSize = 256 *1024; // this is the max size of a maf block #define NUMBER_OF_NUCS 4 void mafCounts(char *mafIn, char *wigPrefix) /* mafCounts - count A,C,T,G */ { int jj; char buffer[4096]; // holds a relatively short filename FILE *f[NUMBER_OF_NUCS]; // one file for each different nucleotide safef(buffer, sizeof buffer, "%s.A.wig", wigPrefix); f[0] = mustOpen(buffer, "w"); safef(buffer, sizeof buffer, "%s.C.wig", wigPrefix); f[1] = mustOpen(buffer, "w"); safef(buffer, sizeof buffer, "%s.T.wig", wigPrefix); f[2] = mustOpen(buffer, "w"); safef(buffer, sizeof buffer, "%s.G.wig", wigPrefix); f[3] = mustOpen(buffer, "w"); struct mafAli *maf; struct mafFile *mf = mafOpen(mafIn); unsigned *counts = needLargeMem(NUMBER_OF_NUCS * ourBufSize * sizeof(unsigned)); // this array keeps the counts of each of the 4 possible nucleotides int last = -1; double *values = NULL; char *lastChrom = NULL; double scaleValue = 0.0; for(; (maf = mafNext(mf)) != NULL;) { struct mafComp *comp = maf->components; int seqLen = strlen(comp->text); if (seqLen > ourBufSize) errAbort("ourBufSize not big enough for %d\n", seqLen); unsigned blockStart = comp->start; // if this block is not contiguos with the last one, we need to start a new wig span if (blockStart != last) { char *chrom = strchr(comp->src, '.'); *chrom++ = 0; for(jj = 0; jj < NUMBER_OF_NUCS; jj++) fprintf(f[jj], "fixedStep chrom=%s start=%d step=1\n", chrom, blockStart + 1); // if we're scaling and we're on a new chrom, grab the scale values from the bigWig if (scaleBbi && !sameOk(lastChrom, chrom)) { bigWigValsOnChromFetchData(scaleVals, chrom, scaleBbi); values = scaleVals->valBuf; lastChrom = cloneString(chrom); } } // keep track of where this block ends last = blockStart + seqLen; int ii; // for each maf block we reset the counts of each type of nucleotide to zero memset(counts, 0, NUMBER_OF_NUCS * ourBufSize * sizeof(unsigned)); // now go through each component of the maf block and count the nucs for(; comp; comp = comp->next) { + if (comp->text == NULL) + continue; + char *str = comp->text; char *end = &str[seqLen]; for(ii=0; str < end; str++, ii++) { unsigned char nucToIndex = letterBox[tolower(*str)]; if (nucToIndex > 0) counts[(nucToIndex - 1) * ourBufSize + ii]++; } } // now we normalize and scale if asked. char *str = maf->components->text; unsigned scaleOffset = 0; for(ii = 0; ii < seqLen; ii++, str++) { if (*str == '-') continue; if (scaleBbi && (values != NULL)) scaleValue = (values[blockStart + scaleOffset]); scaleOffset++; double total = 0; for(jj = 0; jj < NUMBER_OF_NUCS; jj++) { unsigned offset = jj * ourBufSize + ii; total += counts[offset]; } for(jj = 0; jj < NUMBER_OF_NUCS; jj++) { unsigned offset = jj * ourBufSize + ii; double value = counts[offset]/total; if (scaleBbi) value *= scaleValue; fprintf(f[jj],"%g\n",value); } } mafAliFree(&maf); } } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); char *scaleBw = optionVal("scale", NULL); if (scaleBw) { scaleBbi = bigWigFileOpen(scaleBw); struct bbiSummaryElement sum = bbiTotalSummary(scaleBbi); dataMin = sum.minVal; dataRange = sum.maxVal - dataMin; scaleVals = bigWigValsOnChromNew(); } letterBox['a'] = 1; letterBox['c'] = 2; letterBox['t'] = 3; letterBox['g'] = 4; mafCounts(argv[1], argv[2]); return 0; }