b585d168856254d9e7814e87e63fc778adcf03c5 kent Fri Dec 20 16:03:27 2013 -0800 Utility to merge adjacent same-valued bedGraph records seems to work. diff --git src/utils/bedGraphPack/bedGraphPack.c src/utils/bedGraphPack/bedGraphPack.c new file mode 100644 index 0000000..530dee1 --- /dev/null +++ src/utils/bedGraphPack/bedGraphPack.c @@ -0,0 +1,112 @@ +/* bedGraphPack - Pack together adjacent records representing same value.. */ +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "bedGraphPack - Pack together adjacent records representing same value.\n" + "usage:\n" + " bedGraphPack in.bedGraph out.bedGraph\n" + "The input needs to be sorted by chrom and this is checked. To put in a pipe\n" + "use stdin and stdout in the command line in place of file names.\n" + ); +} + +/* Command line validation table. */ +static struct optionSpec options[] = { + {NULL, 0}, +}; + +void bedGraphPack(char *input, char *output) +/* bedGraphPack - Pack together adjacent records representing same value.. */ +{ +/* We'll keep a hash of chroms to help make sure we're sorted. */ +struct hash *chromHash = hashNew(0); + +/* Open input and output. */ +struct lineFile *lf = lineFileOpen(input, TRUE); +FILE *f = mustOpen(output, "w"); + +/* Loop is a little complex - it keeps track of previous value, and merges + * current record into previous where possible.*/ +char *prevChrom = "", *newChrom = ""; +char prevStart[16] = "", prevEnd[16] = "", prevVal[32] = ""; +boolean done = FALSE; +while (!done) + { + char *row[5]; + char *chrom = NULL, *start = NULL, *end = NULL, *val = NULL; + char *newStart = prevStart; + int rowSize = lineFileChopNext(lf, row, ArraySize(row)); + boolean outputLast = FALSE; + if (rowSize == 0) + { + /* Cope with end of file. */ + outputLast = TRUE; + done = TRUE; + } + else + { + /* Fetch next line into local string variables. */ + lineFileExpectWords(lf, 4, rowSize); + chrom = row[0]; + start = row[1]; + end = row[2]; + val = row[3]; + + /* We need to output if on a different chrom or skipping between previous record. + * This is also when we need to reset our starting point. */ + if (!sameString(prevChrom, chrom)) + { + if (hashLookup(chromHash, chrom)) + errAbort("%s is hopping at line %d of %s", chrom, lf->lineIx, lf->fileName); + hashAddSaveName(chromHash, chrom, NULL, &newChrom); + newStart = start; + outputLast = TRUE; + } + else + { + if (!sameString(start, prevEnd) || !sameString(val,prevVal)) + { + newStart = start; + outputLast = TRUE; + } + } + + } + + /* This is the one and only place in loop we output */ + if (outputLast && !isEmpty(prevChrom)) + { + fprintf(f, "%s\t%s\t%s\t%s\n", prevChrom, prevStart, prevEnd, prevVal); + outputLast = FALSE; + } + + /* Copy current record to previous. The additional done check here prevents + * us from having to duplicate the output clause to handle EOF */ + if (!done) + { + prevChrom = newChrom; + if (newStart != prevStart) + safef(prevStart, sizeof(prevStart), "%s", newStart); + safef(prevEnd, sizeof(prevEnd), "%s", end); + safef(prevVal, sizeof(prevVal), "%s", val); + } + } + +carefulClose(&f); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 3) + usage(); +bedGraphPack(argv[1], argv[2]); +return 0; +}