94234c062dc26dd2ee4fe2b381ca11486848c601 mmaddren Wed May 21 16:09:32 2014 -0700 more or less final methylateGenome, and added a new tool analyzeCpG which currently clusters CpG regions in a bed file diff --git src/utils/analyzeCpG/analyzeCpG.c src/utils/analyzeCpG/analyzeCpG.c new file mode 100644 index 0000000..1702c3d --- /dev/null +++ src/utils/analyzeCpG/analyzeCpG.c @@ -0,0 +1,194 @@ +/* analyzeCpG - cluster CpG regions from an input bed file and perform some analysis on them. */ +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" +#include "basicBed.h" +#include "sqlNum.h" + +struct bedGraphRecord +{ + struct bedGraphRecord *next; + + char *chrom; + unsigned chromStart; + unsigned chromEnd; + char *name; + double score; + char strand; +}; + +struct bedGraphRecord *bedGraphLoadNext(struct lineFile *lf) +{ +char *row[6]; +int rowSize = lineFileChopNext(lf, row, ArraySize(row)); + +if (rowSize == 0) + return NULL; + +struct bedGraphRecord *bg; +AllocVar(bg); + +bg->chrom = cloneString(row[0]); +bg->chromStart = sqlUnsigned(row[1]); +bg->chromEnd = sqlUnsigned(row[2]); +bg->name = cloneString(row[3]); +bg->score = sqlFloat(row[4]); +bg->strand = row[5][0]; + +return bg; +} + +int clClusterSize = 100; +int clMinCluster = 100; +boolean clHist = FALSE; + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "analyzeCpG - cluster CpG regions from an input bed file and perform some analysis on them\n" + "usage:\n" + " analyzeCpG input.bed output.bed\n" + "options:\n" + " -xxx=XXX\n" + ); +} + +/* Command line validation table. */ +static struct optionSpec options[] = { + {"clusterSize", OPTION_INT}, + {"minCluster", OPTION_INT}, + {"hist", OPTION_BOOLEAN} +}; + +void writeCluster(struct bedGraphRecord *clusterList, FILE *out) +{ +int size = slCount(clusterList); + +if (size < clMinCluster) + return; + +int blockStarts[size]; +int blockSizes[size]; +double score = 0; +struct bedGraphRecord *last = clusterList; + +slReverse(&clusterList); + +struct bed *outBed; +AllocVar(outBed); + +outBed->chrom = cloneString(clusterList->chrom); +outBed->chromStart = clusterList->chromStart; +outBed->chromEnd = last->chromEnd; +outBed->name = "asdf"; +outBed->strand[0] = clusterList->strand; + +outBed->blockCount = size; + +int i; +for (i = 0; i < size; i++) + blockSizes[i] = 1; +outBed->blockSizes = blockSizes; + +i = 0; +struct bedGraphRecord *cur; +for (cur = clusterList; cur != NULL; cur = cur->next) + { + blockStarts[i] = cur->chromStart - outBed->chromStart; + score += cur->score; + i++; + } +outBed->chromStarts = blockStarts; +outBed->score = (int)(score * 10 / size); + +//printf("printing %s %d %d", outBed->chrom, outBed->chromStart, outBed->chromEnd); + +bedTabOutN(outBed, 12, out); +} + +void analyzeCpG(char *inputName, char *outputName) +/* analyzeCpG - cluster CpG regions from an input bed file and perform some analysis on them. */ +{ + +// open input bed file +// struct bed *input = bedLoadAll(inputName); +struct lineFile *input = lineFileOpen(inputName, TRUE); +FILE *out = fopen(outputName, "w"); + +struct bedGraphRecord *plusPrev = NULL; +struct bedGraphRecord *minusPrev = NULL; + +// set up lists +struct bedGraphRecord *plusClusters = NULL; +struct bedGraphRecord *minusClusters = NULL; + +int hist[8096]; +int i; +for (i = 0; i < 8096; i++) + hist[i] = 0; + +// loop over bed file +for (;;) + { + struct bedGraphRecord *record = bedGraphLoadNext(input); + + if (record == NULL) + break; + + if (record->strand == '+') + { + //if (plusPrev != NULL) + // printf("+: cur %s %d, prev %s %d = %d", record->chrom, record->chromStart, plusPrev->chrom, plusPrev->chromStart, record->chromStart - plusPrev->chromStart); + if (plusPrev != NULL && (strcmp(record->chrom, plusPrev->chrom) != 0 || record->chromStart - plusPrev->chromStart > clClusterSize)) + { + hist[slCount(plusClusters)]++; + writeCluster(plusClusters, out); + slFreeList(&plusClusters); + } + //printf("\n"); + slAddHead(&plusClusters, record); + plusPrev = record; + } + else + { + if (minusPrev != NULL && (strcmp(record->chrom, minusPrev->chrom) != 0 || record->chromStart - minusPrev->chromStart > clClusterSize)) + { + hist[slCount(minusClusters)]++; + writeCluster(minusClusters, out); + slFreeList(&minusClusters); + } + slAddHead(&minusClusters, record); + minusPrev = record; + } + //printf("plusPrev: %p, minusPrev: %p, plusClusterCount: %d\n", plusPrev, minusPrev, slCount(plusClusters)); + } + +lineFileClose(&input); + +fclose(out); + +if (clHist) + { + for (i = 0; i < 8096; i++) + { + if (hist[i] > 0) + printf("%5d%10d\n", i, hist[i]); + } + } + +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +clClusterSize = optionInt("clusterSize", clClusterSize); +clMinCluster = optionInt("minCluster", clMinCluster); +clHist = optionExists("hist"); +if (argc != 3) + usage(); +analyzeCpG(argv[1], argv[2]); +return 0; +}