d4bb68fad25eb93844b05973b8b568a708f99847 kent Tue Jul 26 16:51:14 2011 -0700 Adding new utility that is useful for associating features with closest promoter. diff --git src/utils/bedToClosestPoint/bedToClosestPoint.c src/utils/bedToClosestPoint/bedToClosestPoint.c new file mode 100644 index 0000000..502b39d --- /dev/null +++ src/utils/bedToClosestPoint/bedToClosestPoint.c @@ -0,0 +1,153 @@ +/* bedToClosestPoint - Given a bed file, and a list of unique points (often promoters) in genome, + * assign each bed to the closest point. */ +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" +#include "genomeRangeTree.h" +#include "basicBed.h" +#include "sqlNum.h" + +static char const rcsid[] = "$Id: newProg.c,v 1.30 2010/03/24 21:18:33 hiram Exp $"; + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "bedToClosestPoint - Given a bed file, and a list of unique points (often promoters) in genome,\n" + "assign each bed to the closest point.\n" + "usage:\n" + " bedToClosestPoint inRanges.bed inPoints.bed out.tab\n" + "Where:\n" + " inRanges.bed is a bed file without exons or strand\n" + " inPoints.bed is a bed file (promoters), where the chromStart on the + strand and\n" + " the chromEnd on the - strand will determine the point\n" + " out.tab has the following fields\n" + " - the name field from inPoints.bed\n" + " - the name field from inRanges.bed, or '.' if the field is missing\n" + " - distance between point and middle of in.bed record. May be - if upstream\n" + " - the score field if any from inRanges.bed\n" + "options:\n" + " -xxx=XXX\n" + ); +} + +static struct optionSpec options[] = { + {NULL, 0}, +}; + +struct range *rangeTreeClosestPoint(struct rbTree *t, int start, int end) +/* Find the closest range in tree to given range. Closeness is measured by + * closeness of the midpoint. I'm not quite sure if this is general purpose + * enough to go into the range tree library. */ +{ +struct rbTreeNode *p, *nextP = NULL; +struct rbTreeNode *closestNode = NULL; +int closestDistance = 0; +int mid = (start+end)/2; + +/* Repeatedly explore either the left or right branch, depending on the + * value of the key, keeping track of the closest point we've seen. */ +for (p = t->root; p != NULL; p = nextP) + { + struct range *r = p->item; + int rMid = (r->start + r->end)/2; + int distance = mid - rMid; + int absDistance; + + if (distance < 0) + { + nextP = p->left; + absDistance = -distance; + } + else if (distance > 0) + { + nextP = p->right; + absDistance = distance; + } + else + { + absDistance = 0; + } + if (closestNode == NULL || absDistance < closestDistance) + { + closestDistance = absDistance; + closestNode = p; + if (closestDistance == 0) + break; + } + } +return (closestNode == NULL ? NULL : closestNode->item); +} + +struct range *genomeRangeTreeClosest(struct genomeRangeTree *tree, char *chrom, int start, int end) +/* Return closest range in genome tree to given segment. Returns NULL if chromosome is empty. */ +{ +struct rbTree *t; +return (t=genomeRangeTreeFindRangeTree(tree,chrom)) ? rangeTreeClosestPoint(t, start, end) : NULL; +} + +void bedToClosestPoint(char *inRangesFile, char *inPointsFile, char *outFile) +/* bedToClosestPoint - Given a bed file, and a list of unique points (often promoters) in genome, + * assign each bed to the closest point. */ +{ +/* Load up inPoints file and put it into a range tree, insuring that each point is unique. */ +struct bed *inPointsList = bedLoadNAll(inPointsFile, 6); +struct genomeRangeTree *pointTree = genomeRangeTreeNew(); +struct bed *bed, *next; +for (bed = inPointsList; bed != NULL; bed = next) + { + next = bed->next; + bed->next = NULL; + int start; + if (bed->strand[0] == '-') + start = bed->chromEnd-1; + else + start = bed->chromStart; + struct range *range = genomeRangeTreeAddValList(pointTree, bed->chrom, start, start+1, bed); + if (slCount(range->val) > 1) + errAbort("Multiple points at %s %d in %s", bed->chrom, start, inPointsFile); + } + +/* Go through ranges file. */ +FILE *f = mustOpen(outFile, "w"); +struct lineFile *lf = lineFileOpen(inRangesFile, TRUE); +char *row[6]; +int wordCount; +while ((wordCount = lineFileChop(lf, row)) != 0) + { + /* Parse out bed fields. */ + lineFileExpectAtLeast(lf, 4, wordCount); + char *chrom = row[0]; + unsigned chromStart = sqlUnsigned(row[1]); + unsigned chromEnd = sqlUnsigned(row[2]); + char *name = (wordCount > 3 ? row[3] : "."); + int score = (wordCount > 4 ? sqlSigned(row[4]) : 0); + + struct range *range = genomeRangeTreeClosest(pointTree, chrom, chromStart, chromEnd); + if (range != NULL) + { + struct bed *bed = range->val; + fprintf(f, "%s\t", bed->name); + fprintf(f, "%s\t", name); + int midRange = (chromStart + chromEnd)/2; + int distance = midRange - range->start; + if (bed->strand[0] == '-') + distance = -distance; + fprintf(f, "%d\t", distance); + fprintf(f, "%d\n", score); + } + } + +carefulClose(&f); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 4) + usage(); +bedToClosestPoint(argv[1], argv[2], argv[3]); +return 0; +}