415dee4b01f3d60c9dfe1c2bb20045dc62fdfe81 kent Mon Mar 14 16:48:15 2011 -0700 Making a little utility to extract common regions from bed files. Not bulletproof, but should abort on input it can't handle 99% of the time, and limitations are stated in usage. diff --git src/utils/bedCommonRegions/bedCommonRegions.c src/utils/bedCommonRegions/bedCommonRegions.c new file mode 100644 index 0000000..f5c0f6b --- /dev/null +++ src/utils/bedCommonRegions/bedCommonRegions.c @@ -0,0 +1,120 @@ +/* bedCommonRegions - Create a bed file (just bed3) that contains the regions common + * to all input beds. Regions are common only if exactly the same chromosome, starts, + * and end. Mere overlap is not enough. */ +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" +#include "obscure.h" + +static char const rcsid[] = "$Id: newProg.c,v 1.30 2010/03/24 21:18:33 hiram Exp $"; + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "bedCommonRegions - Create a bed file (just bed3) that contains the regions common to all inputs.\n" + "Regions are common only if exactly the same chromosome, starts, and end. Overlap is not enough.\n" + "Each region must be in each input at most once. Output is stdout.\n" + "usage:\n" + " bedCommonRegions file1 file2 file3 ... fileN\n" + "options:\n" + " -xxx=XXX\n" + ); +} + +static struct optionSpec options[] = { + {NULL, 0}, +}; + +#define BED_STRING_SIZE 256 + +char *bedString(char *chrom, char *start, char *end, char result[BED_STRING_SIZE]) +/* Return space delimited concatenation: chrom start end */ +{ +safef(result, BED_STRING_SIZE, "%s\t%s\t%s", chrom, start, end); +return result; +} + +struct hash *readFileIntoHashCountOfOne(char *fileName, struct slRef **pRetList) +/* Read in a bed file. Return a integer hash keyed by bedString. + * The returned list is the hashEls of the hash, but in the same order + * as they appear as lines in the file. */ +{ +/* Add each bed item to hash, and list, checking uniqueness */ +struct hash *hash = hashNew(21); +struct lineFile *lf = lineFileOpen(fileName, TRUE); +struct slRef *refList = NULL; +char *row[3]; +while (lineFileRow(lf, row)) + { + char key[BED_STRING_SIZE]; + bedString(row[0], row[1], row[2], key); + if (hashLookup(hash, key)) + errAbort("Got %s:%s-%s twice (second time line %d of %s), key not unique", + row[0], row[1], row[2], lf->lineIx, lf->fileName); + struct hashEl *hel = hashAddInt(hash, key, 1); + refAdd(&refList, hel); + } + +/* Clean up and go home. */ +lineFileClose(&lf); +slReverse(&refList); +*pRetList = refList; +return hash; +} + +void addFileToCountHash(char *fileName, struct hash *countHash) +/* Add bedStrings from file to countHash, just ignoring the ones + * that don't appear. */ +{ +struct lineFile *lf = lineFileOpen(fileName, TRUE); +char *row[3]; +while (lineFileRow(lf, row)) + { + char key[BED_STRING_SIZE]; + bedString(row[0], row[1], row[2], key); + struct hashEl *hel = hashLookup(countHash, key); + if (hel != NULL) + hel->val = ((char *)hel->val)+1; + } +lineFileClose(&lf); +} + +void bedCommonRegions(int fileCount, char *files[]) +/* Create a bed file (just bed3) that contains the regions common to all + * input beds. Regions are common only if exactly the same chromosome, starts, + * and end. Mere overlap is not enough. */ +{ +/* Build up hash with counts of usage */ +struct slRef *ref, *refList = NULL; +struct hash *countHash = readFileIntoHashCountOfOne(files[0], &refList); +int i; +for (i=1; inext) + { + struct hashEl *hel = ref->val; + int count = ptToInt(hel->val); + if (count == fileCount) + printf("%s\n", hel->name); + else if (count > fileCount) /* Detect many but not all non-uniquenesses. + * Elsewhere we detect non-uniquenesses in first file + * some non-uniquenesses slip through, but this is + * just a fast utility. */ + errAbort("%s appears more than once in some file", hel->name); + } +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc < 3) + usage(); +bedCommonRegions(argc-1, argv+1); +return 0; +}