155526b6cc2cd537d56ad99a5d958502c1a4315c kent Fri Jun 17 11:03:23 2011 -0700 First cut of utility to check for coverage gaps in a track. Doesn't work for bigBeds yet. diff --git src/hg/checkCoverageGaps/checkCoverageGaps.c src/hg/checkCoverageGaps/checkCoverageGaps.c new file mode 100644 index 0000000..dbb69d0 --- /dev/null +++ src/hg/checkCoverageGaps/checkCoverageGaps.c @@ -0,0 +1,216 @@ +/* checkCoverageGaps - Check for biggest gap in coverage for a list of tracks.. */ +#include "common.h" +#include "localmem.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" +#include "jksql.h" +#include "trackDb.h" +#include "hdb.h" +#include "obscure.h" +#include "rangeTree.h" +#include "bigWig.h" +#include "bigBed.h" + +static char const rcsid[] = "$Id: newProg.c,v 1.30 2010/03/24 21:18:33 hiram Exp $"; + +boolean allParts = FALSE; +boolean female = FALSE; + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "checkCoverageGaps - Check for biggest gap in coverage for a list of tracks.\n" + "usage:\n" + " checkCoverageGaps database track1 ... trackN\n" + "Note: for bigWig and bigBeds, the biggest gap is rounded to the nearest 10,000 or so\n" + "options:\n" + " -allParts If set then include _hap and _random and other wierd chroms\n" + " -female If set then don't check chrY\n" + ); +} + +static struct optionSpec options[] = { + {"allParts", OPTION_BOOLEAN}, + {"female", OPTION_BOOLEAN}, + {NULL, 0}, +}; + +void bigWigBiggestGap(struct trackDb *tdb, struct sqlConnection *conn, + char *chrom, int chromSize, struct rbTree *rt) +/* Find biggest gap in given chromosome in bigWig */ +{ +char *fileName = bbiNameFromSettingOrTable(tdb, conn, tdb->table); +struct bbiFile *bbi = bigWigFileOpen(fileName); +int sampleSize = 10000; +int sampleCount = chromSize/sampleSize; +if (sampleCount > 1) + { + int evenEnd = sampleCount * sampleSize; + double *summaryVals; + AllocArray(summaryVals, sampleCount); + if (bigWigSummaryArray(bbi, chrom, 0, evenEnd, bbiSumCoverage, sampleCount, summaryVals)) + { + int s=0, e, i; + for (i=0; i<sampleCount; ++i) + { + e = s + sampleSize; + if (summaryVals[i] > 0.0) + rangeTreeAdd(rt, s, e); + s = e; + } + } + } +bigWigFileClose(&bbi); +} + +void bigBedBiggestGap(struct trackDb *tdb, struct sqlConnection *conn, + char *chrom, int chromSize, struct rbTree *rt) +/* Find biggest gap in given chromosome in bigBed */ +{ +uglyAbort("bigBed not implemented"); +} + +void tableBiggestGap(struct hTableInfo *hti, struct trackDb *tdb, struct sqlConnection *conn, + char *chrom, int chromSize, struct rbTree *rt) +/* Find biggest gap in given chromosome in bigBed */ +{ +char fields[512]; +safef(fields, sizeof(fields), "%s,%s", hti->startField, hti->endField); +struct sqlResult *sr = hExtendedChromQuery(conn, hti->rootName, chrom, NULL, FALSE, + fields, NULL); +char **row; +while ((row = sqlNextRow(sr)) != NULL) + { + rangeTreeAdd(rt, sqlUnsigned(row[0]), sqlUnsigned(row[1])); + } +sqlFreeResult(&sr); +} + +void biggestGapFromRangeTree(struct rbTree *rt, int chromSize, + int *retStart, int *retEnd, int *retSize) +/* Given a range tree filled with data for given chromosome, figure out + * location and size of biggest gap in data. */ +{ +struct range *range, *rangeList = rangeTreeList(rt); +if (rangeList == NULL) + { + *retStart = 0; + *retEnd = *retSize = chromSize; + } +else + { + int lastEnd = 0; + int biggestSize = 0, biggestStart = 0, biggestEnd = 0; + for (range = rangeList; range != NULL; range = range->next) + { + int size = range->start - lastEnd; + if (size > biggestSize) + { + biggestSize = size; + biggestStart = lastEnd; + biggestEnd = range->start; + } + lastEnd = range->end; + if (range->next == NULL) + { + size = chromSize - lastEnd; + if (size > biggestSize) + { + biggestSize = size; + biggestStart = lastEnd; + biggestEnd = chromSize; + } + } + } + *retStart = biggestStart; + *retEnd = biggestEnd; + *retSize = biggestSize; + } +} + +void addGaps(struct sqlConnection *conn, char *chrom, struct rbTree *rt) +/* Add gaps to range tree. */ +{ +char query[256]; +safef(query, sizeof(query), "select chromStart, chromEnd from gap where chrom='%s'", chrom); +struct sqlResult *sr = sqlGetResult(conn, query); +char **row; +while ((row = sqlNextRow(sr)) != NULL) + { + rangeTreeAdd(rt, sqlUnsigned(row[0]), sqlUnsigned(row[1])); + } +sqlFreeResult(&sr); +} + +void printBiggestGap(char *database, struct sqlConnection *conn, + struct slName *chromList, struct hash *chromHash, char *track) +/* Look up track in database, figure out which type it is, call + * appropriate biggest gap finder, and then print result. */ +{ +struct trackDb *tdb = hTrackInfo(conn, track); +struct hTableInfo *hti = hFindTableInfo(database, chromList->name, tdb->table); +char *typeWord = cloneFirstWord(tdb->type); +char *biggestChrom = NULL; +int biggestSize = 0, biggestStart = 0, biggestEnd = 0; +struct slName *chrom; +for (chrom = chromList; chrom != NULL; chrom = chrom->next) + { + if (!allParts && strchr(chrom->name, '_')) // Generally skip weird chroms + continue; + if (female && sameString(chrom->name, "chrY")) + continue; + int chromSize = hashIntVal(chromHash, chrom->name); + struct rbTree *rt = rangeTreeNew(); + int start = 0, end = 0, size = 0; + if (sameString("bigWig", typeWord)) + bigWigBiggestGap(tdb, conn, chrom->name, chromSize, rt); + else if (sameString("bigBed", typeWord)) + bigBedBiggestGap(tdb, conn, chrom->name, chromSize, rt); + else + tableBiggestGap(hti, tdb, conn, chrom->name, chromSize, rt); + if (rt->n > 0) // Want to keep completely uncovered chromosome uncovered + addGaps(conn, chrom->name, rt); + biggestGapFromRangeTree(rt, chromSize, &start, &end, &size); + if (size > biggestSize) + { + biggestSize = size; + biggestStart = start; + biggestEnd = end; + biggestChrom = chrom->name; + } + rangeTreeFree(&rt); + } +printf("%s\t%s:%d-%d\t", track, biggestChrom, biggestStart+1, biggestEnd); +printLongWithCommas(stdout, biggestSize); +putchar('\n'); +freez(&typeWord); +} + +void checkCoverageGaps(char *database, int trackCount, char *tracks[]) +/* checkCoverageGaps - Check for biggest gap in coverage for a list of tracks.. */ +{ +struct slName *chromList = hChromList(database); +struct hash *chromHash = hChromSizeHash(database); +struct sqlConnection *conn = sqlConnect(database); +int i; +for (i=0; i<trackCount; ++i) + { + char *track = tracks[i]; + printBiggestGap(database, conn, chromList, chromHash, track); + } +sqlDisconnect(&conn); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +allParts = optionExists("allParts"); +female = optionExists("female"); +if (argc < 3) + usage(); +checkCoverageGaps(argv[1], argc-2, argv+2); +return 0; +}