src/hg/encode/validateFiles/validateFiles.c 1.16
1.16 2009/04/03 17:11:08 mikep
Add -quick option for quick checking of files (first N lines of each, default 1000); add -chrMSizeOk option to allow chrM end coordinates beyond chromSize
Index: src/hg/encode/validateFiles/validateFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/validateFiles/validateFiles.c,v
retrieving revision 1.15
retrieving revision 1.16
diff -b -B -U 4 -r1.15 -r1.16
--- src/hg/encode/validateFiles/validateFiles.c 2 Apr 2009 22:49:46 -0000 1.15
+++ src/hg/encode/validateFiles/validateFiles.c 3 Apr 2009 17:11:08 -0000 1.16
@@ -11,16 +11,19 @@
#define MAX_ERRORS 10
#define PEAK_WORDS 16
#define TAG_WORDS 9
+#define QUICK_DEFAULT 1000
enum bedType {BED_GRAPH = 0, BROAD_PEAK, NARROW_PEAK, GAPPED_PEAK};
int maxErrors;
boolean colorSpace;
boolean zeroSizeOk;
+boolean chrMSizeOk;
boolean printOkLines;
boolean printFailLines;
+int quick;
struct hash *chrHash = NULL;
char dnaChars[256];
char qualChars[256];
char csQualChars[256];
@@ -63,8 +66,9 @@
" stopping (default %d)\n"
" -zeroSizeOk For BED-type positional data, allow rows with start==end\n"
" otherwise require strictly start < end\n"
" -printOkLines Print lines which pass validation to stdout\n"
+ " -quick[=N] Just test the first N lines of each file (default 1000)\n"
" -printFailLines Print lines which fail validation to stdout\n"
" -version Print version\n"
, MAX_ERRORS);
}
@@ -75,10 +79,12 @@
{"chromInfo", OPTION_STRING},
{"maxErrors", OPTION_INT},
{"colorSpace", OPTION_BOOLEAN},
{"zeroSizeOk", OPTION_BOOLEAN},
+ {"chrMSizeOk", OPTION_BOOLEAN},
{"printOkLines", OPTION_BOOLEAN},
{"printFailLines", OPTION_BOOLEAN},
+ {"quick", OPTION_INT},
{"version", OPTION_BOOLEAN},
{NULL, 0},
};
@@ -392,8 +398,9 @@
boolean checkStartEnd(char *file, int line, char *row, char *start, char *end, char *chrom, unsigned chromSize)
// Return TRUE if start and end are both >= 0,
// and if zeroSizeOk then start <= end
// otherwise then start < end
+// Also check end <= chromSize (as a special case, ignore chrM end if chrMSizeOk)
// Othewise print warning and return FALSE
{
verbose(3,"[%s %3d] inputLine=%d [%s..%s] (chrom=%s,size=%u) [%s]\n", __func__, __LINE__, line, start, end, chrom, chromSize, row);
unsigned s, e;
@@ -401,9 +408,9 @@
|| !checkUnsigned(file, line, row, end, &e, "chromEnd"))
return FALSE;
if (chromSize > 0)
{
- if (e > chromSize)
+ if (e > chromSize && !(chrMSizeOk && sameString(chrom, "chrM")))
{
warn("Error [file=%s, line=%d]: end(%u) > chromSize(%s=%u) [%s]", file, line, e, chrom, chromSize, row);
return FALSE;
}
@@ -531,10 +538,13 @@
dnaChars[(int)'.'] = 1;
verbose(2,"[%s %3d] paired=%d file(%s)\n", __func__, __LINE__, paired, file);
while (lineFileNext(lf, &row, &size))
{
+ ++line;
+ if (quick && line > quick)
+ break;
safecpy(buf, sizeof(buf), row);
- if ( checkColumns(file, ++line, row, buf, words, TAG_WORDS, (paired ? 8 : 6))
+ if ( checkColumns(file, line, row, buf, words, TAG_WORDS, (paired ? 8 : 6))
&& checkChrom(file, line, row, words[0], &chromSize)
&& checkStartEnd(file, line, row, words[1], words[2], words[0], chromSize)
&& checkIntBetween(file, line, row, words[4], "score", 0, 1000)
&& checkStrand(file, line, row, words[5])
@@ -591,10 +601,13 @@
int gappedOffset = (type == GAPPED_PEAK ? 6 : 0);
verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, file);
while (lineFileNext(lf, &row, &size))
{
+ ++line;
+ if (quick && line > quick)
+ break;
safecpy(buf, sizeof(buf), row);
- if ( checkColumns(file, ++line, row, buf, words, PEAK_WORDS, bedTypeCols[type])
+ if ( checkColumns(file, line, row, buf, words, PEAK_WORDS, bedTypeCols[type])
&& checkChrom(file, line, row, words[0], &chromSize)
&& checkStartEnd(file, line, row, words[1], words[2], words[0], chromSize)
&& ( type == BED_GRAPH ?
(checkFloat(file, line, row, words[3], "value")) // canonical bedGraph has float in 4th column
@@ -660,8 +673,10 @@
verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, file);
while ( lineFileNext(lf, &seqName, NULL))
{
++line;
+ if (quick && line > quick)
+ break;
if (startOfFile)
{
if (*seqName == '#')
continue;
@@ -701,8 +716,10 @@
verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, file);
while ( lineFileNext(lf, &seqName, NULL))
{
++line;
+ if (quick && line > quick)
+ break;
if (startOfFile)
{
if (*seqName == '#')
continue;
@@ -750,8 +767,10 @@
verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, file);
while (lineFileNext(lf, &seqName, NULL))
{
++line;
+ if (quick && line > quick)
+ break;
if (startOfFile)
{
if (*seqName == '#')
continue;
@@ -799,8 +818,10 @@
verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, file);
while (lineFileNext(lf, &seqName, NULL))
{
++line;
+ if (quick && line > quick)
+ break;
if (startOfFile)
{
if (*seqName == '#')
continue;
@@ -873,10 +894,12 @@
if (strlen(type) == 0)
errAbort("please specify type");
maxErrors = optionInt("maxErrors", MAX_ERRORS);
zeroSizeOk = optionExists("zeroSizeOk");
+chrMSizeOk = optionExists("chrMSizeOk");
printOkLines = optionExists("printOkLines");
printFailLines = optionExists("printFailLines");
+quick = optionExists("quick") ? optionInt("quick",QUICK_DEFAULT) : 0;
colorSpace = optionExists("colorSpace") || sameString(type, "csfasta");
initArrays();
// Get chromInfo from DB or file
if ( (chromDb = optionVal("chromDb", NULL)) != NULL)