src/hg/encode/validateFiles/validateFiles.c 1.16

1.16 2009/04/03 17:11:08 mikep
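Add -quick[=N] option for quick checking of files (only the first N lines of each file are tested; default 1000). Also add -chrMSizeOk option, which allows chrM rows whose end exceeds chromSize.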
Index: src/hg/encode/validateFiles/validateFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/validateFiles/validateFiles.c,v
retrieving revision 1.15
retrieving revision 1.16
diff -b -B -U 4 -r1.15 -r1.16
--- src/hg/encode/validateFiles/validateFiles.c	2 Apr 2009 22:49:46 -0000	1.15
+++ src/hg/encode/validateFiles/validateFiles.c	3 Apr 2009 17:11:08 -0000	1.16
@@ -11,16 +11,19 @@
 
 #define MAX_ERRORS 10
 #define PEAK_WORDS 16
 #define TAG_WORDS 9
+#define QUICK_DEFAULT 1000
 
 enum bedType {BED_GRAPH = 0, BROAD_PEAK, NARROW_PEAK, GAPPED_PEAK};
 
 int maxErrors;
 boolean colorSpace;
 boolean zeroSizeOk;
+boolean chrMSizeOk;
 boolean printOkLines;
 boolean printFailLines;
+int quick;
 struct hash *chrHash = NULL;
 char dnaChars[256];
 char qualChars[256];
 char csQualChars[256];
@@ -63,8 +66,9 @@
   "                                  stopping (default %d)\n"
   "   -zeroSizeOk                  For BED-type positional data, allow rows with start==end\n"
   "                                  otherwise require strictly start < end\n"
   "   -printOkLines                Print lines which pass validation to stdout\n"
+  "   -quick[=N]                   Just test the first N lines of each file (default 1000)\n"
   "   -printFailLines              Print lines which fail validation to stdout\n"
   "   -version                     Print version\n"
   , MAX_ERRORS);
 }
@@ -75,10 +79,12 @@
    {"chromInfo", OPTION_STRING},
    {"maxErrors", OPTION_INT},
    {"colorSpace", OPTION_BOOLEAN},
    {"zeroSizeOk", OPTION_BOOLEAN},
+   {"chrMSizeOk", OPTION_BOOLEAN},
    {"printOkLines", OPTION_BOOLEAN},
    {"printFailLines", OPTION_BOOLEAN},
+   {"quick", OPTION_INT},
    {"version", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
@@ -392,8 +398,9 @@
 boolean checkStartEnd(char *file, int line, char *row, char *start, char *end, char *chrom, unsigned chromSize)
 // Return TRUE if start and end are both >= 0,
 // and if zeroSizeOk then start <= end 
 //        otherwise  then start < end
+// Also check end <= chromSize (as a special case, ignore chrM end if chrMSizeOk)
 // Othewise print warning and return FALSE
 {
 verbose(3,"[%s %3d] inputLine=%d [%s..%s] (chrom=%s,size=%u) [%s]\n", __func__, __LINE__, line, start, end, chrom, chromSize, row);
 unsigned s, e; 
@@ -401,9 +408,9 @@
     || !checkUnsigned(file, line, row, end, &e, "chromEnd"))
     return FALSE;
 if (chromSize > 0)
     {
-    if (e > chromSize)
+    if (e > chromSize && !(chrMSizeOk && sameString(chrom, "chrM")))
 	{
 	warn("Error [file=%s, line=%d]: end(%u) > chromSize(%s=%u) [%s]", file, line, e, chrom, chromSize, row);
 	return FALSE;
 	}
@@ -531,10 +538,13 @@
 dnaChars[(int)'.'] = 1; 
 verbose(2,"[%s %3d] paired=%d file(%s)\n", __func__, __LINE__, paired, file);
 while (lineFileNext(lf, &row, &size))
     {
+    ++line;
+    if (quick && line > quick)
+	break;
     safecpy(buf, sizeof(buf), row);
-    if ( checkColumns(file, ++line, row, buf, words, TAG_WORDS, (paired ? 8 : 6))
+    if ( checkColumns(file, line, row, buf, words, TAG_WORDS, (paired ? 8 : 6))
 	&& checkChrom(file, line, row, words[0], &chromSize)
 	&& checkStartEnd(file, line, row, words[1], words[2], words[0], chromSize)
 	&& checkIntBetween(file, line, row, words[4], "score", 0, 1000)
 	&& checkStrand(file, line, row, words[5])
@@ -591,10 +601,13 @@
 int gappedOffset = (type == GAPPED_PEAK ? 6 : 0);
 verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, file);
 while (lineFileNext(lf, &row, &size))
     {
+    ++line;
+    if (quick && line > quick)
+	break;
     safecpy(buf, sizeof(buf), row);
-    if ( checkColumns(file, ++line, row, buf, words, PEAK_WORDS, bedTypeCols[type])
+    if ( checkColumns(file, line, row, buf, words, PEAK_WORDS, bedTypeCols[type])
 	&& checkChrom(file, line, row, words[0], &chromSize)
 	&& checkStartEnd(file, line, row, words[1], words[2], words[0], chromSize)
 	&& ( type == BED_GRAPH ? 
 	      (checkFloat(file, line, row, words[3], "value")) // canonical bedGraph has float in 4th column 
@@ -660,8 +673,10 @@
 verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, file);
 while ( lineFileNext(lf, &seqName, NULL))
     {
     ++line;
+    if (quick && line > quick)
+	break;
     if (startOfFile)
 	{
 	if (*seqName == '#')
 	    continue;
@@ -701,8 +716,10 @@
 verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, file);
 while ( lineFileNext(lf, &seqName, NULL))
     {
     ++line;
+    if (quick && line > quick)
+	break;
     if (startOfFile)
 	{
 	if (*seqName == '#')
 	    continue;
@@ -750,8 +767,10 @@
 verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, file);
 while (lineFileNext(lf, &seqName, NULL))
     {
     ++line;
+    if (quick && line > quick)
+	break;
     if (startOfFile)
 	{
 	if (*seqName == '#')
 	    continue;
@@ -799,8 +818,10 @@
 verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, file);
 while (lineFileNext(lf, &seqName, NULL))
     {
     ++line;
+    if (quick && line > quick)
+	break;
     if (startOfFile)
 	{
 	if (*seqName == '#')
 	    continue;
@@ -873,10 +894,12 @@
 if (strlen(type) == 0)
     errAbort("please specify type");
 maxErrors      = optionInt("maxErrors", MAX_ERRORS);
 zeroSizeOk     = optionExists("zeroSizeOk");
+chrMSizeOk     = optionExists("chrMSizeOk");
 printOkLines   = optionExists("printOkLines");
 printFailLines = optionExists("printFailLines");
+quick          = optionExists("quick") ? optionInt("quick",QUICK_DEFAULT) : 0;
 colorSpace     = optionExists("colorSpace") || sameString(type, "csfasta");
 initArrays();
 // Get chromInfo from DB or file
 if ( (chromDb = optionVal("chromDb", NULL)) != NULL)