src/hg/encode/validateFiles/validateFiles.c 1.8

1.8 2009/03/13 20:52:41 mikep
support for broadPeak
Index: src/hg/encode/validateFiles/validateFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/validateFiles/validateFiles.c,v
retrieving revision 1.7
retrieving revision 1.8
diff -b -B -U 4 -r1.7 -r1.8
--- src/hg/encode/validateFiles/validateFiles.c	13 Mar 2009 20:15:25 -0000	1.7
+++ src/hg/encode/validateFiles/validateFiles.c	13 Mar 2009 20:52:41 -0000	1.8
@@ -9,8 +9,11 @@
 static char const rcsid[] = "$Id$";
 static char *version = "$Revision$";
 
 #define MAX_ERRORS 10
+#define PEAK_WORDS 16
+#define TAG_WORDS 9
+
 int maxErrors;
 boolean colorSpace;
 boolean zeroSizeOk;
 boolean printOkLines;
@@ -39,9 +42,9 @@
   "                OK or failing file lines can be optionally written to stdout\n"
   "usage:\n"
   "   validateFiles -type=FILE_TYPE file1 [file2 [...]]\n"
   "options:\n"
-  "   -type=(fastq|csfasta|csqual|tagAlign|pairedTagAlign)\n"
+  "   -type=(fastq|csfasta|csqual|tagAlign|pairedTagAlign|broadPeak|narrowPeak|gappedPeak)\n"
   "                                csfasta = Colorspace fasta (SOLiD platform) (implies -colorSpace)\n"
   "                                csqual  = Colorspace quality lines (SOLiD platform)\n"
   "   -chromInfo=file.txt          Specify chromInfo file to validate chrom names and sizes\n"
   "   -colorSpace                  Sequences are colorspace 0-3 values\n"
@@ -369,10 +372,25 @@
 warn("Error [file=%s, line=%d]: %s %d outside bounds (%d, %d) [%s]", file, line, name, i, min, max, row);
 return FALSE;
 }
 
+boolean checkFloat(char *file, int line, char *row, char *val, char *name)
+// Return TRUE if strtod() consumes all of val, i.e. val is a floating point number
+// Otherwise print warning (with file, line, name, val and row) and return FALSE
+// taken from sqlNum.c
+{
+char* end;
+strtod(val, &end);  // converted value is discarded; only the stop position 'end' is used
+if ((end == val) || (*end != '\0'))  // nothing was converted, or text remains after the number
+    {
+    warn("Error [file=%s, line=%d]: invalid %s '%s' [%s]", file, line, name, val, row);
+    return FALSE;
+    }
+return TRUE;
+}
+
 boolean checkStrand(char *file, int line, char *row, char *strand)
-// Return TRUE if strand == '+' or '-',
+// Return TRUE if strand == '+' or '-' or '.',
 // Othewise print warning and return FALSE
 {
 if (strlen(strand) == 1 && (*strand == '+' || *strand == '-' || *strand == '.'))
     {
@@ -407,18 +425,18 @@
 int validateTagOrPairedTagAlign(struct lineFile *lf, char *file, boolean paired)
 {
 char *row;
 char buf[1024];
-char *words[9];
+char *words[TAG_WORDS];
 int line = 0;
 int errs = 0;
 unsigned chromSize;
 int size;
 verbose(2,"[%s %3d] paired=%d file(%s)\n", __func__, __LINE__, paired, file);
 while (lineFileNext(lf, &row, &size))
     {
     safecpy(buf, sizeof(buf), row);
-    if ( checkColumns(file, ++line, row, buf, words, 9, (paired ? 8 : 6))
+    if ( checkColumns(file, ++line, row, buf, words, TAG_WORDS, (paired ? 8 : 6))
 	&& checkChrom(file, line, row, words[0], &chromSize)
 	&& checkStartEnd(file, line, row, words[1], words[2], words[0], chromSize)
 	&& checkIntBetween(file, line, row, words[4], "score", 0, 1000)
 	&& checkStrand(file, line, row, words[5])
@@ -461,8 +479,45 @@
 {
 return validateTagOrPairedTagAlign(lf, file, TRUE);
 }
 
+int validateBroadPeak(struct lineFile *lf, char *file)  // Run the broadPeak checks on every line of lf; return the number of failing lines
+{
+char *row;
+char buf[1024];  // per-line copy of row handed to checkColumns together with words
+char *words[PEAK_WORDS];
+int line = 0;  // pre-incremented in the checkColumns call, so the first line is reported as 1
+int errs = 0;  // number of lines that failed at least one check
+unsigned chromSize;  // passed by address to checkChrom, then by value to checkStartEnd
+int size;
+verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, file);
+while (lineFileNext(lf, &row, &size))
+    {
+    safecpy(buf, sizeof(buf), row);  // NOTE(review): presumably checkColumns splits buf in place so row stays intact for messages -- confirm
+    if ( checkColumns(file, ++line, row, buf, words, PEAK_WORDS, 9)  // NOTE(review): 9 is presumably the expected column count -- confirm
+	&& checkChrom(file, line, row, words[0], &chromSize)  // && short-circuits: later checks are skipped after the first failure
+	&& checkStartEnd(file, line, row, words[1], words[2], words[0], chromSize)
+	&& checkString(file, line, row, words[3], "name")
+	&& checkIntBetween(file, line, row, words[4], "score", 0, 1000)
+	&& checkStrand(file, line, row, words[5])
+	&& checkFloat(file, line, row, words[6], "signalValue")
+	&& checkFloat(file, line, row, words[7], "pValue")
+	&& checkFloat(file, line, row, words[8], "qValue"))
+	{
+	if (printOkLines)  // every check passed: optionally echo the line to stdout
+	    printf("%s\n", row);
+	}
+    else
+	{
+	if (printFailLines)  // some check failed: optionally echo the line to stdout
+	    printf("%s\n", row);
+	if (++errs >= maxErrors)  // errAbort is called as soon as the error count reaches maxErrors
+	    errAbort("Aborting .. found %d errors\n", errs);
+	}
+    }
+return errs;
+}
+
 // fastq:
 // @NINA_1_FC30G3VAAXX:5:1:110:908
 // ATCGTCAGGTGGGATAATCCTTACCTTTTCCTCCTC
 // +NINA_1_FC30G3VAAXX:5:1:110:908
@@ -666,8 +721,11 @@
 hashAdd(funcs, "pairedTagAlign", &validatePairedTagAlign);
 hashAdd(funcs, "fastq", &validateFastq);
 hashAdd(funcs, "csfasta", &validateCsfasta);
 hashAdd(funcs, "csqual", &validateCsqual);
+hashAdd(funcs, "broadPeak", &validateBroadPeak);
+//hashAdd(funcs, "narrowPeak", &validateNarrowPeak);
+//hashAdd(funcs, "gappedPeak", &validateGappedPeak);
 //hashAdd(funcs, "test", &testFunc);
 if (!(func = hashFindVal(funcs, type)))
     errAbort("Cannot validate %s type files\n", type);
 validateFiles(func, argc, argv);