src/hg/encode/validateFiles/validateFiles.c 1.9
1.9 2009/03/13 21:36:00 mikep
adding narrow and gapped peaks
Index: src/hg/encode/validateFiles/validateFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/validateFiles/validateFiles.c,v
retrieving revision 1.8
retrieving revision 1.9
diff -b -B -U 4 -r1.8 -r1.9
--- src/hg/encode/validateFiles/validateFiles.c 13 Mar 2009 20:52:41 -0000 1.8
+++ src/hg/encode/validateFiles/validateFiles.c 13 Mar 2009 21:36:00 -0000 1.9
@@ -12,8 +12,10 @@
#define MAX_ERRORS 10
#define PEAK_WORDS 16
#define TAG_WORDS 9
+enum peakType {BROAD_PEAK, NARROW_PEAK, GAPPED_PEAK};
+
int maxErrors;
boolean colorSpace;
boolean zeroSizeOk;
boolean printOkLines;
@@ -115,13 +117,11 @@
boolean checkString(char *file, int line, char *row, char *s, char *name)
// Return TRUE if string has non-zero length
// Othewise print warning that name column is empty and return FALSE
{
+verbose(3,"[%s %3d] %s(%s)\n", __func__, __LINE__, name, s);
if (strlen(s) > 0)
- {
- verbose(2,"[%s %3d] %s(%s)\n", __func__, __LINE__, name, s);
return TRUE;
- }
warn("Error [file=%s, line=%d]: %s column empty [%s]", file, line, name, row);
return FALSE;
}
@@ -160,8 +160,9 @@
boolean checkSeq(char *file, int line, char *row, char *s, char *name)
// Return TRUE if string has non-zero length and contains only chars [ACGTNacgtn0-3]
// Othewise print warning that name column is empty and return FALSE
{
+verbose(3,"[%s %3d] inputLine=%d %s seq(%s) [%s]\n", __func__, __LINE__, line, name, s, row);
int i;
for ( i = 0; s[i] ; ++i)
{
if (!dnaChars[(int)s[i]])
@@ -321,11 +322,11 @@
// and if zeroSizeOk then start <= end
// otherwise then start < end
// Othewise print warning and return FALSE
{
+verbose(3,"[%s %3d] inputLine=%d [%s..%s] (chrom=%s,size=%u) [%s]\n", __func__, __LINE__, line, start, end, chrom, chromSize, row);
unsigned s = sqlUnsigned(start);
unsigned e = sqlUnsigned(end);
-verbose(2,"[%s %3d] inputLine=%d [%s..%s] -> [%u..%u] (chrom=%s,size=%u) [%s]\n", __func__, __LINE__, line, start, end, s, e, chrom, chromSize, row);
if (chromSize > 0)
{
if (e > chromSize)
{
@@ -357,8 +358,24 @@
}
return FALSE;
}
+boolean checkPeak(char *file, int line, char *row, char *peak, char *start, char *end)
+// Return TRUE if peak is >= 0 and <= (end-start)
+// Othewise print warning and return FALSE
+{
+verbose(3,"[%s %3d] inputLine=%d peak(%s) (%s,%s) [%s]\n", __func__, __LINE__, line, peak, start, end, row);
+unsigned p = sqlUnsigned(peak);
+unsigned s = sqlUnsigned(start);
+unsigned e = sqlUnsigned(end);
+if (p > e - s)
+ {
+ warn("Error [file=%s, line=%d]: peak(%u) past block length (%u) [%s]", file, line, p, e - s, row);
+ return FALSE;
+ }
+return TRUE;
+}
+
boolean checkIntBetween(char *file, int line, char *row, char *val, char *name, int min, int max)
// Return TRUE if val is integer between min and max
// Othewise print warning and return FALSE
{
@@ -479,17 +496,18 @@
{
return validateTagOrPairedTagAlign(lf, file, TRUE);
}
-int validateBroadPeak(struct lineFile *lf, char *file)
+int validatePeakFormat(struct lineFile *lf, char *file, enum peakType type)
{
char *row;
char buf[1024];
char *words[PEAK_WORDS];
int line = 0;
int errs = 0;
unsigned chromSize;
int size;
+int gappedOffset = (type == GAPPED_PEAK ? 6 : 0);
verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, file);
while (lineFileNext(lf, &row, &size))
{
safecpy(buf, sizeof(buf), row);
@@ -498,11 +516,13 @@
&& checkStartEnd(file, line, row, words[1], words[2], words[0], chromSize)
&& checkString(file, line, row, words[3], "name")
&& checkIntBetween(file, line, row, words[4], "score", 0, 1000)
&& checkStrand(file, line, row, words[5])
- && checkFloat(file, line, row, words[6], "signalValue")
- && checkFloat(file, line, row, words[7], "pValue")
- && checkFloat(file, line, row, words[8], "qValue"))
+// && ((type != GAPPED_PEAK) || ()) // for now dont check all the BED 12 gapped fields
+ && checkFloat(file, line, row, words[6 + gappedOffset], "signalValue")
+ && checkFloat(file, line, row, words[7 + gappedOffset], "pValue")
+ && checkFloat(file, line, row, words[8 + gappedOffset], "qValue")
+ && ((type != NARROW_PEAK) || (checkPeak(file, line, row, words[4], words[1], words[2]))))
{
if (printOkLines)
printf("%s\n", row);
}
@@ -516,8 +536,23 @@
}
return errs;
}
+int validateBroadPeak(struct lineFile *lf, char *file)
+{
+return validatePeakFormat(lf, file, BROAD_PEAK);
+}
+
+int validateNarrowPeak(struct lineFile *lf, char *file)
+{
+return validatePeakFormat(lf, file, NARROW_PEAK);
+}
+
+int validateGappedPeak(struct lineFile *lf, char *file)
+{
+return validatePeakFormat(lf, file, GAPPED_PEAK);
+}
+
// fastq:
// @NINA_1_FC30G3VAAXX:5:1:110:908
// ATCGTCAGGTGGGATAATCCTTACCTTTTCCTCCTC
// +NINA_1_FC30G3VAAXX:5:1:110:908
@@ -722,10 +757,10 @@
hashAdd(funcs, "fastq", &validateFastq);
hashAdd(funcs, "csfasta", &validateCsfasta);
hashAdd(funcs, "csqual", &validateCsqual);
hashAdd(funcs, "broadPeak", &validateBroadPeak);
-//hashAdd(funcs, "narrowPeak", &validateNarrowPeak);
-//hashAdd(funcs, "gappedPeak", &validateGappedPeak);
+hashAdd(funcs, "narrowPeak", &validateNarrowPeak);
+hashAdd(funcs, "gappedPeak", &validateGappedPeak);
//hashAdd(funcs, "test", &testFunc);
if (!(func = hashFindVal(funcs, type)))
errAbort("Cannot validate %s type files\n", type);
validateFiles(func, argc, argv);