src/hg/encode/validateFiles/validateFiles.c 1.7
1.7 2009/03/13 20:15:25 mikep
improved usage message, continue after error in # columns, add csqual, put colorSpace as option
Index: src/hg/encode/validateFiles/validateFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/validateFiles/validateFiles.c,v
retrieving revision 1.6
retrieving revision 1.7
diff -b -B -U 4 -r1.6 -r1.7
--- src/hg/encode/validateFiles/validateFiles.c 13 Mar 2009 16:45:26 -0000 1.6
+++ src/hg/encode/validateFiles/validateFiles.c 13 Mar 2009 20:15:25 -0000 1.7
@@ -17,8 +17,9 @@
boolean printFailLines;
struct hash *chrHash = NULL;
char dnaChars[256];
char qualChars[256];
+char csQualChars[256];
char seqName[256];
char digits[256];
char alpha[256];
char csSeqName[256];
@@ -26,14 +27,23 @@
void usage()
/* Explain usage and exit. */
{
errAbort(
- "validateFiles - validate format of different track input files\n"
+ "validateFiles - Validate format of different track input files\n"
+ " Program exits with non-zero status if any errors detected\n"
+ " otherwise exits with zero status\n"
+ " Use filename 'stdin' to read from stdin\n"
+ " Files can be in .gz, .bz2, .zip, .Z format and are \n"
+ " automatically decompressed\n"
+ " Multiple input files of the same type can be listed\n"
+ " Error messages are written to stderr\n"
+ " OK or failing file lines can be optionally written to stdout\n"
"usage:\n"
" validateFiles -type=FILE_TYPE file1 [file2 [...]]\n"
"options:\n"
- " -type=(fastq|csfasta|tagAlign|pairedTagAlign)\n"
- " csfasta = Colorspace fasta (SOLiD platform)\n"
+ " -type=(fastq|csfasta|csqual|tagAlign|pairedTagAlign)\n"
+ " csfasta = Colorspace fasta (SOLiD platform) (implies -colorSpace)\n"
+ " csqual = Colorspace quality lines (SOLiD platform)\n"
" -chromInfo=file.txt Specify chromInfo file to validate chrom names and sizes\n"
" -colorSpace Sequences are colorspace 0-3 values\n"
" -maxErrors=N Maximum lines with errors to report in one file before \n"
" stopping (default %d)\n"
@@ -60,13 +70,14 @@
void initArrays()
// Set up array of chars
// dnaChars: DNA chars ACGTNacgtn, and optionally include colorspace 0-3
// qualChars: fastq quality scores as ascii [!-~] (ord(!)=33, ord(~)=126)
+// csQualChars: csfasta quality scores are decimals separated by spaces
// seqName: fastq sequence name chars [A-Za-z0-9_.:/-]
{
int i;
for (i=0 ; i < 256 ; ++i)
- dnaChars[i] = qualChars[i] = seqName[i] = csSeqName[i] = digits[i] = alpha[i] = 0;
+ dnaChars[i] = qualChars[i] = csQualChars[i] = seqName[i] = csSeqName[i] = digits[i] = alpha[i] = 0;
dnaChars['a'] = dnaChars['c'] = dnaChars['g'] = dnaChars['t'] = dnaChars['n'] = 1;
dnaChars['A'] = dnaChars['C'] = dnaChars['G'] = dnaChars['T'] = dnaChars['N'] = 1;
if (colorSpace)
{
@@ -74,11 +85,12 @@
}
for (i= (int)'A' ; i <= (int)'Z' ; ++i)
seqName[i] = seqName[i+(int)('a'-'A')] = alpha[i] = alpha[i+(int)('a'-'A')] = 1;
for (i= (int)'0' ; i <= (int)'9' ; ++i)
- seqName[i] = digits[i] = csSeqName[i] = 1;
+ seqName[i] = digits[i] = csSeqName[i] = csQualChars[i] = 1;
seqName['_'] = seqName['.'] = seqName[':'] = seqName['/'] = seqName['-'] = 1;
csSeqName[','] = csSeqName['.'] = csSeqName['-'] = csSeqName['#'] = 1;
+csQualChars[' '] = 1;
for (i= (int)'!' ; i <= (int)'~' ; ++i)
qualChars[i] = 1;
}
@@ -150,15 +162,21 @@
for ( i = 0; s[i] ; ++i)
{
if (!dnaChars[(int)s[i]])
{
+ if (s==row)
+ warn("Error [file=%s, line=%d]: invalid DNA chars in %s(%s)", file, line, name, s);
+ else
warn("Error [file=%s, line=%d]: invalid DNA chars in %s(%s) [%s]", file, line, name, s, row);
return FALSE;
}
}
if (i == 0)
{
- warn("Error [file=%s, line=%d]: %s column empty [%s]", file, line, name, row);
+ if (s==row)
+ warn("Error [file=%s, line=%d]: %s empty", file, line, name);
+ else
+ warn("Error [file=%s, line=%d]: %s empty in line [%s]", file, line, name, row);
return FALSE;
}
return TRUE;
}
@@ -273,8 +291,29 @@
}
return TRUE;
}
+boolean checkCsQual(char *file, int line, char *s)
+// Return TRUE if string has non-zero length and contains quality scores
+// Othewise print warning that quality is empty and return FALSE
+{
+int i;
+for ( i = 0; s[i] ; ++i)
+ {
+ if (!csQualChars[(int)s[i]])
+ {
+ warn("Error [file=%s, line=%d]: invalid colorspace quality chars in [%s]", file, line, s);
+ return FALSE;
+ }
+ }
+if (i == 0)
+ {
+ warn("Error [file=%s, line=%d]: colorspace quality empty [%s]", file, line, s);
+ return FALSE;
+ }
+return TRUE;
+}
+
boolean checkStartEnd(char *file, int line, char *row, char *start, char *end, char *chrom, unsigned chromSize)
// Return TRUE if start and end are both >= 0,
// and if zeroSizeOk then start <= end
// otherwise then start < end
@@ -343,8 +382,29 @@
warn("Error [file=%s, line=%d]: invalid strand '%s' (want '+','-','.') [%s]", file, line, strand, row);
return FALSE;
}
+boolean wantNewLine(struct lineFile *lf, char *file, int line, char **row, char *msg)
+{
+boolean res = lineFileNext(lf, row, NULL);
+if (!res)
+ warn("Error [file=%s, line=%d]: %s not found", file, line, msg);
+return res;
+}
+
+boolean checkColumns(char *file, int line, char *row, char *buf, char *words[], int wordSize, int expected)
+// Split buf into wordSize columns in words[] array
+// Return TRUE if number of columns == expected, otherwise FALSE
+{
+int n = chopByWhite(buf, words, wordSize);
+if ( n != expected)
+ {
+ warn("Error [file=%s, line=%d]: found %d columns, expected %d [%s]", file, line, n, expected, row);
+ return FALSE;
+ }
+return TRUE;
+}
+
int validateTagOrPairedTagAlign(struct lineFile *lf, char *file, boolean paired)
{
char *row;
char buf[1024];
@@ -355,14 +415,11 @@
int size;
verbose(2,"[%s %3d] paired=%d file(%s)\n", __func__, __LINE__, paired, file);
while (lineFileNext(lf, &row, &size))
{
- ++line;
safecpy(buf, sizeof(buf), row);
- int n = chopByWhite(buf, words, 9);
- if ( n != (paired ? 8 : 6))
- errAbort("Error: found %d columns, expected %d [%s]\n", n, (paired ? 8 : 6), row);
- if (checkChrom(file, line, row, words[0], &chromSize)
+ if ( checkColumns(file, ++line, row, buf, words, 9, (paired ? 8 : 6))
+ && checkChrom(file, line, row, words[0], &chromSize)
&& checkStartEnd(file, line, row, words[1], words[2], words[0], chromSize)
&& checkIntBetween(file, line, row, words[4], "score", 0, 1000)
&& checkStrand(file, line, row, words[5])
&& (paired ?
@@ -428,14 +485,14 @@
else
startOfFile = FALSE;
}
if (checkSeqName(file, line, seqName, '@', "sequence name")
- && (lineFileNext(lf, &seq, NULL))
- && checkSeq(file, ++line, seq, seq, "sequence")
- && (lineFileNext(lf, &qName, NULL))
- && checkSeqName(file, ++line, qName, '+', "quality name")
- && (lineFileNext(lf, &qual, NULL))
- && checkQual(file, ++line, qual) )
+ && (wantNewLine(lf, file, ++line, &seq, "fastq sequence line"))
+ && checkSeq(file, line, seq, seq, "sequence")
+ && (wantNewLine(lf, file, ++line, &qName, "fastq sequence name (quality line)"))
+ && checkSeqName(file, line, qName, '+', "quality name")
+ && (wantNewLine(lf, file, ++line, &qual, "quality line"))
+ && checkQual(file, line, qual) )
{
if (printOkLines)
printf("%s\n%s\n%s\n%s\n", seqName, seq, qName, qual);
}
@@ -449,15 +506,18 @@
}
return errs;
}
-// CS Fasta:
-// >461_19_209_F3
-// T022213002230311203200200322000
-// >920_22_656_F3,1.-152654094.1.35.35.0###,19.43558664.1.35.35.0###
-// T01301010111200210102321210100112312
+/* Syntax per http://marketing.appliedbiosystems.com/mk/submit/SOLID_KNOWLEDGE_RD?_JS=T&rd=dm
+CS Fasta:
+>461_19_209_F3
+T022213002230311203200200322000
+>920_22_656_F3,1.-152654094.1.35.35.0###,19.43558664.1.35.35.0###
+T01301010111200210102321210100112312
+*/
int validateCsfasta(struct lineFile *lf, char *file)
+// Validate Colorspace fasta files
{
char *seqName = NULL;
char *seq = NULL;
int line = 0;
@@ -474,10 +534,10 @@
else
startOfFile = FALSE;
}
if (checkCsSeqName(file, line, seqName)
- && (lineFileNext(lf, &seq, NULL))
- && checkSeq(file, ++line, seq, seq, "sequence") )
+ && (wantNewLine(lf, file, ++line, &seq, "colorspace sequence name"))
+ && checkSeq(file, line, seq, seq, "colorspace sequence") )
{
if (printOkLines)
printf("%s\n%s\n", seqName, seq);
}
@@ -491,8 +551,57 @@
}
return errs;
}
+
+/* Syntax per http://marketing.appliedbiosystems.com/mk/submit/SOLID_KNOWLEDGE_RD?_JS=T&rd=dm
+ Sample:-
+
+# Cwd: /home/pipeline
+# Title: S0033_20080723_2_I22_EA_
+>461_19_90_F3
+20 10 8 13 8 10 20 7 7 24 15 22 21 14 14 8 11 15 5 20 6 5 8 22 6 24 3 16 7 11
+>461_19_209_F3
+16 8 5 12 20 24 19 8 13 17 11 23 8 24 8 7 17 4 20 8 29 7 3 16 3 4 8 20 17 9
+*/
+
+int validateCsqual(struct lineFile *lf, char *file)
+// Validate Colorspace quality files
+{
+char *seqName = NULL;
+char *qual = NULL;
+int line = 0;
+int errs = 0;
+boolean startOfFile = TRUE;
+verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, file);
+while (lineFileNext(lf, &seqName, NULL))
+ {
+ ++line;
+ if (startOfFile)
+ {
+ if (*seqName == '#')
+ continue;
+ else
+ startOfFile = FALSE;
+ }
+ if (checkCsSeqName(file, line, seqName)
+ && (wantNewLine(lf, file, ++line, &qual, "colorspace quality line"))
+ && checkCsQual(file, line, qual) )
+ {
+ if (printOkLines)
+ printf("%s\n%s\n", seqName, qual);
+ }
+ else
+ {
+ if (printFailLines)
+ printf("%s\n%s\n", seqName, qual);
+ if (++errs >= maxErrors)
+ errAbort("Aborting .. found %d errors\n", errs);
+ }
+ }
+return errs;
+}
+
void validateFiles(int (*validate)(struct lineFile *lf, char *file), int numFiles, char *files[])
/* validateFile - validate format of different track input files. */
{
int i;
@@ -541,9 +650,9 @@
maxErrors = optionInt("maxErrors", MAX_ERRORS);
zeroSizeOk = optionExists("zeroSizeOk");
printOkLines = optionExists("printOkLines");
printFailLines = optionExists("printFailLines");
-colorSpace = optionExists("colorSpace");
+colorSpace = optionExists("colorSpace") || sameString(type, "csfasta");
initArrays();
if (strlen(optionVal("chromInfo", "")) > 0)
{
if (!(ci = chromInfoLoadAll(optionVal("chromInfo", ""))))
@@ -556,8 +665,9 @@
hashAdd(funcs, "tagAlign", &validateTagAlign);
hashAdd(funcs, "pairedTagAlign", &validatePairedTagAlign);
hashAdd(funcs, "fastq", &validateFastq);
hashAdd(funcs, "csfasta", &validateCsfasta);
+hashAdd(funcs, "csqual", &validateCsqual);
//hashAdd(funcs, "test", &testFunc);
if (!(func = hashFindVal(funcs, type)))
errAbort("Cannot validate %s type files\n", type);
validateFiles(func, argc, argv);