src/hg/encode/validateFiles/validateFiles.c 1.6
1.6 2009/03/13 16:45:26 mikep
Add -colorSpace option: colorspace chars 0-3 are accepted as DNA only when it is set
Index: src/hg/encode/validateFiles/validateFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/validateFiles/validateFiles.c,v
retrieving revision 1.5
retrieving revision 1.6
diff -b -B -U 4 -r1.5 -r1.6
--- src/hg/encode/validateFiles/validateFiles.c 13 Mar 2009 16:39:21 -0000 1.5
+++ src/hg/encode/validateFiles/validateFiles.c 13 Mar 2009 16:45:26 -0000 1.6
@@ -10,8 +10,9 @@
static char *version = "$Revision$";
#define MAX_ERRORS 10
int maxErrors;
+boolean colorSpace;
boolean zeroSizeOk;
boolean printOkLines;
boolean printFailLines;
struct hash *chrHash = NULL;
@@ -32,8 +33,9 @@
"options:\n"
" -type=(fastq|csfasta|tagAlign|pairedTagAlign)\n"
" csfasta = Colorspace fasta (SOLiD platform)\n"
" -chromInfo=file.txt Specify chromInfo file to validate chrom names and sizes\n"
+ " -colorSpace Sequences are colorspace 0-3 values\n"
" -maxErrors=N Maximum lines with errors to report in one file before \n"
" stopping (default %d)\n"
" -zeroSizeOk For BED-type positional data, allow rows with start==end\n"
" otherwise require start < end\n"
@@ -46,8 +48,9 @@
static struct optionSpec options[] = {
{"type", OPTION_STRING},
{"chromInfo", OPTION_STRING},
{"maxErrors", OPTION_INT},
+ {"colorSpace", OPTION_BOOLEAN},
{"zeroSizeOk", OPTION_BOOLEAN},
{"printOkLines", OPTION_BOOLEAN},
{"printFailLines", OPTION_BOOLEAN},
{"version", OPTION_BOOLEAN},
@@ -55,18 +58,21 @@
};
void initArrays()
// Set up array of chars
-// dnaChars: or DNA chars include colorspace 0-3 as valid dna sequences for SOLiD data
+// dnaChars: DNA chars ACGTNacgtn, and optionally include colorspace 0-3
// qualChars: fastq quality scores as ascii [!-~] (ord(!)=33, ord(~)=126)
// seqName: fastq sequence name chars [A-Za-z0-9_.:/-]
{
int i;
for (i=0 ; i < 256 ; ++i)
dnaChars[i] = qualChars[i] = seqName[i] = csSeqName[i] = digits[i] = alpha[i] = 0;
dnaChars['a'] = dnaChars['c'] = dnaChars['g'] = dnaChars['t'] = dnaChars['n'] = 1;
dnaChars['A'] = dnaChars['C'] = dnaChars['G'] = dnaChars['T'] = dnaChars['N'] = 1;
-dnaChars['0'] = dnaChars['1'] = dnaChars['2'] = dnaChars['3'] = 1;
+if (colorSpace)
+ {
+ dnaChars['0'] = dnaChars['1'] = dnaChars['2'] = dnaChars['3'] = 1;
+ }
for (i= (int)'A' ; i <= (int)'Z' ; ++i)
seqName[i] = seqName[i+(int)('a'-'A')] = alpha[i] = alpha[i+(int)('a'-'A')] = 1;
for (i= (int)'0' ; i <= (int)'9' ; ++i)
seqName[i] = digits[i] = csSeqName[i] = 1;
@@ -524,9 +530,8 @@
struct hash *funcs = newHash(0);
optionInit(&argc, argv, options);
++argv;
--argc;
-initArrays();
if (optionExists("version"))
errAbort(version);
if (argc==0)
usage();
@@ -536,8 +541,10 @@
maxErrors = optionInt("maxErrors", MAX_ERRORS);
zeroSizeOk = optionExists("zeroSizeOk");
printOkLines = optionExists("printOkLines");
printFailLines = optionExists("printFailLines");
+colorSpace = optionExists("colorSpace");
+initArrays();
if (strlen(optionVal("chromInfo", "")) > 0)
{
if (!(ci = chromInfoLoadAll(optionVal("chromInfo", ""))))
errAbort("could not load chromInfo file %s\n", optionVal("chromInfo", ""));