src/hg/encode/validateFiles/validateFiles.c 1.6

1.6 2009/03/13 16:45:26 mikep
provide colorspace as option to include chars 0-3 in DNA
Index: src/hg/encode/validateFiles/validateFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/validateFiles/validateFiles.c,v
retrieving revision 1.5
retrieving revision 1.6
diff -b -B -U 4 -r1.5 -r1.6
--- src/hg/encode/validateFiles/validateFiles.c	13 Mar 2009 16:39:21 -0000	1.5
+++ src/hg/encode/validateFiles/validateFiles.c	13 Mar 2009 16:45:26 -0000	1.6
@@ -10,8 +10,9 @@
 static char *version = "$Revision$";
 
 #define MAX_ERRORS 10
 int maxErrors;
+boolean colorSpace;
 boolean zeroSizeOk;
 boolean printOkLines;
 boolean printFailLines;
 struct hash *chrHash = NULL;
@@ -32,8 +33,9 @@
   "options:\n"
   "   -type=(fastq|csfasta|tagAlign|pairedTagAlign)\n"
   "                                csfasta = Colorspace fasta (SOLiD platform)\n"
   "   -chromInfo=file.txt          Specify chromInfo file to validate chrom names and sizes\n"
+  "   -colorSpace                  Sequences are colorspace 0-3 values\n"
   "   -maxErrors=N                 Maximum lines with errors to report in one file before \n"
   "                                  stopping (default %d)\n"
   "   -zeroSizeOk                  For BED-type positional data, allow rows with start==end\n"
   "                                  otherwise require start < end\n"
@@ -46,8 +48,9 @@
 static struct optionSpec options[] = {
    {"type", OPTION_STRING},
    {"chromInfo", OPTION_STRING},
    {"maxErrors", OPTION_INT},
+   {"colorSpace", OPTION_BOOLEAN},
    {"zeroSizeOk", OPTION_BOOLEAN},
    {"printOkLines", OPTION_BOOLEAN},
    {"printFailLines", OPTION_BOOLEAN},
    {"version", OPTION_BOOLEAN},
@@ -55,18 +58,21 @@
 };
 
 void initArrays()
 // Set up array of chars
-// dnaChars:  or DNA chars include colorspace 0-3 as valid dna sequences for SOLiD data
+// dnaChars:  DNA chars ACGTNacgtn, and optionally include colorspace 0-3
 // qualChars: fastq quality scores as ascii [!-~] (ord(!)=33, ord(~)=126)
 // seqName:   fastq sequence name chars [A-Za-z0-9_.:/-]
 {
 int i;
 for (i=0 ; i < 256 ; ++i)
     dnaChars[i] = qualChars[i] = seqName[i] = csSeqName[i] = digits[i] = alpha[i] = 0;
 dnaChars['a'] = dnaChars['c'] = dnaChars['g'] = dnaChars['t'] = dnaChars['n'] = 1;
 dnaChars['A'] = dnaChars['C'] = dnaChars['G'] = dnaChars['T'] = dnaChars['N'] = 1;
-dnaChars['0'] = dnaChars['1'] = dnaChars['2'] = dnaChars['3'] = 1;
+if (colorSpace)
+    {
+    dnaChars['0'] = dnaChars['1'] = dnaChars['2'] = dnaChars['3'] = 1;
+    }
 for (i= (int)'A' ; i <= (int)'Z' ; ++i)
     seqName[i] = seqName[i+(int)('a'-'A')] = alpha[i] = alpha[i+(int)('a'-'A')] = 1;
 for (i= (int)'0' ; i <= (int)'9' ; ++i)
     seqName[i] = digits[i] = csSeqName[i] = 1;
@@ -524,9 +530,8 @@
 struct hash *funcs = newHash(0);
 optionInit(&argc, argv, options);
 ++argv; 
 --argc;
-initArrays();
 if (optionExists("version"))
     errAbort(version);
 if (argc==0)
     usage();
@@ -536,8 +541,10 @@
 maxErrors      = optionInt("maxErrors", MAX_ERRORS);
 zeroSizeOk     = optionExists("zeroSizeOk");
 printOkLines   = optionExists("printOkLines");
 printFailLines = optionExists("printFailLines");
+colorSpace     = optionExists("colorSpace");
+initArrays();
 if (strlen(optionVal("chromInfo", "")) > 0)
     {
     if (!(ci = chromInfoLoadAll(optionVal("chromInfo", ""))))
 	errAbort("could not load chromInfo file %s\n", optionVal("chromInfo", ""));