src/hg/encode/validateFiles/validateFiles.c 1.34

1.34 2009/12/17 18:46:57 tdreszer
Added the 'privateData' option, and now require tab-delimited columns instead of whitespace-delimited columns
Index: src/hg/encode/validateFiles/validateFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/validateFiles/validateFiles.c,v
retrieving revision 1.33
retrieving revision 1.34
diff -b -B -U 4 -r1.33 -r1.34
--- src/hg/encode/validateFiles/validateFiles.c	15 Dec 2009 23:50:33 -0000	1.33
+++ src/hg/encode/validateFiles/validateFiles.c	17 Dec 2009 18:46:57 -0000	1.34
@@ -24,8 +24,9 @@
 boolean printFailLines;
 boolean mmPerPair;
 boolean nMatch;
 boolean isSort;
+boolean privateData;
 int quick;
 struct hash *chrHash = NULL;
 char dnaChars[256];
 char qualChars[256];
@@ -85,14 +86,15 @@
   "   -matchFirst=n                only check the first N bases of the sequence\n"
   "   -mmPerPair                   Check either pair dont exceed mismatch count if validating\n"
   "                                  pairedTagAlign files (default is the total for the pair)\n"
   "   -mmCheckOneInN=n             Check mismatches in only one in 'n' lines (default=1, all)\n"
+  "   -nMatch                      N's do not count as a mismatch\n"
+  "   -privateData                 Private data so empty sequence is tolerated\n"
   "   -printOkLines                Print lines which pass validation to stdout\n"
   "   -quick[=N]                   Just test the first N lines of each file (default 1000)\n"
   "   -printFailLines              Print lines which fail validation to stdout\n"
   "   -isSort                      input is sorted by chrom\n"
 //"   -acceptDot                   Accept '.' as 'N' in DNA sequence\n"
-  "   -nMatch                      N's do not count as a mismatch\n"
   "   -version                     Print version\n"
   , MAX_ERRORS);
 }
 
@@ -111,8 +113,9 @@
    {"mmPerPair", OPTION_BOOLEAN},
    {"mmCheckOneInN", OPTION_INT},
    {"quick", OPTION_INT},
    {"nMatch", OPTION_BOOLEAN},
+   {"privateData", OPTION_BOOLEAN},
 // {"acceptDot", OPTION_BOOLEAN},
    {"isSort", OPTION_BOOLEAN},
    {"version", OPTION_BOOLEAN},
    {NULL, 0},
@@ -294,14 +297,23 @@
 	}
     }
 if (i == 0)
     {
+    if(privateData)  // PrivateData means sequence should be empty
+        return TRUE;
     if (s==row)
 	warn("Error [file=%s, line=%d]: %s empty", file, line, name);
     else
 	warn("Error [file=%s, line=%d]: %s empty in line [%s]", file, line, name, row);
     return FALSE;
     }
+else if(privateData) { // PrivateData means sequence should be empty
+    if (s==row)
+        warn("Error [file=%s, line=%d]: %s is not empty but this should be private data", file, line, name);
+    else
+        warn("Error [file=%s, line=%d]: %s  is not empty but this should be private data in line [%s]", file, line, name, row);
+    return FALSE;
+    }
 return TRUE;
 }
 
 boolean checkSeqName(char *file, int line, char *s, char firstChar, char *name)
@@ -561,9 +573,9 @@
 boolean checkColumns(char *file, int line, char *row, char *buf, char *words[], int wordSize, int expected)
 // Split buf into wordSize columns in words[] array
 // Return TRUE if number of columns == expected, otherwise FALSE
 {
-int n = chopByWhite(buf, words, wordSize);
+int n = chopByChar(buf, '\t', words, wordSize);
 if ( n != expected)
     {
     warn("Error [file=%s, line=%d]: found %d columns, expected %d [%s]", file, line, n, expected, row);
     return FALSE;
@@ -580,8 +592,11 @@
 static char bigArr[100 * 1024]; // 100K limit on tagAlign seqLen
 struct dnaSeq ourSeq;
 boolean chrMSizeAjustment=FALSE;
 
+if(privateData)  // No way to check private data
+    return TRUE;
+
 if (!genome)
     return TRUE; // only check if 2bit file specified
 if (line % mmCheckOneInN != 0)
     return TRUE; // dont check if this is not one in N
@@ -655,8 +670,10 @@
 boolean checkMismatchesSeq1Seq2(char *file, int line, char *chrom, unsigned chromStart, unsigned chromEnd, char strand, char *seq1, char *seq2)
 {
 int i, mm1, mm2, len1, len2;
 struct dnaSeq *g1, *g2;
+if(privateData)  // No way to check private data
+    return TRUE;
 if (!genome)
     return TRUE; // dont check unless 2bit file specified
 if (line % mmCheckOneInN != 0)
     return TRUE; // dont check if this is not one in N
@@ -1108,8 +1125,9 @@
 mismatches     = optionInt("mismatches",0);
 matchFirst     = optionInt("matchFirst",0);
 mmPerPair      = optionExists("mmPerPair");
 nMatch         = optionExists("nMatch");
+privateData    = optionExists("privateData");
 isSort         = optionExists("isSort");
 mmCheckOneInN  = optionInt("mmCheckOneInN", 1);
 quick          = optionExists("quick") ? optionInt("quick",QUICK_DEFAULT) : 0;
 colorSpace     = optionExists("colorSpace") || sameString(type, "csfasta");