src/hg/encode/validateFiles/validateFiles.c 1.34
1.34 2009/12/17 18:46:57 tdreszer
Added 'privateData' option; columns must now be tab-delimited rather than whitespace-delimited
Index: src/hg/encode/validateFiles/validateFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/validateFiles/validateFiles.c,v
retrieving revision 1.33
retrieving revision 1.34
diff -b -B -U 4 -r1.33 -r1.34
--- src/hg/encode/validateFiles/validateFiles.c 15 Dec 2009 23:50:33 -0000 1.33
+++ src/hg/encode/validateFiles/validateFiles.c 17 Dec 2009 18:46:57 -0000 1.34
@@ -24,8 +24,9 @@
boolean printFailLines;
boolean mmPerPair;
boolean nMatch;
boolean isSort;
+boolean privateData;
int quick;
struct hash *chrHash = NULL;
char dnaChars[256];
char qualChars[256];
@@ -85,14 +86,15 @@
" -matchFirst=n only check the first N bases of the sequence\n"
" -mmPerPair Check either pair dont exceed mismatch count if validating\n"
" pairedTagAlign files (default is the total for the pair)\n"
" -mmCheckOneInN=n Check mismatches in only one in 'n' lines (default=1, all)\n"
+ " -nMatch N's do not count as a mismatch\n"
+ " -privateData Private data so empty sequence is tolerated\n"
" -printOkLines Print lines which pass validation to stdout\n"
" -quick[=N] Just test the first N lines of each file (default 1000)\n"
" -printFailLines Print lines which fail validation to stdout\n"
" -isSort input is sorted by chrom\n"
//" -acceptDot Accept '.' as 'N' in DNA sequence\n"
- " -nMatch N's do not count as a mismatch\n"
" -version Print version\n"
, MAX_ERRORS);
}
@@ -111,8 +113,9 @@
{"mmPerPair", OPTION_BOOLEAN},
{"mmCheckOneInN", OPTION_INT},
{"quick", OPTION_INT},
{"nMatch", OPTION_BOOLEAN},
+ {"privateData", OPTION_BOOLEAN},
// {"acceptDot", OPTION_BOOLEAN},
{"isSort", OPTION_BOOLEAN},
{"version", OPTION_BOOLEAN},
{NULL, 0},
@@ -294,14 +297,23 @@
}
}
if (i == 0)
{
+ if(privateData) // PrivateData means sequence should be empty
+ return TRUE;
if (s==row)
warn("Error [file=%s, line=%d]: %s empty", file, line, name);
else
warn("Error [file=%s, line=%d]: %s empty in line [%s]", file, line, name, row);
return FALSE;
}
+else if(privateData) { // PrivateData means sequence should be empty
+ if (s==row)
+ warn("Error [file=%s, line=%d]: %s is not empty but this should be private data", file, line, name);
+ else
+ warn("Error [file=%s, line=%d]: %s is not empty but this should be private data in line [%s]", file, line, name, row);
+ return FALSE;
+ }
return TRUE;
}
boolean checkSeqName(char *file, int line, char *s, char firstChar, char *name)
@@ -561,9 +573,9 @@
boolean checkColumns(char *file, int line, char *row, char *buf, char *words[], int wordSize, int expected)
// Split buf into wordSize columns in words[] array
// Return TRUE if number of columns == expected, otherwise FALSE
{
-int n = chopByWhite(buf, words, wordSize);
+int n = chopByChar(buf, '\t', words, wordSize);
if ( n != expected)
{
warn("Error [file=%s, line=%d]: found %d columns, expected %d [%s]", file, line, n, expected, row);
return FALSE;
@@ -580,8 +592,11 @@
static char bigArr[100 * 1024]; // 100K limit on tagAlign seqLen
struct dnaSeq ourSeq;
boolean chrMSizeAjustment=FALSE;
+if(privateData) // No way to check private data
+ return TRUE;
+
if (!genome)
return TRUE; // only check if 2bit file specified
if (line % mmCheckOneInN != 0)
return TRUE; // dont check if this is not one in N
@@ -655,8 +670,10 @@
boolean checkMismatchesSeq1Seq2(char *file, int line, char *chrom, unsigned chromStart, unsigned chromEnd, char strand, char *seq1, char *seq2)
{
int i, mm1, mm2, len1, len2;
struct dnaSeq *g1, *g2;
+if(privateData) // No way to check private data
+ return TRUE;
if (!genome)
return TRUE; // dont check unless 2bit file specified
if (line % mmCheckOneInN != 0)
return TRUE; // dont check if this is not one in N
@@ -1108,8 +1125,9 @@
mismatches = optionInt("mismatches",0);
matchFirst = optionInt("matchFirst",0);
mmPerPair = optionExists("mmPerPair");
nMatch = optionExists("nMatch");
+privateData = optionExists("privateData");
isSort = optionExists("isSort");
mmCheckOneInN = optionInt("mmCheckOneInN", 1);
quick = optionExists("quick") ? optionInt("quick",QUICK_DEFAULT) : 0;
colorSpace = optionExists("colorSpace") || sameString(type, "csfasta");