src/hg/encode/validateFiles/validateFiles.c 1.26
1.26 2009/08/19 17:58:00 braney
add matchFirst argument
Index: src/hg/encode/validateFiles/validateFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/validateFiles/validateFiles.c,v
retrieving revision 1.25
retrieving revision 1.26
diff -b -B -U 4 -r1.25 -r1.26
--- src/hg/encode/validateFiles/validateFiles.c 10 Aug 2009 21:05:29 -0000 1.25
+++ src/hg/encode/validateFiles/validateFiles.c 19 Aug 2009 17:58:00 -0000 1.26
@@ -37,8 +37,9 @@
char csSeqName[256];
char bedTypeCols[10];
struct twoBitFile *genome = NULL;
int mismatches;
+int matchFirst=0;
int mmCheckOneInN;
void usage()
/* Explain usage and exit. */
@@ -77,8 +78,9 @@
" -genome=path/to/hg18.2bit Validate tagAlign or pairedTagAlign sequences match genome\n"
" in .2bit file\n"
" -mismatches=n Maximum number of mismatches in sequence (or read pair) if \n"
" validating tagAlign or pairedTagAlign files\n"
+ " -matchFirst=n only check the first N bases of the sequence\n"
" -mmPerPair Check either pair dont exceed mismatch count if validating\n"
" pairedTagAlign files (default is the total for the pair)\n"
" -mmCheckOneInN=n Check mismatches in only one in 'n' lines (default=1, all)\n"
" -printOkLines Print lines which pass validation to stdout\n"
@@ -101,8 +103,9 @@
{"printOkLines", OPTION_BOOLEAN},
{"printFailLines", OPTION_BOOLEAN},
{"genome", OPTION_STRING},
{"mismatches", OPTION_INT},
+ {"matchFirst", OPTION_INT},
{"mmPerPair", OPTION_BOOLEAN},
{"mmCheckOneInN", OPTION_INT},
{"quick", OPTION_INT},
{"nMatch", OPTION_BOOLEAN},
@@ -566,9 +569,9 @@
int i, mm = 0;
struct dnaSeq *g;
static struct dnaSeq *cacheSeq = NULL;
static char cacheChrom[1024];
-static char bigArr[100 * 1024];
+static char bigArr[100 * 1024]; // 100K limit on tagAlign seqLen
struct dnaSeq ourSeq;
if (!genome)
return TRUE; // only check if 2bit file specified
@@ -579,9 +582,9 @@
g = twoBitReadSeqFragLower(genome, chrom, chromStart, chromEnd);
}
else
{
- AllocVar(g);
+ // read the whole chrom
if ((cacheChrom == NULL) || !sameString(chrom, cacheChrom))
{
freeDnaSeq(&cacheSeq);
int size = twoBitSeqSize(genome, chrom);
@@ -603,13 +606,19 @@
reverseComplement(g->dna, g->size);
if (g->size != strlen(seq) || g->size != chromEnd-chromStart)
{
- warn("Error [file=%s, line=%d]: sequence (%s) length (%d) does not match genomic coords (%d / %d)",
- file, line, seq, (int)strlen(seq), chromEnd-chromStart, g->size);
+ warn("Error [file=%s, line=%d]: sequence (%s) length (%d) does not match genomic coords (%d / %d - %s %d %d)",
+ file, line, seq, (int)strlen(seq), chromEnd-chromStart, g->size,
+ chrom, chromStart, chromEnd);
return FALSE;
}
-for (i=0 ; i < g->size ; ++i)
+
+int length = g->size;
+if (matchFirst && (matchFirst < length))
+ length = matchFirst;
+
+for (i=0 ; i < length; ++i)
{
char c = tolower(seq[i]);
if (checkMismatch(c, g->dna[i]))
++mm;
@@ -1068,8 +1077,9 @@
printOkLines = optionExists("printOkLines");
printFailLines = optionExists("printFailLines");
genome = optionExists("genome") ? twoBitOpen(optionVal("genome",NULL)) : NULL;
mismatches = optionInt("mismatches",0);
+matchFirst = optionInt("matchFirst",0);
mmPerPair = optionExists("mmPerPair");
nMatch = optionExists("nMatch");
isSort = optionExists("isSort");
mmCheckOneInN = optionInt("mmCheckOneInN", 1);