src/hg/encode/validateFiles/validateFiles.c 1.23
1.23 2009/08/10 17:22:49 braney
add options to ignore N's in mismatching and to deal more efficiently with sorted files (assuming there are a lot of records)
Index: src/hg/encode/validateFiles/validateFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/validateFiles/validateFiles.c,v
retrieving revision 1.22
retrieving revision 1.23
diff -b -B -U 4 -r1.22 -r1.23
--- src/hg/encode/validateFiles/validateFiles.c 4 Aug 2009 22:43:33 -0000 1.22
+++ src/hg/encode/validateFiles/validateFiles.c 10 Aug 2009 17:22:49 -0000 1.23
@@ -1,5 +1,4 @@
-/* validateFiles - validate format of different track input files. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
@@ -24,8 +23,10 @@
boolean chrMSizeOk;
boolean printOkLines;
boolean printFailLines;
boolean mmPerPair;
+boolean nMatch;
+boolean isSort;
int quick;
struct hash *chrHash = NULL;
char dnaChars[256];
char qualChars[256];
@@ -82,8 +83,10 @@
" -mmCheckOneInN=n Check mismatches in only one in 'n' lines (default=1, all)\n"
" -printOkLines Print lines which pass validation to stdout\n"
" -quick[=N] Just test the first N lines of each file (default 1000)\n"
" -printFailLines Print lines which fail validation to stdout\n"
+ " -isSort input is sorted by chrom\n"
+ " -nMatch N's do not count as a mismatch\n"
" -version Print version\n"
, MAX_ERRORS);
}
@@ -101,8 +104,10 @@
{"mismatches", OPTION_INT},
{"mmPerPair", OPTION_BOOLEAN},
{"mmCheckOneInN", OPTION_INT},
{"quick", OPTION_INT},
+ {"nMatch", OPTION_BOOLEAN},
+ {"isSort", OPTION_BOOLEAN},
{"version", OPTION_BOOLEAN},
{NULL, 0},
};
@@ -548,34 +553,65 @@
boolean checkMismatchesSeq(char *file, int line, char *chrom, unsigned chromStart, unsigned chromEnd, char strand, char *seq)
{
int i, mm = 0;
struct dnaSeq *g;
+static struct dnaSeq *cacheSeq = NULL;
+static char cacheChrom[1024];
+static char bigArr[100 * 1024];
+struct dnaSeq ourSeq;
+
if (!genome)
return TRUE; // only check if 2bit file specified
if (line % mmCheckOneInN != 0)
return TRUE; // dont check if this is not one in N
-g = twoBitReadSeqFragLower(genome, chrom, chromStart, chromEnd);
+if (!isSort)
+ {
+ g = twoBitReadSeqFragLower(genome, chrom, chromStart, chromEnd);
+ }
+else
+ {
+ AllocVar(g);
+ if ((cacheChrom == NULL) || !sameString(chrom, cacheChrom))
+ {
+ freeDnaSeq(&cacheSeq);
+ int size = twoBitSeqSize(genome, chrom);
+ cacheSeq = twoBitReadSeqFragLower(genome, chrom, 0, size);
+ strcpy(cacheChrom, chrom);
+ printf("read in %s size %d\n",cacheChrom, size);
+ }
+ int len = chromEnd - chromStart;
+ if (len > sizeof(bigArr))
+ errAbort("static array not big enough for sequence len %d on line %d\n",
+ len, line);
+ g = &ourSeq;
+ g->dna = bigArr;
+ g->size = len;
+ memcpy(g->dna, &cacheSeq->dna[chromStart], len);
+ }
+
if (strand == '-')
reverseComplement(g->dna, g->size);
+
if (g->size != strlen(seq) || g->size != chromEnd-chromStart)
{
- warn("Error [file=%s, line=%d]: sequence length (%d) does not match genomic coords (%d / %d)",
- file, line, (int)strlen(seq), chromEnd-chromStart, g->size);
+ warn("Error [file=%s, line=%d]: sequence (%s) length (%d) does not match genomic coords (%d / %d)",
+ file, line, seq, (int)strlen(seq), chromEnd-chromStart, g->size);
return FALSE;
}
for (i=0 ; i < g->size ; ++i)
{
char c = tolower(seq[i]);
- if (c != 'n' && c != g->dna[i])
+ if (!((nMatch && c == 'n') || c == g->dna[i]))
++mm;
}
if (mm > mismatches)
{
warn("Error [file=%s, line=%d]: too many mismatches (found %d/%d, maximum is %d) (%s %d %d %c)\nseq=[%s]\ngen=[%s]\n",
file, line, mm, g->size, mismatches, chrom, chromStart, chromEnd, strand, seq, g->dna);
return FALSE;
}
-freeDnaSeq(&g);
+if (!isSort)
+ freeDnaSeq(&g);
return TRUE;
}
boolean checkMismatchesSeq1Seq2(char *file, int line, char *chrom, unsigned chromStart, unsigned chromEnd, char strand, char *seq1, char *seq2)
@@ -609,16 +645,16 @@
mm1 = 0;
for (i=0 ; i < g1->size ; ++i)
{
char c = tolower(seq1[i]);
- if (c != 'n' && c != g1->dna[i])
+ if (!((nMatch && c == 'n') || c == g1->dna[i]))
++mm1;
}
mm2 = 0;
for (i=0 ; i < g2->size ; ++i)
{
char c = tolower(seq2[i]);
- if (c != 'n' && c != g2->dna[i])
+ if (!((nMatch && c == 'n') || c == g2->dna[i]))
++mm2;
}
if (mmPerPair)
{
@@ -1022,8 +1058,10 @@
printFailLines = optionExists("printFailLines");
genome = optionExists("genome") ? twoBitOpen(optionVal("genome",NULL)) : NULL;
mismatches = optionInt("mismatches",0);
mmPerPair = optionExists("mmPerPair");
+nMatch = optionExists("nMatch");
+isSort = optionExists("isSort");
mmCheckOneInN = optionInt("mmCheckOneInN", 1);
quick = optionExists("quick") ? optionInt("quick",QUICK_DEFAULT) : 0;
colorSpace = optionExists("colorSpace") || sameString(type, "csfasta");
initArrays();