src/hg/encode/validateFiles/validateFiles.c 1.23

1.23 2009/08/10 17:22:49 braney
add options to ignore N's in mismatching and to deal more efficiently with sorted files (assuming there are a lot of records)
Index: src/hg/encode/validateFiles/validateFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/validateFiles/validateFiles.c,v
retrieving revision 1.22
retrieving revision 1.23
diff -b -B -U 4 -r1.22 -r1.23
--- src/hg/encode/validateFiles/validateFiles.c	4 Aug 2009 22:43:33 -0000	1.22
+++ src/hg/encode/validateFiles/validateFiles.c	10 Aug 2009 17:22:49 -0000	1.23
@@ -1,5 +1,4 @@
-/* validateFiles - validate format of different track input files. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
@@ -24,8 +23,10 @@
 boolean chrMSizeOk;
 boolean printOkLines;
 boolean printFailLines;
 boolean mmPerPair;
+boolean nMatch;
+boolean isSort;
 int quick;
 struct hash *chrHash = NULL;
 char dnaChars[256];
 char qualChars[256];
@@ -82,8 +83,10 @@
   "   -mmCheckOneInN=n             Check mismatches in only one in 'n' lines (default=1, all)\n"
   "   -printOkLines                Print lines which pass validation to stdout\n"
   "   -quick[=N]                   Just test the first N lines of each file (default 1000)\n"
   "   -printFailLines              Print lines which fail validation to stdout\n"
+  "   -isSort                      input is sorted by chrom\n"
+  "   -nMatch                      N's do not count as a mismatch\n"
   "   -version                     Print version\n"
   , MAX_ERRORS);
 }
 
@@ -101,8 +104,10 @@
    {"mismatches", OPTION_INT},
    {"mmPerPair", OPTION_BOOLEAN},
    {"mmCheckOneInN", OPTION_INT},
    {"quick", OPTION_INT},
+   {"nMatch", OPTION_BOOLEAN},
+   {"isSort", OPTION_BOOLEAN},
    {"version", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
@@ -548,34 +553,65 @@
 boolean checkMismatchesSeq(char *file, int line, char *chrom, unsigned chromStart, unsigned chromEnd, char strand, char *seq)
 {
 int i, mm = 0;
 struct dnaSeq *g;
+static struct dnaSeq *cacheSeq = NULL;
+static char cacheChrom[1024];
+static char bigArr[100 * 1024];
+struct dnaSeq ourSeq;
+
 if (!genome)
     return TRUE; // only check if 2bit file specified
 if (line % mmCheckOneInN != 0)
     return TRUE; // dont check if this is not one in N
-g = twoBitReadSeqFragLower(genome, chrom, chromStart, chromEnd);
+if (!isSort)
+    {
+    g = twoBitReadSeqFragLower(genome, chrom, chromStart, chromEnd);
+    }
+else
+    {
+    AllocVar(g);
+    if ((cacheChrom == NULL) || !sameString(chrom, cacheChrom))
+	{
+	freeDnaSeq(&cacheSeq);
+	int size =  twoBitSeqSize(genome, chrom);
+	cacheSeq = twoBitReadSeqFragLower(genome, chrom, 0, size);
+	strcpy(cacheChrom, chrom);
+	printf("read in %s size %d\n",cacheChrom, size);
+	}
+    int len = chromEnd - chromStart;
+    if (len > sizeof(bigArr))
+	errAbort("static array not big enough for sequence len %d on line %d\n",
+	    len, line);
+    g = &ourSeq;
+    g->dna = bigArr;
+    g->size = len;
+    memcpy(g->dna, &cacheSeq->dna[chromStart], len);
+    }
+
 if (strand == '-')
     reverseComplement(g->dna, g->size);
+
 if (g->size != strlen(seq) || g->size != chromEnd-chromStart)
     {
-    warn("Error [file=%s, line=%d]: sequence length (%d) does not match genomic coords (%d / %d)", 
-         file, line, (int)strlen(seq), chromEnd-chromStart, g->size);
+    warn("Error [file=%s, line=%d]: sequence (%s) length (%d) does not match genomic coords (%d / %d)", 
+         file, line, seq, (int)strlen(seq), chromEnd-chromStart, g->size);
     return FALSE;
     }
 for (i=0 ; i < g->size ; ++i)
     {
     char c = tolower(seq[i]);
-    if (c != 'n' && c != g->dna[i])
+    if (!((nMatch && c == 'n') || c == g->dna[i]))
         ++mm;
     }
 if (mm > mismatches)
     {
     warn("Error [file=%s, line=%d]: too many mismatches (found %d/%d, maximum is %d) (%s %d %d %c)\nseq=[%s]\ngen=[%s]\n", 
          file, line, mm, g->size, mismatches, chrom, chromStart, chromEnd, strand, seq, g->dna);
     return FALSE;
     }
-freeDnaSeq(&g);
+if (!isSort)
+    freeDnaSeq(&g);
 return TRUE;
 }
 
 boolean checkMismatchesSeq1Seq2(char *file, int line, char *chrom, unsigned chromStart, unsigned chromEnd, char strand, char *seq1, char *seq2)
@@ -609,16 +645,16 @@
 mm1 = 0;
 for (i=0 ; i < g1->size ; ++i)
     {
     char c = tolower(seq1[i]);
-    if (c != 'n' && c != g1->dna[i])
+    if (!((nMatch && c == 'n') || c == g1->dna[i]))
         ++mm1;
     }
 mm2 = 0;
 for (i=0 ; i < g2->size ; ++i)
     {
     char c = tolower(seq2[i]);
-    if (c != 'n' && c != g2->dna[i])
+    if (!((nMatch && c == 'n') || c == g2->dna[i]))
         ++mm2;
     }
 if (mmPerPair)
     {
@@ -1022,8 +1058,10 @@
 printFailLines = optionExists("printFailLines");
 genome         = optionExists("genome") ? twoBitOpen(optionVal("genome",NULL)) : NULL;
 mismatches     = optionInt("mismatches",0);
 mmPerPair      = optionExists("mmPerPair");
+nMatch         = optionExists("nMatch");
+isSort         = optionExists("isSort");
 mmCheckOneInN  = optionInt("mmCheckOneInN", 1);
 quick          = optionExists("quick") ? optionInt("quick",QUICK_DEFAULT) : 0;
 colorSpace     = optionExists("colorSpace") || sameString(type, "csfasta");
 initArrays();