src/hg/encode/validateFiles/validateFiles.c 1.26

1.26 2009/08/19 17:58:00 braney
add matchFirst argument
Index: src/hg/encode/validateFiles/validateFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/validateFiles/validateFiles.c,v
retrieving revision 1.25
retrieving revision 1.26
diff -b -B -U 4 -r1.25 -r1.26
--- src/hg/encode/validateFiles/validateFiles.c	10 Aug 2009 21:05:29 -0000	1.25
+++ src/hg/encode/validateFiles/validateFiles.c	19 Aug 2009 17:58:00 -0000	1.26
@@ -37,8 +37,9 @@
 char csSeqName[256];
 char bedTypeCols[10];
 struct twoBitFile *genome = NULL;
 int mismatches;
+int matchFirst=0;
 int mmCheckOneInN;
 
 void usage()
 /* Explain usage and exit. */
@@ -77,8 +78,9 @@
   "   -genome=path/to/hg18.2bit    Validate tagAlign or pairedTagAlign sequences match genome\n"
   "                                  in .2bit file\n"
   "   -mismatches=n                Maximum number of mismatches in sequence (or read pair) if \n"
   "                                  validating tagAlign or pairedTagAlign files\n"
+  "   -matchFirst=n                only check the first N bases of the sequence\n"
   "   -mmPerPair                   Check either pair dont exceed mismatch count if validating\n"
   "                                  pairedTagAlign files (default is the total for the pair)\n"
   "   -mmCheckOneInN=n             Check mismatches in only one in 'n' lines (default=1, all)\n"
   "   -printOkLines                Print lines which pass validation to stdout\n"
@@ -101,8 +103,9 @@
    {"printOkLines", OPTION_BOOLEAN},
    {"printFailLines", OPTION_BOOLEAN},
    {"genome", OPTION_STRING},
    {"mismatches", OPTION_INT},
+   {"matchFirst", OPTION_INT},
    {"mmPerPair", OPTION_BOOLEAN},
    {"mmCheckOneInN", OPTION_INT},
    {"quick", OPTION_INT},
    {"nMatch", OPTION_BOOLEAN},
@@ -566,9 +569,9 @@
 int i, mm = 0;
 struct dnaSeq *g;
 static struct dnaSeq *cacheSeq = NULL;
 static char cacheChrom[1024];
-static char bigArr[100 * 1024];
+static char bigArr[100 * 1024]; // 100K limit on tagAlign seqLen
 struct dnaSeq ourSeq;
 
 if (!genome)
     return TRUE; // only check if 2bit file specified
@@ -579,9 +582,9 @@
     g = twoBitReadSeqFragLower(genome, chrom, chromStart, chromEnd);
     }
 else
     {
-    AllocVar(g);
+    // read the whole chrom
     if ((cacheChrom == NULL) || !sameString(chrom, cacheChrom))
 	{
 	freeDnaSeq(&cacheSeq);
 	int size =  twoBitSeqSize(genome, chrom);
@@ -603,13 +606,19 @@
     reverseComplement(g->dna, g->size);
 
 if (g->size != strlen(seq) || g->size != chromEnd-chromStart)
     {
-    warn("Error [file=%s, line=%d]: sequence (%s) length (%d) does not match genomic coords (%d / %d)", 
-         file, line, seq, (int)strlen(seq), chromEnd-chromStart, g->size);
+    warn("Error [file=%s, line=%d]: sequence (%s) length (%d) does not match genomic coords (%d / %d - %s %d %d)", 
+         file, line, seq, (int)strlen(seq), chromEnd-chromStart, g->size,
+	 chrom, chromStart, chromEnd);
     return FALSE;
     }
-for (i=0 ; i < g->size ; ++i)
+
+int length = g->size;
+if (matchFirst && (matchFirst < length))
+    length = matchFirst;
+
+for (i=0 ; i < length; ++i)
     {
     char c = tolower(seq[i]);
     if (checkMismatch(c,  g->dna[i]))
         ++mm;
@@ -1068,8 +1077,9 @@
 printOkLines   = optionExists("printOkLines");
 printFailLines = optionExists("printFailLines");
 genome         = optionExists("genome") ? twoBitOpen(optionVal("genome",NULL)) : NULL;
 mismatches     = optionInt("mismatches",0);
+matchFirst     = optionInt("matchFirst",0);
 mmPerPair      = optionExists("mmPerPair");
 nMatch         = optionExists("nMatch");
 isSort         = optionExists("isSort");
 mmCheckOneInN  = optionInt("mmCheckOneInN", 1);