src/hg/encode/validateFiles/validateFiles.c 1.19
1.19 2009/07/23 22:22:22 mikep
check sequence or seq1/seq2 match the genome for tagAlign and pairedTagAlign files
Index: src/hg/encode/validateFiles/validateFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/validateFiles/validateFiles.c,v
retrieving revision 1.18
retrieving revision 1.19
diff -b -B -U 4 -r1.18 -r1.19
--- src/hg/encode/validateFiles/validateFiles.c 22 Jun 2009 21:32:44 -0000 1.18
+++ src/hg/encode/validateFiles/validateFiles.c 23 Jul 2009 22:22:22 -0000 1.19
@@ -4,8 +4,10 @@
#include "hash.h"
#include "options.h"
#include "chromInfo.h"
#include "jksql.h"
+#include "twoBit.h"
+#include "dnaseq.h"
static char const rcsid[] = "$Id$";
static char *version = "$Revision$";
@@ -21,8 +23,9 @@
boolean zeroSizeOk;
boolean chrMSizeOk;
boolean printOkLines;
boolean printFailLines;
+boolean mmPerPair;
int quick;
struct hash *chrHash = NULL;
char dnaChars[256];
char qualChars[256];
@@ -31,8 +34,11 @@
char digits[256];
char alpha[256];
char csSeqName[256];
char bedTypeCols[10];
+struct twoBitFile *genome = NULL;
+int mismatches;
+int mmCheckOneInN;
void usage()
/* Explain usage and exit. */
{
@@ -57,16 +63,24 @@
" csfasta : Colorspace fasta (implies -colorSpace) (see link below)\n"
" csqual : Colorspace quality (see link below)\n"
" (see http://marketing.appliedbiosystems.com/mk/submit/SOLID_KNOWLEDGE_RD?_JS=T&rd=dm)\n"
"\n"
- " -chromDb=db Specify DB containing chromInfo table to validate chrom names and sizes\n"
+ " -chromDb=db Specify DB containing chromInfo table to validate chrom names\n"
+ " and sizes\n"
" -chromInfo=file.txt Specify chromInfo file to validate chrom names and sizes\n"
" -colorSpace Sequences include colorspace values [0-3] (can be used \n"
" with formats such as tagAlign and pairedTagAlign)\n"
" -maxErrors=N Maximum lines with errors to report in one file before \n"
" stopping (default %d)\n"
" -zeroSizeOk For BED-type positional data, allow rows with start==end\n"
" otherwise require strictly start < end\n"
+ " -genome=path/to/hg18.2bit Validate tagAlign or pairedTagAlign sequences match genome\n"
+ " in .2bit file\n"
+ " -mismatches=n Maximum number of mismatches in sequence (or read pair) if \n"
+ " validating tagAlign or pairedTagAlign files\n"
+ " -mmPerPair Check either pair dont exceed mismatch count if validating\n"
+ " pairedTagAlign files (default is the total for the pair)\n"
+ " -mmCheckOneInN=n Check mismatches in only one in 'n' lines (default=1, all)\n"
" -printOkLines Print lines which pass validation to stdout\n"
" -quick[=N] Just test the first N lines of each file (default 1000)\n"
" -printFailLines Print lines which fail validation to stdout\n"
" -version Print version\n"
@@ -82,8 +96,12 @@
{"zeroSizeOk", OPTION_BOOLEAN},
{"chrMSizeOk", OPTION_BOOLEAN},
{"printOkLines", OPTION_BOOLEAN},
{"printFailLines", OPTION_BOOLEAN},
+ {"genome", OPTION_STRING},
+ {"mismatches", OPTION_INT},
+ {"mmPerPair", OPTION_BOOLEAN},
+ {"mmCheckOneInN", OPTION_INT},
{"quick", OPTION_INT},
{"version", OPTION_BOOLEAN},
{NULL, 0},
};
@@ -394,20 +412,23 @@
}
return TRUE;
}
-boolean checkStartEnd(char *file, int line, char *row, char *start, char *end, char *chrom, unsigned chromSize)
+boolean checkStartEnd(char *file, int line, char *row, char *start, char *end, char *chrom, unsigned chromSize, unsigned *sVal, unsigned *eVal)
// Return TRUE if start and end are both >= 0,
// and if zeroSizeOk then start <= end
// otherwise then start < end
// Also check end <= chromSize (as a special case, ignore chrM end if chrMSizeOk)
+// start and end values are returned in sVal and eVal
// Othewise print warning and return FALSE
{
verbose(3,"[%s %3d] inputLine=%d [%s..%s] (chrom=%s,size=%u) [%s]\n", __func__, __LINE__, line, start, end, chrom, chromSize, row);
unsigned s, e;
if ( !checkUnsigned(file, line, row, start, &s, "chromStart")
|| !checkUnsigned(file, line, row, end, &e, "chromEnd"))
return FALSE;
+*sVal = s;
+*eVal = e;
if (chromSize > 0)
{
if (e > chromSize && !(chrMSizeOk && sameString(chrom, "chrM")))
{
@@ -523,8 +544,103 @@
}
return TRUE;
}
+boolean checkMismatchesSeq(char *file, int line, char *chrom, unsigned chromStart, unsigned chromEnd, char strand, char *seq)
+{
+int i, mm = 0;
+struct dnaSeq *g;
+if (!genome)
+ return TRUE; // only check if 2bit file specified
+if (line % mmCheckOneInN != 0)
+ return TRUE; // dont check if this is not one in N
+g = twoBitReadSeqFragLower(genome, chrom, chromStart, chromEnd);
+if (strand == '-')
+ reverseComplement(g->dna, g->size);
+if (g->size != strlen(seq) || g->size != chromEnd-chromStart)
+ {
+ warn("Error [file=%s, line=%d]: sequence length (%d) does not match genomic coords (%d / %d)",
+ file, line, (int)strlen(seq), chromEnd-chromStart, g->size);
+ return FALSE;
+ }
+for (i=0 ; i < g->size ; ++i)
+ {
+ char c = tolower(seq[i]);
+ if (c == 'n' || c != g->dna[i])
+ ++mm;
+ }
+if (mm > mismatches)
+ {
+ warn("Error [file=%s, line=%d]: too many mismatches (found %d/%d, maximum is %d) relative to '%c' strand\nseq=[%s]\ngen=[%s]\n",
+ file, line, mm, g->size, mismatches, strand, seq, g->dna);
+ return FALSE;
+ }
+return TRUE;
+}
+
+boolean checkMismatchesSeq1Seq2(char *file, int line, char *chrom, unsigned chromStart, unsigned chromEnd, char strand, char *seq1, char *seq2)
+{
+int i, mm1, mm2, len1, len2;
+struct dnaSeq *g1, *g2;
+if (!genome)
+ return TRUE; // dont check unless 2bit file specified
+if (line % mmCheckOneInN != 0)
+ return TRUE; // dont check if this is not one in N
+len1 = strlen(seq1);
+len2 = strlen(seq2);
+if (strand == '-')
+ {
+ g1 = twoBitReadSeqFragLower(genome, chrom, chromEnd-len1, chromEnd);
+ g2 = twoBitReadSeqFragLower(genome, chrom, chromStart, chromStart+len2);
+ reverseComplement(g1->dna, g1->size);
+ reverseComplement(g2->dna, g2->size);
+ }
+else
+ {
+ g1 = twoBitReadSeqFragLower(genome, chrom, chromStart, chromStart+len1);
+ g2 = twoBitReadSeqFragLower(genome, chrom, chromEnd-len2, chromEnd);
+ }
+if (g1->size != len1 || g2->size != len2)
+ {
+ warn("Error [file=%s, line=%d]: sequence lengths (%d, %d) do not match genomic ones (%d, %d)",
+ file, line, len1, len2, g1->size, g2->size);
+ return FALSE;
+ }
+mm1 = 0;
+for (i=0 ; i < g1->size ; ++i)
+ {
+ char c = tolower(seq1[i]);
+ if (c == 'n' || c != g1->dna[i])
+ ++mm1;
+ }
+mm2 = 0;
+for (i=0 ; i < g2->size ; ++i)
+ {
+ char c = tolower(seq2[i]);
+ if (c == 'n' || c != g2->dna[i])
+ ++mm2;
+ }
+if (mmPerPair)
+ {
+ if (mm1 > mismatches || mm2 > mismatches)
+ {
+ warn("Error [file=%s, line=%d]: too many mismatches in one or both (seq1=%d/%d, seq2=%d/%d, maximum is %d) relative to '%c' strand\nseq1=[%s] seq2=[%s]\ngen1=[%s] gen2=[%s]\n",
+ file, line, mm1, len1, mm2, len2, mismatches, strand, seq1, seq2, g1->dna, g2->dna);
+ return FALSE;
+ }
+ }
+else
+ {
+ if (mm1+mm2 > mismatches)
+ {
+ warn("Error [file=%s, line=%d]: too many mismatches in pair (seq1=%d/%d, seq2=%d/%d, maximum is %d) relative to '%c' strand\nseq1=[%s] seq2=[%s]\ngen1=[%s] gen2=[%s]\n",
+ file, line, mm1, len1, mm2, len2, mismatches, strand, seq1, seq2, g1->dna, g2->dna);
+ return FALSE;
+ }
+ }
+return TRUE;
+}
+
int validateTagOrPairedTagAlign(struct lineFile *lf, char *file, boolean paired)
{
char *row;
char buf[1024];
@@ -538,23 +654,26 @@
dnaChars[(int)'.'] = 1;
verbose(2,"[%s %3d] paired=%d file(%s)\n", __func__, __LINE__, paired, file);
while (lineFileNext(lf, &row, &size))
{
+ unsigned start, end;
++line;
if (quick && line > quick)
break;
safecpy(buf, sizeof(buf), row);
if ( checkColumns(file, line, row, buf, words, TAG_WORDS, (paired ? 8 : 6))
&& checkChrom(file, line, row, words[0], &chromSize)
- && checkStartEnd(file, line, row, words[1], words[2], words[0], chromSize)
+ && checkStartEnd(file, line, row, words[1], words[2], words[0], chromSize, &start, &end)
&& checkIntBetween(file, line, row, words[4], "score", 0, 1000)
&& checkStrand(file, line, row, words[5])
&& (paired ?
(checkString(file, line, row, words[3], "name")
&& checkSeq(file, line, row, words[6], "seq1")
- && checkSeq(file, line, row, words[7], "seq2"))
+ && checkSeq(file, line, row, words[7], "seq2")
+ && checkMismatchesSeq1Seq2(file, line, words[0], start, end, *words[5], words[6], words[7]))
:
- checkSeq(file, line, row, words[3], "sequence")
+ (checkSeq(file, line, row, words[3], "sequence")
+ && checkMismatchesSeq(file, line, words[0], start, end, *words[5], words[3]))
) )
{
if (printOkLines)
printf("%s\n", row);
@@ -601,14 +720,15 @@
verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, file);
while (lineFileNextReal(lf, &row))
{
++line;
+ unsigned start, end;
if (quick && line > quick)
break;
safecpy(buf, sizeof(buf), row);
if ( checkColumns(file, line, row, buf, words, PEAK_WORDS, bedTypeCols[type])
&& checkChrom(file, line, row, words[0], &chromSize)
- && checkStartEnd(file, line, row, words[1], words[2], words[0], chromSize)
+ && checkStartEnd(file, line, row, words[1], words[2], words[0], chromSize, &start, &end)
&& ( type == BED_GRAPH ?
(checkFloat(file, line, row, words[3], "value")) // canonical bedGraph has float in 4th column
: // otherwise BROAD_, NARROW_, or GAPPED_PEAK
(checkString(file, line, row, words[3], "name")
@@ -896,8 +1016,12 @@
zeroSizeOk = optionExists("zeroSizeOk");
chrMSizeOk = optionExists("chrMSizeOk");
printOkLines = optionExists("printOkLines");
printFailLines = optionExists("printFailLines");
+genome = optionExists("genome") ? twoBitOpen(optionVal("genome",NULL)) : NULL;
+mismatches = optionInt("mismatches",0);
+mmPerPair = optionExists("mmPerPair");
+mmCheckOneInN = optionInt("mmCheckOneInN", 1);
quick = optionExists("quick") ? optionInt("quick",QUICK_DEFAULT) : 0;
colorSpace = optionExists("colorSpace") || sameString(type, "csfasta");
initArrays();
// Get chromInfo from DB or file