src/hg/encode/encodeValidate/doEncodeValidate.pl 1.157
1.157 2009/02/20 20:44:22 mikep
added comment about quality scores
Index: src/hg/encode/encodeValidate/doEncodeValidate.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/encodeValidate/doEncodeValidate.pl,v
retrieving revision 1.156
retrieving revision 1.157
diff -b -B -U 4 -r1.156 -r1.157
--- src/hg/encode/encodeValidate/doEncodeValidate.pl 19 Feb 2009 22:33:28 -0000 1.156
+++ src/hg/encode/encodeValidate/doEncodeValidate.pl 20 Feb 2009 20:44:22 -0000 1.157
@@ -745,8 +745,13 @@
# I added '/' in the seqNameRegEx and plusLine even though it wasn't in the spec
# because this is what Colin Kingswood (Gingeras project)
# is getting in the fastq files from GIS for the GisPet project
# and they are being sent on to us
+ # Note on FASTQ quality scores: see http://maq.sourceforge.net/qual.shtml
+ # FASTQ has 2 different semantics for the score field:
+ # - FASTQ produced directly from Solexa has a 'solexa' quality score
+ # - FASTQ defined by Sanger has a 'PHRED' quality score
+ # - The URL above shows how to convert between the two
my ($path, $file, $type) = @_;
HgAutomate::verbose(2, "validateFastQ($path,$file,$type)\n");
return () if $opt_skipValidateFastQ;
doTime("beginning validateFastQ") if $opt_timing;
@@ -755,9 +760,9 @@
my $state = 'firstLine';
my $seqName;
my $seqNameRegEx = "[A-Za-z0-9_.:/-]+";
my $seqRegEx = "[A-Za-z\n\.~]+";
- my $qualRegEx = "[!-~\n]+";
+ my $qualRegEx = "[!-~\n]+"; # ord(!)=33, ord(~)=126
my $states = {firstLine => {REGEX => "\@($seqNameRegEx)", NEXT => 'seqLine'},
seqLine => {REGEX => $seqRegEx, NEXT => 'plusLine'},
plusLine => {REGEX => "\\\+([A-Za-z0-9_.:/-]*)", NEXT => 'qualLine'},
qualLine => {REGEX => $qualRegEx, NEXT => 'firstLine'}};