src/hg/encode/encodeValidate/doEncodeValidate.pl 1.157

1.157 2009/02/20 20:44:22 mikep
added comment about quality scores
Index: src/hg/encode/encodeValidate/doEncodeValidate.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/encodeValidate/doEncodeValidate.pl,v
retrieving revision 1.156
retrieving revision 1.157
diff -b -B -U 4 -r1.156 -r1.157
--- src/hg/encode/encodeValidate/doEncodeValidate.pl	19 Feb 2009 22:33:28 -0000	1.156
+++ src/hg/encode/encodeValidate/doEncodeValidate.pl	20 Feb 2009 20:44:22 -0000	1.157
@@ -745,8 +745,13 @@
     # I added '/' in the seqNameRegEx and plusLine even though it wasnt in the spec
     #   because this is what Colin Kingswood (Gingeras project)
     #   is getting in the fastq files from GIS for the GisPet project
     #   and they are being sent on to us
+    # Note on "FASTQ Quality scores":-   http://maq.sourceforge.net/qual.shtml
+    # Fastq has 2 different semantics for the score field.
+    # - fastq produced directly from Solexa has a 'solexa' quality score
+    # - fastq defined by Sanger has a 'PHRED' quality score
+    # - The URL above shows how to convert between the two
     my ($path, $file, $type) = @_;
     HgAutomate::verbose(2, "validateFastQ($path,$file,$type)\n");
     return () if $opt_skipValidateFastQ;
     doTime("beginning validateFastQ") if $opt_timing;
@@ -755,9 +760,9 @@
     my $state = 'firstLine';
     my $seqName;
     my $seqNameRegEx = "[A-Za-z0-9_.:/-]+";
     my $seqRegEx = "[A-Za-z\n\.~]+";
-    my $qualRegEx = "[!-~\n]+";
+    my $qualRegEx = "[!-~\n]+"; # ord(!)=33, ord(~)=126
     my $states = {firstLine => {REGEX => "\@($seqNameRegEx)", NEXT => 'seqLine'},
                   seqLine => {REGEX => $seqRegEx, NEXT => 'plusLine'},
                   plusLine => {REGEX => "\\\+([A-Za-z0-9_.:/-]*)", NEXT => 'qualLine'},
                   qualLine => {REGEX => $qualRegEx, NEXT => 'firstLine'}};