src/hg/encode/encodeValidate/doEncodeValidate.pl 1.143
1.143 2009/02/13 11:08:33 mikep
validation rules for new Wold download types
Index: src/hg/encode/encodeValidate/doEncodeValidate.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/encodeValidate/doEncodeValidate.pl,v
retrieving revision 1.142
retrieving revision 1.143
diff -b -B -U 4 -r1.142 -r1.143
--- src/hg/encode/encodeValidate/doEncodeValidate.pl 13 Feb 2009 04:19:13 -0000 1.142
+++ src/hg/encode/encodeValidate/doEncodeValidate.pl 13 Feb 2009 11:08:33 -0000 1.143
@@ -851,49 +851,119 @@
doTime("done validateCsqual", $line) if $opt_timing;
return ();
}
-sub validateRpkm
+sub validateFasta
{
+ # Wold lab has fasta files, like fastq format without quality
my ($path, $file, $type) = @_;
- doTime("beginning validateRpkm") if $opt_timing;
-# my $fh = openUtil($path, $file);
+ doTime("beginning validateFasta") if $opt_timing;
+ HgAutomate::verbose(2, "validateFasta($path,$file,$type)\n");
+ return () if $opt_skipValidateFastQ;
+ doTime("beginning validateFasta") if $opt_timing;
+ my $fh = openUtil($path, $file);
my $line = 0;
+ my $state = 'firstLine';
+ my $seqName;
+ my $seqNameRegEx = "[A-Za-z0-9_.:/-]+";
+ my $seqRegEx = "[A-Za-z\n\.~]+";
+ my $states = {firstLine => {REGEX => "\@($seqNameRegEx)", NEXT => 'seqLine'},
+ seqLine => {REGEX => $seqRegEx, NEXT => 'firstLine'}};
+ while(<$fh>) {
+ chomp;
+ $line++;
+ my $errorPrefix = "Invalid $type file; line $line in file '$file' is invalid [validateFasta]";
+ my $regex = $states->{$state}{REGEX};
+ if(/^${regex}$/) {
+ $state = $states->{$state}{NEXT};
+ } else {
+ return("$errorPrefix (expecting $state):\nline: $_");
+ }
+ last if($opt_quick && $line >= $quickCount);
+ }
+ $fh->close();
HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
- doTime("done validateRpkm", $line) if $opt_timing;
+ doTime("done validateFasta", $line) if $opt_timing;
return ();
}
-sub validateFasta
+sub validateRpkm
+# Wold lab format, has name and 2 floats
+# Example lines:-
+#HBG2 0.583 1973.85
+#RPS20 0.523 1910.01
+#RPLP0 1.312 1800.51
{
my ($path, $file, $type) = @_;
- doTime("beginning validateFasta") if $opt_timing;
-# my $fh = openUtil($path, $file);
- my $line = 0;
+ doTime("beginning validateRpkm") if $opt_timing;
+ my $lineNumber = 0;
+ my $fh = openUtil($path, $file);
+ while(<$fh>) {
+ chomp;
+ $lineNumber++;
+ die "Failed $type validation, file '$file'; line $lineNumber: line=[$_]\n"
+ unless m/^(\w+)\t(\d+\.\d+)\t(\d+\.\d+)$/;
+ last if($opt_quick && $lineNumber >= $quickCount);
+ }
+ $fh->close();
HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
- doTime("done validateFasta", $line) if $opt_timing;
+ doTime("done validateRpkm", $lineNumber) if $opt_timing;
return ();
}
sub validateBowtie
+# Unkown format (for download) from Wold lab.
+# Sample line:-
+# HWI-EAS229_75_30DY0AAXX:7:1:0:1545/1 + chr1 5983615 NCGTCCATCTCACATCGTCAGGAAAGGGGGAAGCACTGGATGGCTGTGGCCTCACAGGCAGGGAGAGTGGGGTCC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII 0 0:G>N
{
my ($path, $file, $type) = @_;
doTime("beginning validateBowtie") if $opt_timing;
-# my $fh = openUtil($path, $file);
- my $line = 0;
+ my $lineNumber = 0;
+ doTime("beginning validateBedGraph") if $opt_timing;
+ my $fh = openUtil($path, $file);
+ while(<$fh>) {
+ chomp;
+ $lineNumber++;
+ next if m/^#/; # allow comment lines, consistent with lineFile and hgLoadBed
+ die "Failed bowtie validation, file '$file'; line $lineNumber: line=[$_]\n"
+ unless $_ =~ m/^([A-Za-z0-9:>_\/-]+)\t([+-])\t(\w+)\t(\d+)\t(\w+)\t(\w+)\t(\d+)\t([A-Za-z0-9:>_\/-]+)$/;
+ last if($opt_quick && $lineNumber >= $quickCount);
+ }
+ $fh->close();
HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
- doTime("done validateBowtie", $line) if $opt_timing;
+ doTime("done validateBowtie", $lineNumber) if $opt_timing;
return ();
}
sub validatePsl
+# PSL format (for download) from Wold lab.
+# Sample first 6 lines
+#psLayout version 3
+#
+#match mis- rep. N's Q gap Q gap T gap T gap strand Q Q Q Q T T T T block blockSizes qStarts tStarts
+# match match count bases count bases name size start end name size start end count
+#---------------------------------------------------------------------------------------------------------------------------------------------------------------
+#71 3 0 0 0 0 0 0 - HWI-EAS229_75_30DY0AAXX:4:1:0:743/1 75 1 75 chr2 242951149 184181032 184181106 1 74, 0, 184181032, agccttttacagcaacacctttacctctgctagatctttctgtagctcgtctgaagccatgggggctgggtcag, agccttttccagcaacacctttacctcttctagatctttctgtagctcttctgaagccatgggggctgggtcag,
{
my ($path, $file, $type) = @_;
+ my $lineNumber = 0;
doTime("beginning validatePsl") if $opt_timing;
-# my $fh = openUtil($path, $file);
- my $line = 0;
+ my $fh = openUtil($path, $file);
+ while(<$fh>) {
+ chomp;
+ $lineNumber++;
+ next if $lineNumber == 1 and m/^psLayout version \d+/; # check first line
+ next if $lineNumber == 2 and m/^$/;
+ next if $lineNumber == 3 and m/^match/;
+ next if $lineNumber == 4 and m/^\s+match/;
+ next if $lineNumber == 5 and m/^------/;
+ die "Failed $type validation, file '$file'; line $lineNumber: line=[$_]\n"
+ unless m/^(\d+)\t(\d+)\t(\d+)\t(\d+)(\d+)\t(\d+)\t(\d+)\t(\d+)\t([+-])\t([A-Za-z0-9:>\/_-]+)\t(\d+)\t(\d+)\t(\d+)\t(\w+)\t(\d+)\t(\d+)\t(\d+)$/;
+ last if($opt_quick && $lineNumber >= $quickCount);
+ }
+ $fh->close();
HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
- doTime("done validatePsl", $line) if $opt_timing;
+ doTime("done validatePsl", $lineNumber) if $opt_timing;
return ();
}