src/hg/encode/encodeValidate/doEncodeValidate.pl 1.159
1.159 2009/03/05 02:57:16 mikep
3 different flavors of rpkm; just check the cols are there
Index: src/hg/encode/encodeValidate/doEncodeValidate.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/encodeValidate/doEncodeValidate.pl,v
retrieving revision 1.158
retrieving revision 1.159
diff -b -B -U 4 -r1.158 -r1.159
--- src/hg/encode/encodeValidate/doEncodeValidate.pl 4 Mar 2009 17:55:10 -0000 1.158
+++ src/hg/encode/encodeValidate/doEncodeValidate.pl 5 Mar 2009 02:57:16 -0000 1.159
@@ -918,22 +918,37 @@
sub validateRpkm
# Wold lab format, has gene name and 2 floats
# Allowing Gene name to be composed of any characters but <tab>
-# Example lines:-
-#HBG2 0.583 1973.85
-#RPS20 0.523 1910.01
-#RPLP0 1.312 1800.51
+#
+# Example format 1 (3 cols):-
+# HBG2 0.583 1973.85
+# RPS20 0.523 1910.01
+# RPLP0 1.312 1800.51
+#
+# Example format 2 (7 cols):- (*.accepted.rpkm)
+# ENSG00000003056 chr12 8989051 8989354 2.43 303 M6PR
+# ENSG00000006015 chr19 18560887 18561077 1.10 190 C19orf60
+# ENSG00000008516 chr16 3047223 3047380 0.61 157 MMP25
+#
+# Example format 3 (5 cols): (*.final.rpkm)
+#GID gene len_kb RPKM multi/all
+# OTTHUMG00000151214 IGLC2 0.722 3579.34 0.84
+# FAR3664 FAR3664 0.200 3216.32 0.94
+# OTTHUMG00000021144 TMSB4X 3.551 2767.52 0.35
{
my ($path, $file, $type) = @_;
doTime("beginning validateRpkm") if $opt_timing;
my $lineNumber = 0;
my $fh = openUtil($path, $file);
while(<$fh>) {
chomp;
$lineNumber++;
+ next if m/^#/;
+ my $cols = split;
die "Failed $type validation, file '$file'; line $lineNumber: line=[$_]\n"
- unless m/^([^\t]+)\t(\d+\.\d+)\t(\d+\.\d+)$/;
+ unless $cols == 3 or $cols == 5 or $cols == 7;
+# unless m/^([^\t]+)\t(\d+\.\d+)\t(\d+\.\d+)$/;
last if($opt_quick && $lineNumber >= $quickCount);
}
$fh->close();
HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");