src/hg/encode/encodeDownloadsPage/encodeDownloadsPage.pl 1.7
1.7 2009/04/22 23:57:17 tdreszer
Support for metadata setting from trackDb
Index: src/hg/encode/encodeDownloadsPage/encodeDownloadsPage.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/encodeDownloadsPage/encodeDownloadsPage.pl,v
retrieving revision 1.6
retrieving revision 1.7
diff -b -B -U 1000000 -r1.6 -r1.7
--- src/hg/encode/encodeDownloadsPage/encodeDownloadsPage.pl 20 Mar 2009 21:46:03 -0000 1.6
+++ src/hg/encode/encodeDownloadsPage/encodeDownloadsPage.pl 22 Apr 2009 23:57:17 -0000 1.7
@@ -1,254 +1,366 @@
#!/usr/bin/env perl
# hgEncodeDownloads.pl - generates a webpage for all tgz downloadable files in a directory.
# The first portion of the file name (delimieted by '.') should be the
# corresponding tableName in order to look up the dateReleased in trackDb.
# Called by automated submission pipeline
#
# $Header$
use warnings;
use strict;
use File::stat;
use File::Basename;
use Getopt::Long;
use Time::Local;
use English;
use Carp qw(cluck);
use Cwd;
use IO::File;
use File::Basename;
use lib "/cluster/bin/scripts";
use Encode;
use HgAutomate;
use HgDb;
use vars qw/
$opt_fileType
$opt_preamble
$opt_db
$opt_verbose
/;
sub usage {
print STDERR <<END;
usage: hgEncodeDownloads.pl {index.html} [downloads-dir]
Creates the an HTML page listing the downloads in the current directory or optional directory
options:
-help Displays this usage info
-preamble=file File containing introductory information (written in HTML) that will be included in this file (default preamble.html)
-db=hg18 Use a database other than the default hg18 (For aquiring releaseDate and metadata from trackDb)
-fileType=mask mask for file types included (default '*.gz')
-verbose=num Set verbose level to num (default 1).
END
exit 1;
}
sub htmlStartPage {
# prints the opening of the html downloads index.html file
#local *OUT_FILE = shift;
local *OUT_FILE = shift;
my ($filePath,$fileName,$preamble) = @_;
print OUT_FILE "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 3.2 Final//EN\">\n";
print OUT_FILE "<HTML>\n";
print OUT_FILE " <HEAD>\n";
print OUT_FILE " <TITLE>Index of $filePath</TITLE>\n";
print OUT_FILE " </HEAD>\n";
print OUT_FILE " <BODY>\n";
print OUT_FILE "<IMG SRC=\"/icons/back.gif\" ALT=\"[DIR]\"> <A HREF=\"../\">Parent Directory</A><BR>\n\n";
my $hasPreamble = 0;
if($preamble && length($preamble) > 0) { # TODO check that prolog exists
if(open( PREAMBLE, "$filePath/$preamble")) {
$hasPreamble = 1;
print OUT_FILE <PREAMBLE>;
close(PREAMBLE);
}
}
if(!$hasPreamble) {
print OUT_FILE "<p>This directory contains data files available for download.</p>\n\n";
}
print OUT_FILE "<HR>\n\n";
}
sub htmlEndPage {
# prints closing of the html downloads index.html file
local *OUT_FILE = shift;
print OUT_FILE "\n<HR>\n";
print OUT_FILE "</BODY></HTML>\n";
}
sub htmlTableRow {
# prints a row closing of an html table
local *OUT_FILE = shift;
my ($file,$size,$date,$releaseDate,$metaData) = @_;
if($releaseDate && length($releaseDate) > 0) {
- print OUT_FILE "<TR><TD align='left'> $releaseDate\n";
+ print OUT_FILE "<TR valign='top'><TD align='left'> $releaseDate\n";
} else {
- print OUT_FILE "<TR><TD align='left'> \n";
+ print OUT_FILE "<TR valign='top'><TD align='left'> \n";
}
print OUT_FILE "<TD><A type='application/zip' target='_blank' HREF=\"$file\">$file</A>\n";
print OUT_FILE "<TD align='right'> $size<TD> $date \n";
if($metaData && length($metaData) > 0) {
- print OUT_FILE "<TD>$metaData\n";
+ print OUT_FILE "<TD nowrap>$metaData\n";
}
print OUT_FILE "</TR>\n";
}
+
+sub metadataLineToArrays {
+# Parses a metadata setting line of tag=value pairs (a value may be double-quoted and may contain backslash-escaped quotes) into two parallel arrays, retaining order; returns references to the tags array and the vals array
+ my ($line) = @_;
+
+ my @tags;
+ my @vals;
+ my $tix = 0;
+ while($line && length($line)>0) {
+ my $tag;
+ my $val;
+ ( $tag,$line ) = split(/=/, $line,2);
+ $tag =~ s/\s+//g;
+ my @chars = split(//,$line);
+ if($chars[0] ne "\"") {
+ ( $val,$line ) = split(/\s+/, $line,2);
+ } else {
+ my $ix=1;
+ while($ix < length($line) && ($chars[$ix] ne '"' || $chars[$ix - 1] eq '\\')) { # Find next " skipping escaped \"
+ $ix++;
+ }
+ if($ix < length($line)) {
+ $val = substr($line,1,$ix - 1);
+ $line = substr($line, $ix + 1);
+ $val =~ s/\\"/\"/g;
+ } else {
+ $val = $line;
+ $line = "";
+ }
+ }
+ $tags[$tix ] = $tag;
+ $vals[$tix++] = $val;
+ }
+ return ( \@tags, \@vals );
+}
+
+sub metadataArraysRemoveHash {
+# Drops from the metadata arrays every tag (and its matching value) that is a true-valued key of the remove hash; returns references to the filtered tags and vals arrays
+ my ($tagsRef,$valsRef,$removeRef) = @_;
+ my @tags = @{$tagsRef};
+ my @vals = @{$valsRef};
+ my %remove = %{$removeRef};
+
+ my @tagsOk;
+ my @valsOk;
+ my $nix = 0;
+ my $oix = 0;
+ while($tags[$oix]) {
+ if($remove{$tags[$oix]}) {
+ $oix++;
+ } else {
+ $tagsOk[$nix ] = $tags[$oix ];
+ $valsOk[$nix++] = $vals[$oix++];
+ }
+ }
+ return ( \@tagsOk, \@valsOk );
+}
+
+sub metadataArraysJoin {
+# Joins each tag with its value into a tag=value string and returns the resulting list, in the same order as the input arrays
+ my ($tagsRef,$valsRef) = @_;
+ my @tags = @{$tagsRef};
+ my @vals = @{$valsRef};
+ my @meta;
+ my $tix = 0;
+ while($tags[$tix]) {
+ $meta[$tix] = join("=",($tags[$tix],$vals[$tix]));
+ $tix++;
+ }
+ return @meta;
+}
+
############################################################################
# Main
# Get ls -hog of directory ( working dir or -dir )
#ls -hog --time-style=long-iso *.gz
#-rw-rw-r-- 1 101M 2008-10-27 14:34 wgEncodeYaleChIPseqSignalK562Pol2.wig.gz
my $now = time();
my $ok = GetOptions("fileType=s",
"preamble=s",
"db=s",
"verbose=i",
);
usage() if (!$ok);
$opt_verbose = 1 if (!defined $opt_verbose);
my $fileMask = "*.gz";
$fileMask = $opt_fileType if(defined $opt_fileType);
my $preamble = "preamble.html";
$preamble = $opt_preamble if(defined $opt_preamble);
usage() if (scalar(@ARGV) < 1);
# Get command-line args
my $indexHtml = $ARGV[0];
my $downloadsDir = cwd();
$downloadsDir = $ARGV[1] if (scalar(@ARGV) > 1);
# Now find some files
my @fileList = `ls -hog --time-style=long-iso $downloadsDir/$fileMask 2> /dev/null`;
# -rw-rw-r-- 1 101M 2008-10-27 14:34 /usr/local/apache/htdocs/goldenPath/hg18/wgEncodeYaleChIPseq/wgEncodeYaleChIPseqSignalK562Pol2.wig.gz
if(length(@fileList) == 0) {
die ("ERROR; No files were found in \'$downloadsDir\' that match \'$fileMask\'.\n");
}
# TODO determine whether we should even look this up
$opt_db = "hg18" if(!defined $opt_db);
my $db = HgDb->new(DB => $opt_db);
open( OUT_FILE, "> $downloadsDir/$indexHtml") || die "SYS ERROR: Can't write to \'$downloadsDir/$indexHtml\' file; error: $!\n";
#print OUT_FILE @fileList;
my @readme;
if(open(README, "$downloadsDir/README.txt")) {
@readme = <README>;
close(README);
}
# Start the page
htmlStartPage(*OUT_FILE,$downloadsDir,$indexHtml,$preamble);
print OUT_FILE "<TABLE cellspacing=0>\n";
print OUT_FILE "<TR valign='bottom'><TH>RESTRICTED<BR>until<TH align='left'> File<TH align='right'>Size<TH>Submitted<TH align='left'> Details</TR>\n";
for my $line (@fileList) {
chomp $line;
my @file = split(/\s+/, $line); # White space
#print OUT_FILE join ' ', @file,"\n";
my @path = split('/', $file[5]); # split on /
my $fileName = $path[$#path]; # Last element
my ($tableName,$dataType) = split('\.', $fileName); # tableName I hope
#print OUT_FILE "Path:$file[5] File:$fileName Table:$tableName\n";
my $releaseDate = "";
my %metaData;
my $typePrefix = "";
my $results = $db->quickQuery("select type from trackDb where tableName = '$tableName'");
if($results) {
my ($type) = split(/\s+/, $results); # White space
$metaData{type} = $type;
+ } else {
+ if($dataType eq "fastq" || $dataType eq "tagAlign" || $dataType eq "csfasta" || $dataType eq "csqual") {
+ $metaData{type} = $dataType;
+ }
+ }
$results = $db->quickQuery("select settings from trackDb where tableName = '$tableName'");
-#print OUT_FILE "Settings:$results\n";
- if( $results ) {
- my @settings = split(/\n/, $results); # New Line
- while (@settings) {
- my @pair = split(/\s+/, (shift @settings),3);
-#print OUT_FILE "Settings:". join('=',@pair) . "\n";
- if($pair[0] eq "subTrack") {
- my $parentSettings = $db->quickQuery("select settings from trackDb where tableName = '$pair[1]'");
- if( $parentSettings ) {
- my @parent = split(/\n/, $parentSettings); # New Line
- while (@parent) {
- my @par = split(/\s+/, (shift @parent));
- if( $par[0] eq "wgEncode" ) {
- $typePrefix = "ENCODE ";
+ if(!$results) {
+ ### TODO: This needs to be replaced with a lookup of fileDb.ra
+ my $associatedTable = $tableName;
+ $associatedTable =~ s/RawData/RawSignal/ if $tableName =~ /RawData/;
+ $associatedTable =~ s/Alignments/RawSignal/ if $tableName =~ /Alignments/;
+ if($tableName ne $associatedTable) {
+ $results = $db->quickQuery("select settings from trackDb where tableName = '$associatedTable'");
+ if($results) {
+ $metaData{parent} = "RawData→RawSignal" if $tableName =~ /RawData/;
+ $metaData{parent} = "Alignments→RawSignal" if $tableName =~ /Alignments/;
}
}
}
- } elsif($pair[0] eq "releaseDate") {
+ if( $results ) {
+ my @settings = split(/\n/, $results); # New Line
+ while (@settings) {
+ my @pair = split(/\s+/, (shift @settings),2);
+ if($pair[0] eq "releaseDate" && length($releaseDate) == 0) {
$releaseDate = $pair[1];
} elsif($pair[0] eq "dateSubmitted" && length($releaseDate) == 0) {
my ($YYYY,$MM,$DD) = split('-',$pair[1]);
$MM = $MM - 1;
my (undef, undef, undef, $rMDay, $rMon, $rYear) = Encode::restrictionDate(timelocal(0,0,0,$DD,$MM,$YYYY));
$rMon = $rMon + 1;
$releaseDate = sprintf("%04d-%02d-%02d", (1900 + $rYear),$rMon,$rMDay);
} elsif($pair[0] eq "cell" || $pair[0] eq "antibody" || $pair[0] eq "promoter") {
$metaData{$pair[0]} = $pair[1];
+ } elsif($pair[0] eq "metadata") {
+ # Use metadata setting with priority ### TODO Need to do this for files
+ my ( $tagRef, $valRef ) = metadataLineToArrays($pair[1]);
+ my @tags = @{$tagRef};
+ my @vals = @{$valRef};
+ my $tix = 0;
+ while($tags[$tix]) {
+ if($tags[$tix] eq "dateUnrestricted") {
+ $releaseDate = $vals[$tix];
+ } elsif($tags[$tix] eq "dateSubmitted" && length($releaseDate) == 0) {
+ my ($YYYY,$MM,$DD) = split('-',$vals[$tix]);
+ $MM = $MM - 1;
+ my (undef, undef, undef, $rMDay, $rMon, $rYear) = Encode::restrictionDate(timelocal(0,0,0,$DD,$MM,$YYYY));
+ $rMon = $rMon + 1;
+ $releaseDate = sprintf("%04d-%02d-%02d", (1900 + $rYear),$rMon,$rMDay);
}
+ $tix++;
+ }
+ my %remove;
+ $remove{project} = 1;
+ $remove{composite} = 1;
+ $remove{fileName} = 1;
+ $remove{dateSubmitted} = 1;
+ $remove{dateUnrestricted} = 1;
+ ( $tagRef, $valRef ) = metadataArraysRemoveHash( \@tags,\@vals,\%remove );
+ @tags = @{$tagRef};
+ @vals = @{$valRef};
+ my @metas = metadataArraysJoin( \@tags,\@vals );
+ $metaData{metadata} = join("; ", @metas);
}
}
} else {
if($dataType eq "fastq" || $dataType eq "tagAlign" || $dataType eq "csfasta" || $dataType eq "csqual") {
$metaData{type} = $dataType;
- $typePrefix = "ENCODE ";
my ($YYYY,$MM,$DD) = split('-',$file[3]);
$MM = $MM - 1;
my (undef, undef, undef, $rMDay, $rMon, $rYear) = Encode::restrictionDate(timelocal(0,0,0,$DD,$MM,$YYYY));
$rMon = $rMon + 1;
$releaseDate = sprintf("%04d-%02d-%02d", (1900 + $rYear),$rMon,$rMDay);
}
# pull metadata out of README.txt (if any is available).
my $active = 0;
for my $line (@readme) {
chomp $line;
if($line =~ /file: $fileName/) {
$active = 1;
} elsif($active) {
if($line =~ /(.*):(.*)/) {
my ($name, $var) = ($1, $2);
$metaData{$name} = $var if($name !~ /restricted/i);
} else {
last;
}
}
}
}
my @tmp;
if(my $type = $metaData{type}) {
- push(@tmp, "${typePrefix}type: $type");
+ push(@tmp, "type: $type");
delete $metaData{type};
}
+ if($metaData{metadata}) {
+ if($metaData{parent}) {
+ push(@tmp, $metaData{parent});
+ delete $metaData{parent};
+ }
+ push(@tmp, $metaData{metadata});
+ } else {
if(my $cell = $metaData{cell}) {
push(@tmp, "cell: $cell");
delete $metaData{cell};
}
push(@tmp, "$_: " . $metaData{$_}) for (sort keys %metaData);
+ }
my $metaData = join("; ", @tmp);
# Now file contains: [file without path] [tableName] [size] [date] [releaseDate] [fullpath/file]
htmlTableRow(*OUT_FILE,$fileName,$file[2],$file[3],$releaseDate,$metaData);
}
print OUT_FILE "</TABLE>\n";
htmlEndPage(*OUT_FILE);
exit 0;