f464e85e868ff6c5ef8d97c24cb990078a8fd723
kate
  Wed Apr 3 11:28:55 2019 -0700
Tools for factorSource format. refs #21139

diff --git src/hg/makeDb/hgBedsToBedExps/bedExpsToFactorSource.pl src/hg/makeDb/hgBedsToBedExps/bedExpsToFactorSource.pl
new file mode 100755
index 0000000..53468f9
--- /dev/null
+++ src/hg/makeDb/hgBedsToBedExps/bedExpsToFactorSource.pl
@@ -0,0 +1,46 @@
+#!/usr/bin/env perl
+
+# bedExpsToFactorSource.pl - create clusters file in BED5+ format, with additional fields:
+#       6: expCount: number of experiments having non-zero scores
+#       7: expIds: IDs of experiments
+#       8: expScores: scores
+#
+# Created to provide a more efficient representation for UCSC factorSource type tracks.
+# The input clusters file is UCSC BED15 format (see genome.ucsc.edu/FAQ/FAQformat.html);
+# This format is used in later versions of ENCODE TFBS Clusters tracks.
+
+use warnings;
+use strict;
+
+scalar(@ARGV) == 1 or die "usage: bedExpsToFactorSource.pl clusters.bed\n";
+
+my $clusterFile = $ARGV[0];
+
+# read clusters file in BED5 format (generated by hgBedExpsToBed)
+open(CF, $clusterFile) or die "can't read $clusterFile\n";
+while (<CF>) {
+    chomp;
+    my ($chrom, $start, $end, $name, $score) = split('\t');
+    my @fields = split("\t");
+    my @expIds = split(",", $fields[13]);
+    my @expScores = split(",", $fields[14]);
+    my @exps;
+    my @scores;
+    my $i = -1;
+    my $expCount = 0;
+
+    # grab experiments with non-zero scores and look up source
+    foreach my $expScore (@expScores) {
+        $i++;
+        next if (!$expScore);
+        $expCount++;
+        my $exp = $expIds[$i];
+        push(@exps, $exp);
+        push(@scores, $expScore);
+    }
+    my $exps = join(',', @exps);
+    my $scores = join(',', @scores);
+
+    # output BED 5+ line
+    print join("\t", $chrom, $start, $end, $name, $score, $expCount, $exps, $scores), "\n";
+}