9b6653339725825ef07c07651fd3003ac1fd88b6 hiram Fri May 15 12:49:43 2026 -0700 adding in html description page refs #31811 diff --git src/hg/makeDb/doc/vgp577way/vgp577way.pl src/hg/makeDb/doc/vgp577way/vgp577way.pl new file mode 100755 index 00000000000..6961e84ed0a --- /dev/null +++ src/hg/makeDb/doc/vgp577way/vgp577way.pl @@ -0,0 +1,238 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +my $argc = scalar(@ARGV); +if ($argc != 3) { + printf STDERR "usage: vgp577way.pl <referenceAccession> <577.acc.sciName.comName.clade.tsv> <order.list>\n"; + printf STDERR " referenceAccession - accession of the reference species (speciesCodonDefault)\n"; + printf STDERR " the .tsv file has four columns: accession, sciName, comName, clade\n"; + printf STDERR " the order.list has two columns: featureBitsMeasure, accession\n"; + printf STDERR "output to stdout is a trackDb.txt composite track definition\n"; + exit 255; +} + +my $refAcc = shift; +my $tsvFile = shift; +my $orderFile = shift; + +# mapping from order.list names to TSV accessions for special cases +my %aliasToTsv = ( + "hs1" => "GCA_009914755.4", + "hg38" => "GCA_000001405.15", + "mm39" => "GCA_000001635.9", +); + +# read the TSV: accession -> (sciName, comName, clade) +my %sciName; # key is accession (TSV column 1), value is scientific name +my %comName; # key is accession, value is common name +my %clade; # key is accession, value is clade string + +open (my $fh, "<", $tsvFile) or die "can not read $tsvFile"; +while (my $line = <$fh>) { + chomp $line; + my ($acc, $sci, $com, $cl) = split('\t', $line); + $sciName{$acc} = $sci; + $comName{$acc} = $com; + $clade{$acc} = $cl; +} +close ($fh); + +# read order.list: ordered list of accessions (these are the MAF names) +my @orderAcc; # accessions in coverage order (highest first) + +open ($fh, "<", $orderFile) or die "can not read $orderFile"; +while (my $line = <$fh>) { + chomp $line; + next if ($line =~ m/^\s*$/); + my ($count, $acc) = split('\s+', $line); + push (@orderAcc, $acc); +} +close ($fh); + +# helper: given a MAF name (from order.list), return the TSV accession for lookup +sub tsvKey { + my ($mafName) = @_; + return $aliasToTsv{$mafName} if (defined($aliasToTsv{$mafName})); + return $mafName; +} + +# verify reference accession is in the TSV +my $refTsvKey = tsvKey($refAcc); +if (!defined($clade{$refTsvKey})) { + printf STDERR "ERROR: reference accession '%s' not found in %s\n", $refAcc, $tsvFile; + exit 255; +} + +# verify all order.list entries are in the TSV +foreach my $acc (@orderAcc) { + my $key = tsvKey($acc); + if (!defined($clade{$key})) { + printf STDERR "ERROR: accession '%s' (tsv key '%s') from %s not found in %s\n", + $acc, $key, $orderFile, $tsvFile; + exit 255; + } +} + +# collect unique clades in order of first appearance in order.list +my @cladeOrder; +my %cladeSeen; +foreach my $acc (@orderAcc) { + my $cl = $clade{tsvKey($acc)}; + if (!defined($cladeSeen{$cl})) { + push (@cladeOrder, $cl); + $cladeSeen{$cl} = 1; + } +} +# also include the reference clade if not already present +my $refClade = $clade{$refTsvKey}; +if (!defined($cladeSeen{$refClade})) { + push (@cladeOrder, $refClade); + $cladeSeen{$refClade} = 1; +} + +# convert clade string to a trackDb-safe identifier +# e.g. "mammals-euarchontoglires" -> "Mammals_Euarchontoglires" +# e.g. "fishes-ray-finned" -> "Fishes_Ray_Finned" +sub cladeToId { + my ($cl) = @_; + my @parts = split(/[-]/, $cl); + foreach my $p (@parts) { + $p = ucfirst($p); + # handle sub-parts joined by underscore (from split on -) + # e.g. "ray-finned" -> "Ray_Finned" + } + return join("_", @parts); +} + +# build sGroup lines: for each clade, list accessions in order.list order +# these use the MAF names (accessions from order.list) +my %sGroupMembers; # key is clade string, value is arrayref of MAF names +foreach my $acc (@orderAcc) { + my $cl = $clade{tsvKey($acc)}; + push (@{$sGroupMembers{$cl}}, $acc); +} + +# the total species count +my $totalSpecies = scalar(@orderAcc) + 1; # +1 for reference + +# output the composite track +printf "track cons577way\n"; +printf "compositeTrack on\n"; +printf "shortLabel VGP %d-way\n", $totalSpecies; +printf "longLabel VGP %d species Cactus multiple alignment\n", $totalSpecies; +if ( -s "../../contrib/vgp577way/${refAcc}.vgp577wayPhyloP.bw" ) { + printf "subGroup1 view Views align=Multiz_Alignments phyloP=Basewise_Conservation_(phyloP)\n"; +} else { + printf "subGroup1 view Views align=Multiz_Alignments\n"; +} +printf "dragAndDrop subTracks\n"; +printf "visibility hide\n"; +printf "type bed 4\n"; +printf "group compGeno\n"; +printf "html contrib/vgp577way/vgp577way\n"; +printf "\n"; + +# the alignment view +printf " track cons577wayViewalign\n"; +printf " shortLabel Cactus %d-way\n", $totalSpecies; +printf " view align\n"; +printf " visibility pack\n"; +printf " viewUi on\n"; +printf " subTrack cons577way\n"; +printf "\n"; + +# the bigMaf track +printf " track vgp577way\n"; +printf " subTrack cons577wayViewalign\n"; +printf " shortLabel VGP %d-way\n", $totalSpecies; +printf " longLabel VGP Cactus alignment on %d species\n", $totalSpecies; +printf " subGroups view=align\n"; +printf " noInherit on\n"; +printf " type bigMaf\n"; +printf " viewUi on\n"; +printf " itemFirstCharCase noChange\n"; +printf " group compGeno\n"; +printf " bigDataUrl contrib/vgp577way/%s.vgp577way.bb\n", $refAcc; +printf " summary contrib/vgp577way/%s.vgp577waySummary.bb\n", $refAcc; +printf " irows on\n"; +printf " color 0, 10, 100\n"; +printf " altColor 0,90,10\n"; + +# speciesCodonDefault +printf " speciesCodonDefault %s\n", $refAcc; + +# speciesGroups - list of clade identifiers +printf " speciesGroups"; +foreach my $cl (@cladeOrder) { + printf " %s", cladeToId($cl); +} +printf "\n"; + +# sGroup_ lines - accessions in order.list order within each clade +foreach my $cl (@cladeOrder) { + printf " sGroup_%s", cladeToId($cl); + foreach my $acc (@{$sGroupMembers{$cl}}) { + printf " %s", $acc; + } + printf "\n"; +} + +# speciesLabels - map accession to common name +printf " speciesLabels"; +foreach my $acc (@orderAcc) { + my $key = tsvKey($acc); + printf " %s=\"%s\"", $acc, $comName{$key}; +} +printf "\n"; + +# speciesDefaultOff - all species from order.list +printf " speciesDefaultOff"; +foreach my $acc (@orderAcc) { + printf " %s", $acc; +} +printf "\n"; + +if ( -s "../../contrib/vgp577way/${refAcc}.vgp577wayPhyloP.bw" ) { + +my $bwInfo = `bigWigInfo ../../contrib/vgp577way/${refAcc}.vgp577wayPhyloP.bw | egrep "min:|max:" | awk '{printf "%.2f\\n", \$NF}'| xargs echo | tr ' ' ':'`; +chomp $bwInfo; +my ($bwMin, $bwMax) = split(':', $bwInfo); +my $viewLimitMin = $bwMin / 2.0; +my $viewLimitMax = $bwMax / 2.0; +my $bigWigInfo = $bwInfo; +$bigWigInfo =~ s/:/ /; + +printf " + # PhyloP conservation + track cons577wayViewphyloP + shortLabel Basewise Conservation (phyloP) + view phyloP + visibility full + subTrack cons577way + viewLimits %.1f:%.1f +", $viewLimitMin, $viewLimitMax; + +printf " + track phyloP577wayREV + subTrack cons577wayViewphyloP + subGroups view=phyloP + shortLabel 577 phyloP + longLabel VGP %d species Basewise Conservation by PhyloP phyloFit + configurable on + noInherit on + type bigWig %s + bigDataUrl contrib/vgp577way/%s.vgp577wayPhyloP.bw + maxHeightPixels 100:50:11 + viewLimits %.1f:%.1f + autoScale off + spanList 1 + windowingFunction mean + color 60,60,140 + altColor 140,60,60 + priority 1 + logo on +", $totalSpecies, $bigWigInfo, $refAcc, $viewLimitMin, $viewLimitMax; + +}