b4fd34e71f94b2847253ef2b20cd3d8aa2acaa24
jcasper
Mon May 19 10:31:30 2025 -0700
Fix a few issues with MaveDB json parse (score bounds, item count display for
cells with multiple values). Also committing trackDb and doc for a native version, refs #31812
diff --git src/hg/oneShot/parseMave/parseMave.pl src/hg/oneShot/parseMave/parseMave.pl
index 971d746dfbf..5d7b92ee676 100644
--- src/hg/oneShot/parseMave/parseMave.pl
+++ src/hg/oneShot/parseMave/parseMave.pl
@@ -1,572 +1,573 @@
# parseMave.pl - parse the dcd_mappings json files provided by MaveDB into a series of bed records,
# where each record is a heatmap of the data for one experiment. Also writes out a separate fasta
# file containing the reference sequence for each experiment.
use JSON::PP;
use HTML::Entities;
use Encode;
use List::Util qw(min max);
sub cleanWhitespace($)
# There seems to be a variety of weird newline handling, depending on who curated the record. A few
# records have DOS-style newlines, others have "\r\n" (the literal four-character string, not actual newline characters).
# Going to try to replace them all with <br> here, and reduce all other whitespace to single spaces.
{
my $str = shift;
$str =~ s/\\r\\n/<br>/g;
$str =~ s/\\n/<br>/g;
$str =~ s/(\012|\015)+/<br>/g;
$str =~ s/\s+/ /g;
return $str;
}
$jsonBlob = "";
while ($line = <>)
{
$jsonBlob .= $line;
}
if ($jsonBlob !~ m/hgvs\.p/)
{
# no point continuing
exit(0);
}
$jsonObj = decode_json($jsonBlob);
+# Start indexing into the json object to find all of the mapped scores
+#
$mapped_scores_ref = $$jsonObj{"mapped_scores"};
$itemStart = -1;
$itemEnd = -1;
$chrom = "";
$urnID = "";
-%exonList = ();
+%exonList = (); # key: "A,B,C", where A and B are the start and end coordinates of the exon in BED coordinates.
+ # C is the HGVS-like p string for the variant
+ # value: "$thisID $pString: $scoreTrunc", being the id, HGVS-like p. string, and score
+ # The %exonDups list is keyed the same way, containing overflow items for each exon (if any)
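+# Illustrative example (made-up combination of the sample values mentioned in the loop below):
+#   key "1324787,1324790,p.Asn1774Thr" -> value "#2 p.Asn1774Thr: -0.0757"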
%txStarts = ();
%txSizes = ();
foreach $record_ref (@$mapped_scores_ref)
{
# record_ref should be a hash
# We'll want "mavedb_id" (example "urn:mavedb:00000097-s-1#2")
# "score" (example -0.0757349427170253)
# "post_mapped" -> "expressions" -> "value" (example "NP_009225.1:p.Asn1774Thr")
# ASSUMING that expressions -> syntax is hgvs.p.
# NOTE: expressions is an array, so you'll have to loop through the contents to find the hgvs
$id = $$record_ref{"mavedb_id"};
$score = $$record_ref{"score"};
next if ($score eq "");
$post = $$record_ref{"post_mapped"};
$type = $$post{"type"};
next if ($type eq "Haplotype"); # we're not set up to handle this right now
next if ($type ne "VariationDescriptor"); # we're not set up to handle anything but VariationDescriptor right now
$expr = $$post{"expressions"};
$hgvs = "";
foreach $expr_i (@$expr)
{
if ($$expr_i{"syntax"} eq "hgvs.p")
{
$hgvs = $$expr_i{"value"};
}
}
next if ($hgvs eq "");
# There are some HGVS term types (or HGVS-like terms) that we're skipping over for now.
# NP_003336.1:p.159delinsHis is causing a core dump
# NP_003336.1:p.Ser158_159delinsSerSer is being mapped to
# chr16 1324787 1324790 Ser158_159delinsSerSer
# Seems like that should be more than one codon wide.
if ($hgvs =~ m/(NP_[^:]+):p\.[a-zA-Z]{3}\d+[a-zA-Z]{3}$/)
{ $ncbiProt = $1; }
elsif ($hgvs =~ m/(NP_[^:]+):p\.[a-zA-Z]{3}\d+=$/)
{ $ncbiProt = $1; }
else
{ next; }
$partialHgvs = $hgvs;
$partialHgvs =~ s/(\d+)\D+$/$1/; # strip off the destination AA; we only care about the genome mapping
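# e.g. "NP_009225.1:p.Asn1774Thr" is cached under "NP_009225.1:p.Asn1774"; any other variant at that codon
# reuses the same genomic mapping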
if (!defined $coords{$partialHgvs})
{
# Using another oneShot tool to convert HGVS terms to BED records
$bedMapping = `~/bin/x86_64/hgvsToBed hg38 "$hgvs"` or die "hgvsToBed failed on $hgvs; have you built it?";
$coords{$partialHgvs} = $bedMapping;
}
else
{
$bedMapping = $coords{$partialHgvs};
}
next if ($bedMapping eq ""); # if we can't map it, it's useless. Discard
if ($bedMapping !~ m/^\S+\s+\d+\s+\d+/)
{
# something weird happened. Panic.
die "Something went wrong, hgvs.p $hgvs was mapped to $bedMapping";
}
@f = split /\s+/, $bedMapping;
if ($chrom eq "")
{
$chrom = $f[0];
}
elsif ($chrom ne $f[0])
{
die "Whoa, multiple chromosomes found in file (compare $chrom, " . $f[0] . ")";
}
if ($itemStart == -1)
{ $itemStart = $f[1]; }
elsif ($itemStart > $f[1])
{ $itemStart = $f[1]; }
if ($itemEnd == -1)
{ $itemEnd = $f[2]; }
elsif ($itemEnd < $f[2])
{ $itemEnd = $f[2]; }
$oldUrnID = $urnID;
if ($id =~ m/^urn:mavedb:([^#]+)(#\d+)/)
{
($urnID, $thisID) = ($1, $2);
if ($oldUrnID ne "" && $oldUrnID ne $urnID)
{
die "I guess we change urn IDs mid-file now? $oldUrnID vs $urnID";
}
}
else
{
die "Unexpected id string: $id did not contain an urn:mavedb identifier";
}
# Right about here, we want to parse f[1] and f[2], which are the start and end of the codon
# We want to compare that against the exon coordinates against the bounds from the PSL record for
# the transcript. Which means we need to get that.
if (!defined $txStarts{$ncbiProt})
{
$st_size = `hgsql hg38 -Ne 'select psl.tStarts, psl.blockSizes from ncbiRefSeqPsl psl join ncbiRefSeqLink link on psl.qName = link.mrnaAcc where link.protAcc = "$ncbiProt"'` or die "No exons for $ncbiProt in ncbiRefSeqLink/ncbiRefSeqPsl";
chomp $st_size;
@g = split /\t/, $st_size;
$txStarts{$ncbiProt} = $g[0];
$txSizes{$ncbiProt} = $g[1];
}
@thisStarts = split /,/, $txStarts{$ncbiProt};
@thisSizes = split /,/, $txSizes{$ncbiProt};
$thisStart = $f[1];
$thisEnd = $f[2];
# hgvsToBed mapped us to this position using the same record that we pulled txStarts and txSizes from,
# but it ignored the exon structure. We need to build (maybe) this one "exon" into multiple exons.
# Going to do this kinda in reverse, actually. Loop through the transcript's exons. Either delete
# them if they have no overlap with the mapped range, or else truncate them to it (by adjusting
# sizes and maybe starts).
# Except actual splicing would screw with array offsets, and we don't need that kind of headache.
# So we'll just set thisStarts[$k] = -1 and filter them out subsequently.
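# Worked example (made-up coordinates): if hgvsToBed returned the range [1001,1072) for a codon and the
# transcript's exon blocks are [900,1002) and [1070,1200), the first block is clipped to the 1-base range
# [1001,1002) and the second to the 2-base range [1070,1072); blocks with no overlap at all get
# thisStarts[$k] set to -1 and are skipped.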
for ($k=0; $k<@thisStarts; $k++)
{
if ($thisStart > $thisStarts[$k] + $thisSizes[$k] || $thisEnd < $thisStarts[$k])
{
$thisStarts[$k] = -1;
next;
}
if ($thisStarts[$k] < $thisStart)
{
# move the start up and trim the size down
$thisSizes[$k] -= ($thisStart - $thisStarts[$k]);
$thisStarts[$k] = $thisStart;
}
if ($thisStarts[$k] + $thisSizes[$k] > $thisEnd)
{
# trim back the size to fit
$thisSizes[$k] = $thisEnd - $thisStarts[$k];
}
}
for ($k=0; $k<@thisStarts; $k++)
{
next if $thisStarts[$k] == -1;
- #$posString = $f[1] . "," . $f[2];
$posString = $thisStarts[$k] . "," . ($thisStarts[$k]+$thisSizes[$k]);
if ($hgvs =~ m/:(.*)/)
{
$pString = $1;
$posString .= ",$pString";
$scoreTrunc = sprintf("%.4f", $score);
if (defined $exonList{$posString})
{
$exonDups{$posString} .= "
$thisID $pString: $scoreTrunc";
}
else
{
$exonList{$posString} = "$thisID $pString: $scoreTrunc";
}
}
else
{
die "Huh, hgvs string didn't have a p term? $hgvs";
}
}
}
# let's check that something actually happened, and skip out if the file was functionally
# empty (e.g. it was only haplotypes)
if (scalar(keys %exonList) == 0)
{
exit(0);
}
# At this point, %exonList takes position strings (of exons) to the label-type info for them
# need to manually sort the exons at this point
# a bit of a hack here, but numeric sort should sort by the start positions of each exon since
# that's the start of each key in %exonList.
#@sorted_exons = sort {$a <=> $b} (keys %exonList);
# so %coords is actually the list of distinct mappings from partial HGVS terms to bed coordinates
# That makes its size the number of exons that we need. We can sanity check the ranges in that
# list if we want.
#
# Okaaaay, so that's no longer accurate. Some of those codon bed mappings may have resulted in multiple
# "exons" because they were split across intron boundaries. We can recover what we need
# from the keys of %exonList instead.
# I WAS using coords because it imposed uniqueness on the exon coordinates. So I have to reimplement that.
-#foreach $key (keys %coords)
%uniquify = ();
foreach $key (keys %exonList)
{
-# $bedStr = $coords{$key};
-# $bedStr =~ m/^\S+\s+(\d+)\s+(\d+)/;
- $bedStr = $key; # $exonList{$key};
+ $bedStr = $key;
$bedStr =~ m/^(\d+),(\d+),/;
$st_en = "$1,$2";
if (!defined $uniquify{$st_en})
{
$uniquify{$st_en} = 1;
$start_end[@start_end] = $st_en;
}
}
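# Note: the numeric sort below compares "start,end" strings with <=>, so Perl's string-to-number conversion
# only sees the leading digits; that orders the exons by start position, which is all we need here.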
@sorted_pos = sort {$a <=> $b} (@start_end);
$sizeString = "";
$exonStarts = "";
$lasten = "";
$lastst = "";
%exonIxs = ();
for ($i=0; $i<@sorted_pos; $i++)
{
$sorted_pos[$i] =~ m/(\d+),(\d+)/;
($st, $en) = ($1, $2);
if ($i > 0 && $st < $lasten)
{
# something went horribly wrong - one exon started before the previous one ended.
# Track down the previous coordinates so we can report both sets.
# inefficient, but at least we don't do it often
foreach $key (keys %exonList)
{
$bedStr = $key;
- #$bedStr =~ m/^\S+\s+(\d+)\s+(\d+)/;
$bedStr =~ m/^(\d+),(\d+)/;
$st_en = "$1,$2";
if ($st_en eq "$lastst,$lasten")
{
$prevbad = $key;
}
if ($st_en eq "$st,$en")
{
$thisbad = $key;
}
}
die "One item started at $st ($thisbad) after another ended at $lasten ($prevbad)";
}
$width = $en - $st;
$sizeString .= "$width,";
$exonStarts .= ($st - $itemStart) . ",";
$exonIxs{"$st,$en"} = $i;
$lasten = $en;
$lastst = $st;
}
$outstring = "$chrom\t$itemStart\t$itemEnd\t$urnID\t1000\t+\t$itemStart\t$itemEnd\t0\t";
$outstring .= scalar(@sorted_pos) . "\t";
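# That covers the fixed BED fields so far: chrom, chromStart, chromEnd, name (the urn ID), score (1000),
# strand (+), thickStart, thickEnd, itemRgb (0), and blockCount. blockSizes and blockStarts ($sizeString and
# $exonStarts) get appended further down, after the score/label matrices are built.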
##################################################
# Now to assemble the score and label arrays
%aaNames = (
"Ala" => 'A',
"Arg" => 'R',
"Asn" => 'N',
"Asp" => 'D',
"Cys" => 'C',
"Glu" => 'E',
"Gln" => 'Q',
"Gly" => 'G',
"His" => 'H',
"Ile" => 'I',
"Leu" => 'L',
"Lys" => 'K',
"Met" => 'M',
"Phe" => 'F',
"Pro" => 'P',
"Ser" => 'S',
"Thr" => 'T',
"Trp" => 'W',
"Tyr" => 'Y',
"Val" => 'V',
"Ter" => '*',
"del" => '-'
);
# This takes a single-letter code to the index of the row in the heatmap it appears in
%aaIndex = (
'A' => 0,
'V' => 1,
'L' => 2,
'I' => 3,
'M' => 4,
'F' => 5,
'Y' => 6,
'W' => 7,
'R' => 8,
'H' => 9,
'K' => 10,
'D' => 11,
'E' => 12,
'S' => 13,
'T' => 14,
'N' => 15,
'Q' => 16,
'G' => 17,
'C' => 18,
'P' => 19,
'-' => 20,
'*' => 21,
'=' => 22
);
# This should probably be replaced with the keys of the hash, but we want to make sure they
# appear in the right order!
$rowLabels = "A,V,L,I,M,F,Y,W,R,H,K,D,E,S,T,N,Q,G,C,P,-,*,=";
$rowCount = scalar(keys %aaIndex);
@exonScores = ();
for ($i=0; $i < scalar(@sorted_pos)*$rowCount; $i++)
{
$exonScores[$i] = "";
$exonLabels[$i] = "";
}
$hasDupe = 0;
foreach $exon (keys %exonList)
{
$exon =~ m/^(\d+),(\d+)/;
($exonStart, $exonEnd) = ($1, $2);
if (defined $exonIxs{"$exonStart,$exonEnd"})
{
$exonIx = $exonIxs{"$exonStart,$exonEnd"};
}
else
{
# shouldn't ever happen, since we just populated exonIxs from the same list
# that filled exonList
die "Unexpected exon in the bagging area - $exonStart,$exonEnd vs " . (join ";", keys %exonIxs);
}
# now I need to populate the exonScores and exonLabels matrices. There's a fixed order for the destination AA,
# so really I just have to calculate the position in the list as $exonIx + $aaIx*exon_count
$label = $exonList{$exon};
if ($label =~ m/#\d+ (\S+): (\S+)/)
{
($hgvs, $score) = ($1, $2);
}
else
{
die "unexpected label failure: $label";
}
if ($hgvs =~ m/\.(\w{3})\d+=$/)
{
$destAA = $1;
$sourceAA = $destAA;
}
elsif ($hgvs =~ m/(\w{3})\d+(\w{3})$/)
{
$destAA = $2;
$sourceAA = $1;
}
else
{
die "unexpected failure to find dest AA: $hgvs";
}
$aaLetter = $aaNames{$destAA};
if (!defined $aaIndex{$aaLetter})
{
die "Don't have an index for AA: -$aaLetter- (from $destAA)";
}
$aaIx = $aaIndex{$aaLetter};
$ix = $exonIx + $aaIx*scalar(@sorted_pos);
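# e.g. (hypothetical) with 10 codon columns, a variant in column 4 whose destination AA is Leu (row index 2)
# lands at cell 4 + 2*10 = 24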
if ($destAA ne $sourceAA)
{
$exonScores[$ix] = $score;
$exonLabels[$ix] = $label;
if (defined $exonDups{$exon})
{
$hasDupe = 1;
if ($exonScores[$ix] !~ m/\|/)
- { $exonScores[$ix] .= "|"; }
+ {
+ $exonCount = 1 + scalar(() = $exonDups{$exon} =~ m/<br>/g); # awkward, but gives the right count
+ $exonScores[$ix] .= "|$exonCount";
+ }
$exonLabels[$ix] .= $exonDups{$exon};
}
}
else
{
# this is synonymous
$exonScores[$ix] = "#FFD700";
$exonLabels[$ix] = "Wild type";
$aaIx = $aaIndex{"="};
$ix = $exonIx + $aaIx*scalar(@sorted_pos);
$exonScores[$ix] = $score;
$exonLabels[$ix] = $label;
if (defined $exonDups{$exon})
{
$hasDupe = 1;
if ($exonScores[$ix] !~ m/\|/)
{
$exonCount = 1 + scalar(() = $exonDups{$exon} =~ m/<br>/g); # awkward, but gives the right count
$exonScores[$ix] .= "|$exonCount";
}
$exonLabels[$ix] .= $exonDups{$exon};
}
}
}
if ($hasDupe)
{
# just a sanity check, also sometimes useful for noting which datasets have dups for investigation
# of whether that bit of mouseover code is working properly
print STDERR "This one has dups\n";
}
# at this point, we have
# $sizeString : the exon sizes string
# $exonStarts: the exon starts string
# $rowCount (the number of heatmap rows)
# $rowLabels (the row labels, in the same order as %aaIndex)
# @exonScores
# @exonLabels
# ID, where ID needs to be replaced with $urnID
#
# All of this is getting appended to $outstring
# sanity check
if (scalar(@exonScores) != $rowCount * scalar(@sorted_pos))
{
die "exonScores list is wrong, compare " . scalar(@exonScores) . " with " . ($rowCount*scalar(@sorted_pos));
}
if (scalar(@exonScores) != scalar(@exonLabels))
{
die "mismatch in scores and labels lengths, compare " . scalar(@exonScores) . " with " . scalar(@exonLabels);
}
$outstring .= "$sizeString\t$exonStarts";
# that's the end of the basic bed fields. Now for the extra heatmap-specific fields
$outstring .= "\t$rowCount\t$rowLabels\t";
# now write out the lower bound, midpoint, and upper bound for the track
# These should be in ascending order
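# e.g. (hypothetical) scores ranging from -2 to 1 give a lower bound of -2, midpoint -0.5, upper bound 1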
-$outstring .= join ",", (min(@exonScores), (min(@exonScores)+max(@exonScores))/2, max(@exonScores));
+@justScores = grep (/^[^#]/, @exonScores); # drop the "#color" cells and empty strings; they numify to 0 and would skew the bounds when all real scores are positive or all negative
+$outstring .= join ",", (min(@justScores), (min(@justScores)+max(@justScores))/2, max(@justScores));
$outstring .= ",\t";
# now the colors for each block
$outstring .= join ",", ("blue", "silver", "red");
$outstring .= ",\t";
# now for the cell scores themselves, along with the cell labels
$outstring .= join ",", @exonScores;
$outstring .= ",\t";
$outstring .= join ",", @exonLabels;
$outstring .= ",\t";
$metadata_ref = $$jsonObj{"metadata"};
$shortDescription = $$metadata_ref{"shortDescription"};
$shortDescription =~ s/\\u(.{4})/chr(hex($1))/ge; # thanks to https://www.perlmonks.org/?node_id=796799
# this deals with unicode characters.
$shortDescription = encode_entities($shortDescription); # this HTML-encodes everything
$shortDescription = cleanWhitespace($shortDescription);
-#$shortDescription =~ s/\s+/ /g;
$outstring .= "$shortDescription";
# everything above covers the mandatory fields for a heatmap (including a string for the heatmap title area)
# What follows next is metadata fields for hgc display
$url = "ID";
$url =~ s/ID/$urnID/g;
$outstring .= "\t$url";
$abstract = $$metadata_ref{"abstractText"};
$abstract =~ s/\\u(.{4})/chr(hex($1))/ge;
$abstract = encode_entities($abstract); # this HTML-encodes everything
$abstract = cleanWhitespace($abstract);
-#$abstract =~ s/\s+/ /g;
$outstring .= "\t$abstract";
$methodText = $$metadata_ref{"methodText"};
$methodText =~ s/\\u(.{4})/chr(hex($1))/ge;
$methodText = encode_entities($methodText); # this HTML-encodes everything
$methodText = cleanWhitespace($methodText);
$expRef = $$metadata_ref{"experiment"};
$expMethodText = $$expRef{"methodText"};
if ($expMethodText =~ m/\w+/)
{
$expMethodText =~ s/\\u(.{4})/chr(hex($1))/ge;
$expMethodText = encode_entities($expMethodText); # this HTML-encodes everything
$expMethodText = cleanWhitespace($expMethodText);
$methodText .= "
$expMethodText";
}
$outstring .= "\t$methodText";
$outstring .= "\n";
print "$outstring";
open($seqFile, ">>", "MaveDBSeqFile.fa") or die "Couldn't open MaveDBSeqFile.fa for append: $!";
-#metadata -> targetGenes (array) -> name, targetSequence -> sequence
$targetGenesRef = $$metadata_ref{"targetGenes"};
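# write one fasta record per target gene: a ">urnID geneName" header line followed by the target sequence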
foreach $targetGene_ref (@$targetGenesRef)
{
$targetGene_name = $$targetGene_ref{"name"};
$targetSeq_ref = $$targetGene_ref{"targetSequence"};
$sequence = $$targetSeq_ref{"sequence"};
- print $seqFile ">$targetGene_name $urnID\n";
+ print $seqFile ">$urnID $targetGene_name\n";
print $seqFile "$sequence\n";
}
close($seqFile);