src/hg/oneShot/parseMave/parseMave.pl b4fd34e71f94b2847253ef2b20cd3d8aa2acaa24

b4fd34e71f94b2847253ef2b20cd3d8aa2acaa24
jcasper
  Mon May 19 10:31:30 2025 -0700
Fix a few issues with MaveDB json parse (score bounds, item count display for
cells with multiple values.  Also committing trackDb and doc for a native version, refs #31812

diff --git src/hg/oneShot/parseMave/parseMave.pl src/hg/oneShot/parseMave/parseMave.pl
index 971d746dfbf..5d7b92ee676 100644
--- src/hg/oneShot/parseMave/parseMave.pl
+++ src/hg/oneShot/parseMave/parseMave.pl
@@ -23,38 +23,43 @@
 
 $jsonBlob = "";
 while ($line = <STDIN>)
 {
     $jsonBlob .= $line;
 }
 
 if ($jsonBlob !~ m/hgvs\.p/)
 {
     # no point continuing
     exit(0);
 }
 
 $jsonObj = decode_json($jsonBlob);
 
+# Start indexing into the json object to find all of the mapped scores
+#
 $mapped_scores_ref = $$jsonObj{"mapped_scores"};
 
 $itemStart = -1;
 $itemEnd = -1;
 $chrom = "";
 $urnID = "";
 
-%exonList = ();
+%exonList = ();   # key: "A,B,C", where A and B are the start and end coordinates of the exon in BED coordinates.
+                  #      C is the HGVS-like p string for the variant
+                  # value: "$thisID $pString: $scoreTrunc", being the id, HGVS-like p. string, and score
+                  # The %exonDups list is keyed the same way, containing overflow items for each exon (if any)
 %txStarts = ();
 %txSizes = ();
 
 foreach $record_ref (@$mapped_scores_ref)
 {
     # record_ref should be a hash
     # We'll want "mavedb_id" (example "urn:mavedb:00000097-s-1#2")
     #            "score" (example -0.0757349427170253)
     #            "post_mapped" -> "expressions" -> "value" (example "NP_009225.1:p.Asn1774Thr")
     #               ASSUMING that expressions -> syntax is hgvs.p.
     #               NOTE: expressions is an array, so you'll have to loop through the contents to find the hgvs
 
     $id = $$record_ref{"mavedb_id"};
     $score = $$record_ref{"score"};
     next if ($score eq "");
@@ -176,31 +181,30 @@
         {
             # move the start up and trim the size down
             $thisSizes[$k] -= ($thisStart - $thisStarts[$k]);
             $thisStarts[$k] = $thisStart;
         }
         if ($thisStarts[$k] + $thisSizes[$k] > $thisEnd)
         {
             # trim back the size to fit
             $thisSizes[$k] = $thisEnd - $thisStarts[$k];
         }
     }
 
     for ($k=0; $k<@thisStarts; $k++)
     {
         next if $thisStarts[$k] == -1;
-        #$posString = $f[1] . "," . $f[2];
         $posString = $thisStarts[$k] . "," . ($thisStarts[$k]+$thisSizes[$k]);
         if ($hgvs =~ m/:(.*)/)
         {
             $pString = $1;
             $posString .= ",$pString";
             $scoreTrunc = sprintf("%.4f", $score);
             if (defined $exonList{$posString})
             {
                 $exonDups{$posString} .= "<br>$thisID $pString: $scoreTrunc";
             }
             else
             {
                 $exonList{$posString} = "$thisID $pString: $scoreTrunc";
             }
         }
@@ -225,65 +229,61 @@
 # need to manually sort the exons at this point
 # a bit of a hack here, but numeric sort should sort by the start positions of each exon since
 # that's the start of each key in %exonList.
 
 #@sorted_exons = sort {$a <=> $b} (keys %exonList);
 
 # so %coords is actually the list of distinct mappings from partial HGVS terms to bed coordinates
 # That makes its size the number of exons that we need.  We can sanity check the ranges in that
 # list if we want.
 #
 # Okaaaay, so that's no longer accurate.  Some of those codon bed mappings may have resulted in multiple
 # "exons" because they were split across intron boundaries.  We can recover what we need
 # from the keys of %exonList instead.  
 # I WAS using coords because it imposed uniqueness on the exon coordinates.  So I have to reimplement that.
 
-#foreach $key (keys %coords)
 %uniquify = ();
 foreach $key (keys %exonList)
 {
-#    $bedStr = $coords{$key};
-#    $bedStr =~ m/^\S+\s+(\d+)\s+(\d+)/;
-    $bedStr = $key; # $exonList{$key};
+    $bedStr = $key;
     $bedStr =~ m/^(\d+),(\d+),/;
     $st_en = "$1,$2";
     if (!defined $uniquify{$st_en})
     {
         $uniquify{$st_en} = 1;
         $start_end[@start_end] = $st_en;
     }
 }
 @sorted_pos = sort {$a <=> $b} (@start_end);
 $sizeString = "";
 $exonStarts = "";
 $lasten = "";
 $lastst = "";
 %exonIxs = ();
 
 for ($i=0; $i<@sorted_pos; $i++)
 {
     $sorted_pos[$i] =~ m/(\d+),(\d+)/;
     ($st, $en) = ($1, $2);
     if ($i > 0 && $st < $lasten)
     {
         # something went horribly wrong - one exon started before the previous one ended.
         # Track down the previous coordinates so we can report both sets.
         # inefficient, but at least we don't do it often
         foreach $key (keys %uniquify)
         {
             $bedStr = $uniquify{$key};
-            #$bedStr =~ m/^\S+\s+(\d+)\s+(\d+)/;
             $bedStr =~ m/^(\d+),(\d+)/;
             $st_en = "$1,$2";
             if ($st_en eq "$lastst,$lasten")
             {
                 $prevbad = $key;
             }
             if ($st_en eq "$st,$en")
             {
                 $thisbad = $key;
             }
         }
         die "One item started at $st ($thisbad) after another ended at $lasten ($prevbad)";
     }
     $width = $en - $st;
     $sizeString .= "$width,";
@@ -417,31 +417,34 @@
     {
         die "Don't have an index for AA: -$aaLetter- (from $destAA)";
     }
     $aaIx = $aaIndex{$aaLetter};
     $ix = $exonIx + $aaIx*scalar(@sorted_pos);
 
     if ($destAA ne $sourceAA)
     {
         $exonScores[$ix] = $score;
         $exonLabels[$ix] = $label;
 
         if (defined $exonDups{$exon})
         {
             $hasDupe = 1;
             if ($exonScores[$ix] !~ m/\|/)
-                { $exonScores[$ix] .= "|"; }
+                {
+                $exonCount = 1 + scalar(() = $exonDups{$exon} =~ m/<br>/g);  # awkward, but gives the right count
+                $exonScores[$ix] .= "|$exonCount";
+                }
             $exonLabels[$ix] .= $exonDups{$exon};
         }
     }
     else
     {
         # this is synonymous
         $exonScores[$ix] = "#FFD700";
         $exonLabels[$ix] = "Wild type";
 
         $aaIx = $aaIndex{"="};
         $ix = $exonIx + $aaIx*scalar(@sorted_pos);
 
         $exonScores[$ix] = $score;
         $exonLabels[$ix] = $label;
 
@@ -486,87 +489,85 @@
 }
 if (scalar(@exonScores) != scalar(@exonLabels))
 {
     die "mismatch in scores and labels lengths, compare " . scalar(@exonScores) . " with " . scalar(@exonLabels);
 }
 
 $outstring .= "$sizeString\t$exonStarts";
 
 # that's the end of the basic bed fields.  Now for the extra heatmap-specific fields
 
 $outstring .= "\t$rowCount\t$rowLabels\t";
 
 # now write out the lower bound, midpoint, and upper bound for the track
 # These should be in ascending order
 
-$outstring .= join ",", (min(@exonScores), (min(@exonScores)+max(@exonScores))/2, max(@exonScores));
+@justScores = grep (/^[^#]/, @exonScores);  # if all positive or negative, the #color values and "" would screw this up
+$outstring .= join ",", (min(@justScores), (min(@justScores)+max(@justScores))/2, max(@justScores));
 $outstring .= ",\t";
 
 # now the colors for each block
 
 $outstring .= join ",", ("blue", "silver", "red");
 $outstring .= ",\t";
 
 # now for the cell scores themselves, along with the cell labels
 
 $outstring .= join ",", @exonScores;
 $outstring .= ",\t";
 $outstring .= join ",", @exonLabels;
 $outstring .= ",\t";
 
 $metadata_ref = $$jsonObj{"metadata"};
 $shortDescription = $$metadata_ref{"shortDescription"};
 $shortDescription =~ s/\\u(.{4})/chr(hex($1))/ge;  # thanks to https://www.perlmonks.org/?node_id=796799
                                                    # this deals with unicode characters.
 $shortDescription = encode_entities($shortDescription);  # this HTML-encodes everything
 $shortDescription = cleanWhitespace($shortDescription);
-#$shortDescription =~ s/\s+/ /g;
 $outstring .= "$shortDescription";
 
 # everything above was mandatory fields for a heatmap (including a string for the heatmap title area)
 # What follows next is metadata fields for hgc display
 
 $url = "<a href=\"https://mavedb.org/score-sets/urn:mavedb:ID\" target=_blank>ID</a>";
 $url =~ s/ID/$urnID/g;
 $outstring .= "\t$url";
 
 $abstract = $$metadata_ref{"abstractText"};
 $abstract =~ s/\\u(.{4})/chr(hex($1))/ge;
 $abstract = encode_entities($abstract);  # this HTML-encodes everything
 $abstract = cleanWhitespace($abstract);
-#$abstract =~ s/\s+/ /g;
 $outstring .= "\t$abstract";
 
 
 $methodText = $$metadata_ref{"methodText"};
 $methodText =~ s/\\u(.{4})/chr(hex($1))/ge;
 $methodText = encode_entities($methodText);  # this HTML-encodes everything
 $methodText = cleanWhitespace($methodText);
 
 $expRef = $$metadata_ref{"experiment"};
 $expMethodText = $$expRef{"methodText"};
 if ($expMethodText =~ m/\w+/)
 {
     $expMethodText =~ s/\\u(.{4})/chr(hex($1))/ge;
     $expMethodText = encode_entities($expMethodText);  # this HTML-encodes everything
     $expMethodText = cleanWhitespace($expMethodText);
 
     $methodText .= "<br><br>$expMethodText";
 }
 
 $outstring .= "\t$methodText";
 
 $outstring .= "\n";
 print "$outstring";
 
 open($seqFile, ">>", "MaveDBSeqFile.fa");
-#metadata ->  targetGenes (array) -> name, targetSequence -> sequence
 $targetGenesRef = $$metadata_ref{"targetGenes"};
 foreach $targetGene_ref (@$targetGenesRef)
 {
     $targetGene_name = $$targetGene_ref{"name"};
     $targetSeq_ref = $$targetGene_ref{"targetSequence"};
     $sequence = $$targetSeq_ref{"sequence"};
-    print $seqFile ">$targetGene_name $urnID\n";
+    print $seqFile ">$urnID $targetGene_name\n";
     print $seqFile "$sequence\n";
 }
 close($seqFile);