b4fd34e71f94b2847253ef2b20cd3d8aa2acaa24 jcasper Mon May 19 10:31:30 2025 -0700 Fix a few issues with MaveDB json parse (score bounds, item count display for cells with multiple values. Also committing trackDb and doc for a native version, refs #31812 diff --git src/hg/oneShot/parseMave/parseMave.pl src/hg/oneShot/parseMave/parseMave.pl index 971d746dfbf..5d7b92ee676 100644 --- src/hg/oneShot/parseMave/parseMave.pl +++ src/hg/oneShot/parseMave/parseMave.pl @@ -23,38 +23,43 @@ $jsonBlob = ""; while ($line = <STDIN>) { $jsonBlob .= $line; } if ($jsonBlob !~ m/hgvs\.p/) { # no point continuing exit(0); } $jsonObj = decode_json($jsonBlob); +# Start indexing into the json object to find all of the mapped scores +# $mapped_scores_ref = $$jsonObj{"mapped_scores"}; $itemStart = -1; $itemEnd = -1; $chrom = ""; $urnID = ""; -%exonList = (); +%exonList = (); # key: "A,B,C", where A and B are the start and end coordinates of the exon in BED coordinates. + # C is the HGVS-like p string for the variant + # value: "$thisID $pString: $scoreTrunc", being the id, HGVS-like p. string, and score + # The %exonDups list is keyed the same way, containing overflow items for each exon (if any) %txStarts = (); %txSizes = (); foreach $record_ref (@$mapped_scores_ref) { # record_ref should be a hash # We'll want "mavedb_id" (example "urn:mavedb:00000097-s-1#2") # "score" (example -0.0757349427170253) # "post_mapped" -> "expressions" -> "value" (example "NP_009225.1:p.Asn1774Thr") # ASSUMING that expressions -> syntax is hgvs.p. # NOTE: expressions is an array, so you'll have to loop through the contents to find the hgvs $id = $$record_ref{"mavedb_id"}; $score = $$record_ref{"score"}; next if ($score eq ""); @@ -176,31 +181,30 @@ { # move the start up and trim the size down $thisSizes[$k] -= ($thisStart - $thisStarts[$k]); $thisStarts[$k] = $thisStart; } if ($thisStarts[$k] + $thisSizes[$k] > $thisEnd) { # trim back the size to fit $thisSizes[$k] = $thisEnd - $thisStarts[$k]; } } for ($k=0; $k<@thisStarts; $k++) { next if $thisStarts[$k] == -1; - #$posString = $f[1] . "," . $f[2]; $posString = $thisStarts[$k] . "," . ($thisStarts[$k]+$thisSizes[$k]); if ($hgvs =~ m/:(.*)/) { $pString = $1; $posString .= ",$pString"; $scoreTrunc = sprintf("%.4f", $score); if (defined $exonList{$posString}) { $exonDups{$posString} .= "<br>$thisID $pString: $scoreTrunc"; } else { $exonList{$posString} = "$thisID $pString: $scoreTrunc"; } } @@ -225,65 +229,61 @@ # need to manually sort the exons at this point # a bit of a hack here, but numeric sort should sort by the start positions of each exon since # that's the start of each key in %exonList. #@sorted_exons = sort {$a <=> $b} (keys %exonList); # so %coords is actually the list of distinct mappings from partial HGVS terms to bed coordinates # That makes its size the number of exons that we need. We can sanity check the ranges in that # list if we want. # # Okaaaay, so that's no longer accurate. Some of those codon bed mappings may have resulted in multiple # "exons" because they were split across intron boundaries. We can recover what we need # from the keys of %exonList instead. # I WAS using coords because it imposed uniqueness on the exon coordinates. So I have to reimplement that. -#foreach $key (keys %coords) %uniquify = (); foreach $key (keys %exonList) { -# $bedStr = $coords{$key}; -# $bedStr =~ m/^\S+\s+(\d+)\s+(\d+)/; - $bedStr = $key; # $exonList{$key}; + $bedStr = $key; $bedStr =~ m/^(\d+),(\d+),/; $st_en = "$1,$2"; if (!defined $uniquify{$st_en}) { $uniquify{$st_en} = 1; $start_end[@start_end] = $st_en; } } @sorted_pos = sort {$a <=> $b} (@start_end); $sizeString = ""; $exonStarts = ""; $lasten = ""; $lastst = ""; %exonIxs = (); for ($i=0; $i<@sorted_pos; $i++) { $sorted_pos[$i] =~ m/(\d+),(\d+)/; ($st, $en) = ($1, $2); if ($i > 0 && $st < $lasten) { # something went horribly wrong - one exon started before the previous one ended. # Track down the previous coordinates so we can report both sets. # inefficient, but at least we don't do it often foreach $key (keys %uniquify) { $bedStr = $uniquify{$key}; - #$bedStr =~ m/^\S+\s+(\d+)\s+(\d+)/; $bedStr =~ m/^(\d+),(\d+)/; $st_en = "$1,$2"; if ($st_en eq "$lastst,$lasten") { $prevbad = $key; } if ($st_en eq "$st,$en") { $thisbad = $key; } } die "One item started at $st ($thisbad) after another ended at $lasten ($prevbad)"; } $width = $en - $st; $sizeString .= "$width,"; @@ -417,31 +417,34 @@ { die "Don't have an index for AA: -$aaLetter- (from $destAA)"; } $aaIx = $aaIndex{$aaLetter}; $ix = $exonIx + $aaIx*scalar(@sorted_pos); if ($destAA ne $sourceAA) { $exonScores[$ix] = $score; $exonLabels[$ix] = $label; if (defined $exonDups{$exon}) { $hasDupe = 1; if ($exonScores[$ix] !~ m/\|/) - { $exonScores[$ix] .= "|"; } + { + $exonCount = 1 + scalar(() = $exonDups{$exon} =~ m/<br>/g); # awkward, but gives the right count + $exonScores[$ix] .= "|$exonCount"; + } $exonLabels[$ix] .= $exonDups{$exon}; } } else { # this is synonymous $exonScores[$ix] = "#FFD700"; $exonLabels[$ix] = "Wild type"; $aaIx = $aaIndex{"="}; $ix = $exonIx + $aaIx*scalar(@sorted_pos); $exonScores[$ix] = $score; $exonLabels[$ix] = $label; @@ -486,87 +489,85 @@ } if (scalar(@exonScores) != scalar(@exonLabels)) { die "mismatch in scores and labels lengths, compare " . scalar(@exonScores) . " with " . scalar(@exonLabels); } $outstring .= "$sizeString\t$exonStarts"; # that's the end of the basic bed fields. Now for the extra heatmap-specific fields $outstring .= "\t$rowCount\t$rowLabels\t"; # now write out the lower bound, midpoint, and upper bound for the track # These should be in ascending order -$outstring .= join ",", (min(@exonScores), (min(@exonScores)+max(@exonScores))/2, max(@exonScores)); +@justScores = grep (/^[^#]/, @exonScores); # if all positive or negative, the #color values and "" would screw this up +$outstring .= join ",", (min(@justScores), (min(@justScores)+max(@justScores))/2, max(@justScores)); $outstring .= ",\t"; # now the colors for each block $outstring .= join ",", ("blue", "silver", "red"); $outstring .= ",\t"; # now for the cell scores themselves, along with the cell labels $outstring .= join ",", @exonScores; $outstring .= ",\t"; $outstring .= join ",", @exonLabels; $outstring .= ",\t"; $metadata_ref = $$jsonObj{"metadata"}; $shortDescription = $$metadata_ref{"shortDescription"}; $shortDescription =~ s/\\u(.{4})/chr(hex($1))/ge; # thanks to https://www.perlmonks.org/?node_id=796799 # this deals with unicode characters. $shortDescription = encode_entities($shortDescription); # this HTML-encodes everything $shortDescription = cleanWhitespace($shortDescription); -#$shortDescription =~ s/\s+/ /g; $outstring .= "$shortDescription"; # everything above was mandatory fields for a heatmap (including a string for the heatmap title area) # What follows next is metadata fields for hgc display $url = "<a href=\"https://mavedb.org/score-sets/urn:mavedb:ID\" target=_blank>ID</a>"; $url =~ s/ID/$urnID/g; $outstring .= "\t$url"; $abstract = $$metadata_ref{"abstractText"}; $abstract =~ s/\\u(.{4})/chr(hex($1))/ge; $abstract = encode_entities($abstract); # this HTML-encodes everything $abstract = cleanWhitespace($abstract); -#$abstract =~ s/\s+/ /g; $outstring .= "\t$abstract"; $methodText = $$metadata_ref{"methodText"}; $methodText =~ s/\\u(.{4})/chr(hex($1))/ge; $methodText = encode_entities($methodText); # this HTML-encodes everything $methodText = cleanWhitespace($methodText); $expRef = $$metadata_ref{"experiment"}; $expMethodText = $$expRef{"methodText"}; if ($expMethodText =~ m/\w+/) { $expMethodText =~ s/\\u(.{4})/chr(hex($1))/ge; $expMethodText = encode_entities($expMethodText); # this HTML-encodes everything $expMethodText = cleanWhitespace($expMethodText); $methodText .= "<br><br>$expMethodText"; } $outstring .= "\t$methodText"; $outstring .= "\n"; print "$outstring"; open($seqFile, ">>", "MaveDBSeqFile.fa"); -#metadata -> targetGenes (array) -> name, targetSequence -> sequence $targetGenesRef = $$metadata_ref{"targetGenes"}; foreach $targetGene_ref (@$targetGenesRef) { $targetGene_name = $$targetGene_ref{"name"}; $targetSeq_ref = $$targetGene_ref{"targetSequence"}; $sequence = $$targetSeq_ref{"sequence"}; - print $seqFile ">$targetGene_name $urnID\n"; + print $seqFile ">$urnID $targetGene_name\n"; print $seqFile "$sequence\n"; } close($seqFile);