b4fd34e71f94b2847253ef2b20cd3d8aa2acaa24
jcasper
Mon May 19 10:31:30 2025 -0700
Fix a few issues with MaveDB json parse (score bounds, item count display for
cells with multiple values). Also committing trackDb and doc for a native version, refs #31812
diff --git src/hg/oneShot/parseMave/parseMave.pl src/hg/oneShot/parseMave/parseMave.pl
index 971d746dfbf..5d7b92ee676 100644
--- src/hg/oneShot/parseMave/parseMave.pl
+++ src/hg/oneShot/parseMave/parseMave.pl
@@ -1,572 +1,573 @@
# parseMave.pl - parse the dcd_mappings json files provided by MaveDB into a series of bed records,
# where each record is a heatmap of the data for one experiment. Also writes out a separate fasta
# file containing the reference sequence for each experiment.
use JSON::PP;
use HTML::Entities;
use Encode;
use List::Util qw(min max);
sub cleanWhitespace($)
# There seems to be a variety of weird newline handling, depending on who curated the record. A few
# records have DOS-style newlines, others have "\r\n" (the literal four-character string, not actual newline characters).
# Going to try to replace them all with <br> here, and reduce all other whitespace to single spaces.
{
my $str = shift;
$str =~ s/\\r\\n/<br>/g;
$str =~ s/\\n/<br>/g;
$str =~ s/(\012|\015)+/<br>/g;
$str =~ s/\s+/ /g;
return $str;
}
$jsonBlob = "";
while ($line = <>)
{
$jsonBlob .= $line;
}
if ($jsonBlob !~ m/hgvs\.p/)
{
# no point continuing
exit(0);
}
$jsonObj = decode_json($jsonBlob);
+# Start indexing into the json object to find all of the mapped scores
+#
$mapped_scores_ref = $$jsonObj{"mapped_scores"};
$itemStart = -1;
$itemEnd = -1;
$chrom = "";
$urnID = "";
-%exonList = ();
+%exonList = (); # key: "A,B,C", where A and B are the start and end coordinates of the exon in BED coordinates.
+ # C is the HGVS-like p string for the variant
+ # value: "$thisID $pString: $scoreTrunc", being the id, HGVS-like p. string, and score
+ # The %exonDups list is keyed the same way, containing overflow items for each exon (if any)
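+# Illustrative example (made-up combination of the sample values mentioned in the loop below):
+#   key "1324787,1324790,p.Asn1774Thr" -> value "#2 p.Asn1774Thr: -0.0757"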
%txStarts = ();
%txSizes = ();
foreach $record_ref (@$mapped_scores_ref)
{
# record_ref should be a hash
# We'll want "mavedb_id" (example "urn:mavedb:00000097-s-1#2")
# "score" (example -0.0757349427170253)
# "post_mapped" -> "expressions" -> "value" (example "NP_009225.1:p.Asn1774Thr")
# ASSUMING that expressions -> syntax is hgvs.p.
# NOTE: expressions is an array, so you'll have to loop through the contents to find the hgvs
$id = $$record_ref{"mavedb_id"};
$score = $$record_ref{"score"};
next if ($score eq "");
$post = $$record_ref{"post_mapped"};
$type = $$post{"type"};
next if ($type eq "Haplotype"); # we're not set up to handle this right now
next if ($type ne "VariationDescriptor"); # we're not set up to handle anything but VariationDescriptor right now
$expr = $$post{"expressions"};
$hgvs = "";
foreach $expr_i (@$expr)
{
if ($$expr_i{"syntax"} eq "hgvs.p")
{
$hgvs = $$expr_i{"value"};
}
}
next if ($hgvs eq "");
# There are some HGVS term types (or HGVS-like terms) that we're skipping over for now.
# NP_003336.1:p.159delinsHis is causing a core dump
# NP_003336.1:p.Ser158_159delinsSerSer is being mapped to
# chr16 1324787 1324790 Ser158_159delinsSerSer
# Seems like that should be more than one codon wide.
if ($hgvs =~ m/(NP_[^:]+):p\.[a-zA-Z]{3}\d+[a-zA-Z]{3}$/)
{ $ncbiProt = $1; }
elsif ($hgvs =~ m/(NP_[^:]+):p\.[a-zA-Z]{3}\d+=$/)
{ $ncbiProt = $1; }
else
{ next; }
$partialHgvs = $hgvs;
$partialHgvs =~ s/(\d+)\D+$/$1/; # strip off the destination AA; we only care about the genome mapping
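# e.g. "NP_009225.1:p.Asn1774Thr" is cached under "NP_009225.1:p.Asn1774"; any other variant at that codon
# reuses the same genomic mapping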
if (!defined $coords{$partialHgvs})
{
# Using another oneShot tool to convert HGVS terms to BED records
$bedMapping = `~/bin/x86_64/hgvsToBed hg38 "$hgvs"` or die "hgvsToBed failed on $hgvs; have you built it?";
$coords{$partialHgvs} = $bedMapping;
}
else
{
$bedMapping = $coords{$partialHgvs};
}
next if ($bedMapping eq ""); # if we can't map it, it's useless. Discard
if ($bedMapping !~ m/^\S+\s+\d+\s+\d+/)
{
# something weird happened. Panic.
die "Something went wrong, hgvs.p $hgvs was mapped to $bedMapping";
}
@f = split /\s+/, $bedMapping;
if ($chrom eq "")
{
$chrom = $f[0];
}
elsif ($chrom ne $f[0])
{
die "Whoa, multiple chromosomes found in file (compare $chrom, " . $f[0] . ")";
}
if ($itemStart == -1)
{ $itemStart = $f[1]; }
elsif ($itemStart > $f[1])
{ $itemStart = $f[1]; }
if ($itemEnd == -1)
{ $itemEnd = $f[2]; }
elsif ($itemEnd < $f[2])
{ $itemEnd = $f[2]; }
$oldUrnID = $urnID;
if ($id =~ m/^urn:mavedb:([^#]+)(#\d+)/)
{
($urnID, $thisID) = ($1, $2);
if ($oldUrnID ne "" && $oldUrnID ne $urnID)
{
die "I guess we change urn IDs mid-file now? $oldUrnID vs $urnID";
}
}
else
{
die "Unexpected id string: $id did not contain an urn:mavedb identifier";
}
# Right about here, we want to parse f[1] and f[2], which are the start and end of the codon
# We want to compare that against the exon coordinates against the bounds from the PSL record for
# the transcript. Which means we need to get that.
if (!defined $txStarts{$ncbiProt})
{
$st_size = `hgsql hg38 -Ne 'select psl.tStarts, psl.blockSizes from ncbiRefSeqPsl psl join ncbiRefSeqLink link on psl.qName = link.mrnaAcc where link.protAcc = "$ncbiProt"'` or die "No exons for $ncbiProt in ncbiRefSeqLink/ncbiRefSeqPsl";
chomp $st_size;
@g = split /\t/, $st_size;
$txStarts{$ncbiProt} = $g[0];
$txSizes{$ncbiProt} = $g[1];
}
@thisStarts = split /,/, $txStarts{$ncbiProt};
@thisSizes = split /,/, $txSizes{$ncbiProt};
$thisStart = $f[1];
$thisEnd = $f[2];
# hgvsToBed mapped us to this position using the same record that we pulled txStarts and txSizes from,
# but it ignored the exon structure. We need to build (maybe) this one "exon" into multiple exons.
# Going to do this kinda in reverse, actually. Loop through the transcript's exons. Either delete
# them if they have no overlap with the mapped range, or else truncate them to it (by adjusting
# sizes and maybe starts).
# Except actual splicing would screw with array offsets, and we don't need that kind of headache.
# So we'll just set thisStarts[$k] = -1 and filter them out subsequently.
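# Worked example (made-up coordinates): if hgvsToBed returned the range [1001,1072) for a codon and the
# transcript's exon blocks are [900,1002) and [1070,1200), the first block is clipped to the 1-base range
# [1001,1002) and the second to the 2-base range [1070,1072); blocks with no overlap at all get
# thisStarts[$k] set to -1 and are skipped.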
for ($k=0; $k<@thisStarts; $k++)
{
if ($thisStart > $thisStarts[$k] + $thisSizes[$k] || $thisEnd < $thisStarts[$k])
{
$thisStarts[$k] = -1;
next;
}
if ($thisStarts[$k] < $thisStart)
{
# move the start up and trim the size down
$thisSizes[$k] -= ($thisStart - $thisStarts[$k]);
$thisStarts[$k] = $thisStart;
}
if ($thisStarts[$k] + $thisSizes[$k] > $thisEnd)
{
# trim back the size to fit
$thisSizes[$k] = $thisEnd - $thisStarts[$k];
}
}
for ($k=0; $k<@thisStarts; $k++)
{
next if $thisStarts[$k] == -1;
- #$posString = $f[1] . "," . $f[2];
$posString = $thisStarts[$k] . "," . ($thisStarts[$k]+$thisSizes[$k]);
if ($hgvs =~ m/:(.*)/)
{
$pString = $1;
$posString .= ",$pString";
$scoreTrunc = sprintf("%.4f", $score);
if (defined $exonList{$posString})
{
$exonDups{$posString} .= "
$thisID $pString: $scoreTrunc";
}
else
{
$exonList{$posString} = "$thisID $pString: $scoreTrunc";
}
}
else
{
die "Huh, hgvs string didn't have a p term? $hgvs";
}
}
}
# let's check that something actually happened, and skip out if the file was functionally
# empty (e.g. it was only haplotypes)
if (scalar(keys %exonList) == 0)
{
exit(0);
}
# At this point, %exonList takes position strings (of exons) to the label-type info for them
# need to manually sort the exons at this point
# a bit of a hack here, but numeric sort should sort by the start positions of each exon since
# that's the start of each key in %exonList.
#@sorted_exons = sort {$a <=> $b} (keys %exonList);
# so %coords is actually the list of distinct mappings from partial HGVS terms to bed coordinates
# That makes its size the number of exons that we need. We can sanity check the ranges in that
# list if we want.
#
# Okaaaay, so that's no longer accurate. Some of those codon bed mappings may have resulted in multiple
# "exons" because they were split across intron boundaries. We can recover what we need
# from the keys of %exonList instead.
# I WAS using coords because it imposed uniqueness on the exon coordinates. So I have to reimplement that.
-#foreach $key (keys %coords)
%uniquify = ();
foreach $key (keys %exonList)
{
-# $bedStr = $coords{$key};
-# $bedStr =~ m/^\S+\s+(\d+)\s+(\d+)/;
- $bedStr = $key; # $exonList{$key};
+ $bedStr = $key;
$bedStr =~ m/^(\d+),(\d+),/;
$st_en = "$1,$2";
if (!defined $uniquify{$st_en})
{
$uniquify{$st_en} = 1;
$start_end[@start_end] = $st_en;
}
}
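# Note: the numeric sort below compares "start,end" strings with <=>, so Perl's string-to-number conversion
# only sees the leading digits; that orders the exons by start position, which is all we need here.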
@sorted_pos = sort {$a <=> $b} (@start_end);
$sizeString = "";
$exonStarts = "";
$lasten = "";
$lastst = "";
%exonIxs = ();
for ($i=0; $i<@sorted_pos; $i++)
{
$sorted_pos[$i] =~ m/(\d+),(\d+)/;
($st, $en) = ($1, $2);
if ($i > 0 && $st < $lasten)
{
# something went horribly wrong - one exon started before the previous one ended.
# Track down the previous coordinates so we can report both sets.
# inefficient, but at least we don't do it often
foreach $key (keys %exonList)
{
$bedStr = $key;
- #$bedStr =~ m/^\S+\s+(\d+)\s+(\d+)/;
$bedStr =~ m/^(\d+),(\d+)/;
$st_en = "$1,$2";
if ($st_en eq "$lastst,$lasten")
{
$prevbad = $key;
}
if ($st_en eq "$st,$en")
{
$thisbad = $key;
}
}
die "One item started at $st ($thisbad) after another ended at $lasten ($prevbad)";
}
$width = $en - $st;
$sizeString .= "$width,";
$exonStarts .= ($st - $itemStart) . ",";
$exonIxs{"$st,$en"} = $i;
$lasten = $en;
$lastst = $st;
}
$outstring = "$chrom\t$itemStart\t$itemEnd\t$urnID\t1000\t+\t$itemStart\t$itemEnd\t0\t";
$outstring .= scalar(@sorted_pos) . "\t";
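# That covers the fixed BED fields so far: chrom, chromStart, chromEnd, name (the urn ID), score (1000),
# strand (+), thickStart, thickEnd, itemRgb (0), and blockCount. blockSizes and blockStarts ($sizeString and
# $exonStarts) get appended further down, after the score/label matrices are built.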
##################################################
# Now to assemble the score and label arrays
%aaNames = (
"Ala" => 'A',
"Arg" => 'R',
"Asn" => 'N',
"Asp" => 'D',
"Cys" => 'C',
"Glu" => 'E',
"Gln" => 'Q',
"Gly" => 'G',
"His" => 'H',
"Ile" => 'I',
"Leu" => 'L',
"Lys" => 'K',
"Met" => 'M',
"Phe" => 'F',
"Pro" => 'P',
"Ser" => 'S',
"Thr" => 'T',
"Trp" => 'W',
"Tyr" => 'Y',
"Val" => 'V',
"Ter" => '*',
"del" => '-'
);
# This takes a single-letter code to the index of the row in the heatmap it appears in
%aaIndex = (
'A' => 0,
'V' => 1,
'L' => 2,
'I' => 3,
'M' => 4,
'F' => 5,
'Y' => 6,
'W' => 7,
'R' => 8,
'H' => 9,
'K' => 10,
'D' => 11,
'E' => 12,
'S' => 13,
'T' => 14,
'N' => 15,
'Q' => 16,
'G' => 17,
'C' => 18,
'P' => 19,
'-' => 20,
'*' => 21,
'=' => 22
);
# This should probably be replaced with the keys of the hash, but we want to make sure they
# appear in the right order!
$rowLabels = "A,V,L,I,M,F,Y,W,R,H,K,D,E,S,T,N,Q,G,C,P,-,*,=";
$rowCount = scalar(keys %aaIndex);
@exonScores = ();
for ($i=0; $i < scalar(@sorted_pos)*$rowCount; $i++)
{
$exonScores[$i] = "";
$exonLabels[$i] = "";
}
$hasDupe = 0;
foreach $exon (keys %exonList)
{
$exon =~ m/^(\d+),(\d+)/;
($exonStart, $exonEnd) = ($1, $2);
if (defined $exonIxs{"$exonStart,$exonEnd"})
{
$exonIx = $exonIxs{"$exonStart,$exonEnd"};
}
else
{
# shouldn't ever happen, since we just populated exonIxs from the same list
# that filled exonList
die "Unexpected exon in the bagging area - $exonStart,$exonEnd vs " . (join ";", keys %exonIxs);
}
# now I need to populate the exonScores and exonLabels matrices. There's a fixed order for the destination AA,
# so really I just have to calculate the position in the list as $exonIx + $aaIx*exon_count
$label = $exonList{$exon};
if ($label =~ m/#\d+ (\S+): (\S+)/)
{
($hgvs, $score) = ($1, $2);
}
else
{
die "unexpected label failure: $label";
}
if ($hgvs =~ m/\.(\w{3})\d+=$/)
{
$destAA = $1;
$sourceAA = $destAA;
}
elsif ($hgvs =~ m/(\w{3})\d+(\w{3})$/)
{
$destAA = $2;
$sourceAA = $1;
}
else
{
die "unexpected failure to find dest AA: $hgvs";
}
$aaLetter = $aaNames{$destAA};
if (!defined $aaIndex{$aaLetter})
{
die "Don't have an index for AA: -$aaLetter- (from $destAA)";
}
$aaIx = $aaIndex{$aaLetter};
$ix = $exonIx + $aaIx*scalar(@sorted_pos);
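# e.g. (hypothetical) with 10 codon columns, a variant in column 4 whose destination AA is Leu (row index 2)
# lands at cell 4 + 2*10 = 24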
if ($destAA ne $sourceAA)
{
$exonScores[$ix] = $score;
$exonLabels[$ix] = $label;
if (defined $exonDups{$exon})
{
$hasDupe = 1;
if ($exonScores[$ix] !~ m/\|/)
- { $exonScores[$ix] .= "|"; }
+ {
+ $exonCount = 1 + scalar(() = $exonDups{$exon} =~ m/<br>/g); # awkward, but gives the right count
+ $exonScores[$ix] .= "|$exonCount";
+ }
$exonLabels[$ix] .= $exonDups{$exon};
}
}
else
{
# this is synonymous
$exonScores[$ix] = "#FFD700";
$exonLabels[$ix] = "Wild type";
$aaIx = $aaIndex{"="};
$ix = $exonIx + $aaIx*scalar(@sorted_pos);
$exonScores[$ix] = $score;
$exonLabels[$ix] = $label;
if (defined $exonDups{$exon})
{
$hasDupe = 1;
if ($exonScores[$ix] !~ m/\|/)
{
$exonCount = 1 + scalar(() = $exonDups{$exon} =~ m/<br>/g); # awkward, but gives the right count
$exonScores[$ix] .= "|$exonCount";
}
$exonLabels[$ix] .= $exonDups{$exon};
}
}
}
if ($hasDupe)
{
# just a sanity check, also sometimes useful for noting which datasets have dups for investigation
# of whether that bit of mouseover code is working properly
print STDERR "This one has dups\n";
}
# at this point, we have
# $sizeString : the exon sizes string
# $exonStarts: the exon starts string
# $rowCount (the number of heatmap rows)
# $rowLabels (the row labels, in the same order as %aaIndex)
# @exonScores
# @exonLabels
# ID, where ID needs to be replaced with $urnID
#
# All of this is getting appended to $outstring
# sanity check
if (scalar(@exonScores) != $rowCount * scalar(@sorted_pos))
{
die "exonScores list is wrong, compare " . scalar(@exonScores) . " with " . ($rowCount*scalar(@sorted_pos));
}
if (scalar(@exonScores) != scalar(@exonLabels))
{
die "mismatch in scores and labels lengths, compare " . scalar(@exonScores) . " with " . scalar(@exonLabels);
}
$outstring .= "$sizeString\t$exonStarts";
# that's the end of the basic bed fields. Now for the extra heatmap-specific fields
$outstring .= "\t$rowCount\t$rowLabels\t";
# now write out the lower bound, midpoint, and upper bound for the track
# These should be in ascending order
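# e.g. (hypothetical) scores ranging from -2 to 1 give a lower bound of -2, midpoint -0.5, upper bound 1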
-$outstring .= join ",", (min(@exonScores), (min(@exonScores)+max(@exonScores))/2, max(@exonScores));
+@justScores = grep (/^[^#]/, @exonScores); # drop the "#color" cells and empty strings; they numify to 0 and would skew the bounds when all real scores are positive or all negative
+$outstring .= join ",", (min(@justScores), (min(@justScores)+max(@justScores))/2, max(@justScores));
$outstring .= ",\t";
# now the colors for each block
$outstring .= join ",", ("blue", "silver", "red");
$outstring .= ",\t";
# now for the cell scores themselves, along with the cell labels
$outstring .= join ",", @exonScores;
$outstring .= ",\t";
$outstring .= join ",", @exonLabels;
$outstring .= ",\t";
$metadata_ref = $$jsonObj{"metadata"};
$shortDescription = $$metadata_ref{"shortDescription"};
$shortDescription =~ s/\\u(.{4})/chr(hex($1))/ge; # thanks to https://www.perlmonks.org/?node_id=796799
# this deals with unicode characters.
$shortDescription = encode_entities($shortDescription); # this HTML-encodes everything
$shortDescription = cleanWhitespace($shortDescription);
-#$shortDescription =~ s/\s+/ /g;
$outstring .= "$shortDescription";
# everything above covers the mandatory fields for a heatmap (including a string for the heatmap title area)
# What follows next is metadata fields for hgc display
$url = "ID";
$url =~ s/ID/$urnID/g;
$outstring .= "\t$url";
$abstract = $$metadata_ref{"abstractText"};
$abstract =~ s/\\u(.{4})/chr(hex($1))/ge;
$abstract = encode_entities($abstract); # this HTML-encodes everything
$abstract = cleanWhitespace($abstract);
-#$abstract =~ s/\s+/ /g;
$outstring .= "\t$abstract";
$methodText = $$metadata_ref{"methodText"};
$methodText =~ s/\\u(.{4})/chr(hex($1))/ge;
$methodText = encode_entities($methodText); # this HTML-encodes everything
$methodText = cleanWhitespace($methodText);
$expRef = $$metadata_ref{"experiment"};
$expMethodText = $$expRef{"methodText"};
if ($expMethodText =~ m/\w+/)
{
$expMethodText =~ s/\\u(.{4})/chr(hex($1))/ge;
$expMethodText = encode_entities($expMethodText); # this HTML-encodes everything
$expMethodText = cleanWhitespace($expMethodText);
$methodText .= "
$expMethodText";
}
$outstring .= "\t$methodText";
$outstring .= "\n";
print "$outstring";
open($seqFile, ">>", "MaveDBSeqFile.fa") or die "Couldn't open MaveDBSeqFile.fa for append: $!";
-#metadata -> targetGenes (array) -> name, targetSequence -> sequence
$targetGenesRef = $$metadata_ref{"targetGenes"};
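# write one fasta record per target gene: a ">urnID geneName" header line followed by the target sequence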
foreach $targetGene_ref (@$targetGenesRef)
{
$targetGene_name = $$targetGene_ref{"name"};
$targetSeq_ref = $$targetGene_ref{"targetSequence"};
$sequence = $$targetSeq_ref{"sequence"};
- print $seqFile ">$targetGene_name $urnID\n";
+ print $seqFile ">$urnID $targetGene_name\n";
print $seqFile "$sequence\n";
}
close($seqFile);