4343f704c3b83b37d1c6784504e19463561d77ad jcasper Wed Apr 16 08:53:12 2025 -0700 Some command-line tools useful for parsing MaveDB data, refs #31812 diff --git src/hg/oneShot/parseMave/parseMave.pl src/hg/oneShot/parseMave/parseMave.pl new file mode 100644 index 00000000000..971d746dfbf --- /dev/null +++ src/hg/oneShot/parseMave/parseMave.pl @@ -0,0 +1,572 @@ +# parseMave.pl - parse the dcd_mappings json files provided by MaveDB into a series of bed records, +# where each record is a heatmap of the data for one experiment. Also writes out a separate fasta +# file containing the reference sequence for each experiment. + +use JSON::PP; +use HTML::Entities; +use Encode; +use List::Util qw(min max); + + +sub cleanWhitespace($) +# There seems to be a variety of weird newline stuff, depending on who curated the record. There's a few +# dos newlines in records somewhere, others have "\r\n" (like that actual string, not the characters). +# Going to try to replace them all with
here, and reduce all other whitespace to single spaces. +{ + my $str = shift; + $str =~ s/\\r\\n/
/g; + $str =~ s/\\n/
/g; + $str =~ s/(\012|\015)+/
/g; + $str =~ s/\s+/ /g; + return $str; +} + +$jsonBlob = ""; +while ($line = ) +{ + $jsonBlob .= $line; +} + +if ($jsonBlob !~ m/hgvs\.p/) +{ + # no point continuing + exit(0); +} + +$jsonObj = decode_json($jsonBlob); + +$mapped_scores_ref = $$jsonObj{"mapped_scores"}; + +$itemStart = -1; +$itemEnd = -1; +$chrom = ""; +$urnID = ""; + +%exonList = (); +%txStarts = (); +%txSizes = (); + +foreach $record_ref (@$mapped_scores_ref) +{ + # record_ref should be a hash + # We'll want "mavedb_id" (example "urn:mavedb:00000097-s-1#2") + # "score" (example -0.0757349427170253) + # "post_mapped" -> "expressions" -> "value" (example "NP_009225.1:p.Asn1774Thr") + # ASSUMING that expressions -> syntax is hgvs.p. + # NOTE: expressions is an array, so you'll have to loop through the contents to find the hgvs + + $id = $$record_ref{"mavedb_id"}; + $score = $$record_ref{"score"}; + next if ($score eq ""); + $post = $$record_ref{"post_mapped"}; + $type = $$post{"type"}; + + next if ($type eq "Haplotype"); # we're not set up to handle this right now + next if ($type ne "VariationDescriptor"); # we're not set up to handle anything but VariationDescriptor right now + + $expr = $$post{"expressions"}; + $hgvs = ""; + foreach $expr_i (@$expr) + { + if ($$expr_i{"syntax"} eq "hgvs.p") + { + $hgvs = $$expr_i{"value"}; + } + } + next if ($hgvs eq ""); + + # There are some HGVS term types (or HGVS-like terms) that we're skipping over for now. + # NP_003336.1:p.159delinsHis is causing a core dump + # NP_003336.1:p.Ser158_159delinsSerSer is being mapped to + # chr16 1324787 1324790 Ser158_159delinsSerSer + # Seems like that should be more than one codon wide. + + if ($hgvs =~ m/(NP_[^:]+):p\.[a-zA-Z]{3}\d+[a-zA-Z]{3}$/) + { $ncbiProt = $1; } + elsif ($hgvs =~ m/(NP_[^:]+):p\.[a-zA-Z]{3}\d+=$/) + { $ncbiProt = $1; } + else + { next; } + + $partialHgvs = $hgvs; + $partialHgvs =~ s/(\d+)\D+$/$1/; # strip off the destination AA; we only care about the genome mapping + if (!defined $coords{$partialHgvs}) + { + # Using another oneShot tool to convert HGVS terms to BED records + $bedMapping = `~/bin/x86_64/hgvsToBed hg38 "$hgvs"` or die "hgvsToBed failed on $hgvs; have you built it?"; + $coords{$partialHgvs} = $bedMapping; + } + else + { + $bedMapping = $coords{$partialHgvs}; + } + next if ($bedMapping eq ""); # if we can't map it, it's useless. Discard + + if ($bedMapping !~ m/^\S+\s+\d+\s+\d+/) + { + # something weird happened. Panic. + die "Something went wrong, hgvs.p $hgvs was mapped to $bedMapping"; + } + + @f = split /\s+/, $bedMapping; + if ($chrom eq "") + { + $chrom = $f[0]; + } + elsif ($chrom ne $f[0]) + { + die "Whoa, multiple chromosomes found in file (compare $chrom, " . $f[0] . ")"; + } + + if ($itemStart == -1) + { $itemStart = $f[1]; } + elsif ($itemStart > $f[1]) + { $itemStart = $f[1]; } + if ($itemEnd == -1) + { $itemEnd = $f[2]; } + elsif ($itemEnd < $f[2]) + { $itemEnd = $f[2]; } + + $oldUrnID = $urnID; + if ($id =~ m/^urn:mavedb:([^#]+)(#\d+)/) + { + ($urnID, $thisID) = ($1, $2); + if ($oldUrnID ne "" && $oldUrnID ne $urnID) + { + die "I guess we change urn IDs mid-file now? $oldUrnID vs $urnID"; + } + } + else + { + die "Unexpected id string: $id did not contain an urn:mavedb identifier"; + } + + + # Right about here, we want to parse f[1] and f[2], which are the start and end of the codon + # We want to compare that against the exon coordinates against the bounds from the PSL record for + # the transcript. Which means we need to get that. + if (!defined $txStarts{$ncbiProt}) + { + $st_size = `hgsql hg38 -Ne 'select psl.tStarts, psl.blockSizes from ncbiRefSeqPsl psl join ncbiRefSeqLink link on psl.qName = link.mrnaAcc where link.protAcc = "$ncbiProt"'` or die "No exons for $ncbiProt in ncbiRefSeqLink/ncbiRefSeqPsl"; + chomp $st_size; + @g = split /\t/, $st_size; + $txStarts{$NM} = $g[0]; + $txSizes{$NM} = $g[1]; + } + @thisStarts = split /,/, $txStarts{$NM}; + @thisSizes = split /,/, $txSizes{$NM}; + $thisStart = $f[1]; + $thisEnd = $f[2]; + # hgvsToBed mapped us to this position using the same record that we pulled txStarts and txSizes from, + # but it ignored the exon structure. We need to build (maybe) this one "exon" into multiple exons. + # Going to do this kinda in reverse, actually. Loop through the transcript's exons. Either delete + # them if they have no overlap with the mapped range, or else truncate them to it (by adjusting + # sizes and maybe starts. + # Except actual splicing would screw with array offsets, and we don't need that kind of headache. + # So we'll just set thisStarts[$k] = -1 and filter them out subsequently. + + for ($k=0; $k<@thisStarts; $k++) + { + if ($thisStart > $thisStarts[$k] + $thisSizes[$k] || $thisEnd < $thisStarts[$k]) + { + $thisStarts[$k] = -1; + next; + } + if ($thisStarts[$k] < $thisStart) + { + # move the start up and trim the size down + $thisSizes[$k] -= ($thisStart - $thisStarts[$k]); + $thisStarts[$k] = $thisStart; + } + if ($thisStarts[$k] + $thisSizes[$k] > $thisEnd) + { + # trim back the size to fit + $thisSizes[$k] = $thisEnd - $thisStarts[$k]; + } + } + + for ($k=0; $k<@thisStarts; $k++) + { + next if $thisStarts[$k] == -1; + #$posString = $f[1] . "," . $f[2]; + $posString = $thisStarts[$k] . "," . ($thisStarts[$k]+$thisSizes[$k]); + if ($hgvs =~ m/:(.*)/) + { + $pString = $1; + $posString .= ",$pString"; + $scoreTrunc = sprintf("%.4f", $score); + if (defined $exonList{$posString}) + { + $exonDups{$posString} .= "
$thisID $pString: $scoreTrunc"; + } + else + { + $exonList{$posString} = "$thisID $pString: $scoreTrunc"; + } + } + else + { + die "Huh, hgvs string didn't have a p term? $hgvs"; + } + } +} + +# let's check that something actually happened, and skip out if the file was functionally +# empty (e.g. it was only haplotypes) +if (scalar(keys %exonList) == 0) +{ + exit(0); +} + + +# At this point, %exonList takes position strings (of exons) to the label-type info for them + + +# need to manually sort the exons at this point +# a bit of a hack here, but numeric sort should sort by the start positions of each exon since +# that's the start of each key in %exonList. + +#@sorted_exons = sort {$a <=> $b} (keys %exonList); + +# so %coords is actually the list of distinct mappings from partial HGVS terms to bed coordinates +# That makes its size the number of exons that we need. We can sanity check the ranges in that +# list if we want. +# +# Okaaaay, so that's no longer accurate. Some of those codon bed mappings may have resulted in multiple +# "exons" because they were split across intron boundaries. We can recover what we need +# from the keys of %exonList instead. +# I WAS using coords because it imposed uniqueness on the exon coordinates. So I have to reimplement that. + +#foreach $key (keys %coords) +%uniquify = (); +foreach $key (keys %exonList) +{ +# $bedStr = $coords{$key}; +# $bedStr =~ m/^\S+\s+(\d+)\s+(\d+)/; + $bedStr = $key; # $exonList{$key}; + $bedStr =~ m/^(\d+),(\d+),/; + $st_en = "$1,$2"; + if (!defined $uniquify{$st_en}) + { + $uniquify{$st_en} = 1; + $start_end[@start_end] = $st_en; + } +} +@sorted_pos = sort {$a <=> $b} (@start_end); +$sizeString = ""; +$exonStarts = ""; +$lasten = ""; +$lastst = ""; +%exonIxs = (); + +for ($i=0; $i<@sorted_pos; $i++) +{ + $sorted_pos[$i] =~ m/(\d+),(\d+)/; + ($st, $en) = ($1, $2); + if ($i > 0 && $st < $lasten) + { + # something went horribly wrong - one exon started before the previous one ended. + # Track down the previous coordinates so we can report both sets. + # inefficient, but at least we don't do it often + foreach $key (keys %uniquify) + { + $bedStr = $uniquify{$key}; + #$bedStr =~ m/^\S+\s+(\d+)\s+(\d+)/; + $bedStr =~ m/^(\d+),(\d+)/; + $st_en = "$1,$2"; + if ($st_en eq "$lastst,$lasten") + { + $prevbad = $key; + } + if ($st_en eq "$st,$en") + { + $thisbad = $key; + } + } + die "One item started at $st ($thisbad) after another ended at $lasten ($prevbad)"; + } + $width = $en - $st; + $sizeString .= "$width,"; + $exonStarts .= ($st - $itemStart) . ","; + + $exonIxs{"$st,$en"} = $i; + + $lasten = $en; + $lastst = $st; +} + + +$outstring = "$chrom\t$itemStart\t$itemEnd\t$urnID\t1000\t+\t$itemStart\t$itemEnd\t0\t"; +$outstring .= scalar(@sorted_pos) . "\t"; + +################################################## +# Now to assemble the score and label arrays + +%aaNames = ( + "Ala" => 'A', + "Arg" => 'R', + "Asn" => 'N', + "Asp" => 'D', + "Cys" => 'C', + "Glu" => 'E', + "Gln" => 'Q', + "Gly" => 'G', + "His" => 'H', + "Ile" => 'I', + "Leu" => 'L', + "Lys" => 'K', + "Met" => 'M', + "Phe" => 'F', + "Pro" => 'P', + "Ser" => 'S', + "Thr" => 'T', + "Trp" => 'W', + "Tyr" => 'Y', + "Val" => 'V', + "Ter" => '*', + "del" => '-' +); + +# This takes a single-letter code to the index of the row in the heatmap it appears in +%aaIndex = ( + 'A' => 0, + 'V' => 1, + 'L' => 2, + 'I' => 3, + 'M' => 4, + 'F' => 5, + 'Y' => 6, + 'W' => 7, + 'R' => 8, + 'H' => 9, + 'K' => 10, + 'D' => 11, + 'E' => 12, + 'S' => 13, + 'T' => 14, + 'N' => 15, + 'Q' => 16, + 'G' => 17, + 'C' => 18, + 'P' => 19, + '-' => 20, + '*' => 21, + '=' => 22 +); + +# This should probably be replaced with the keys of the hash, but we want to make sure they +# appear in the right order! +$rowLabels = "A,V,L,I,M,F,Y,W,R,H,K,D,E,S,T,N,Q,G,C,P,-,*,="; + +$rowCount = scalar(keys %aaIndex); + +@exonScores = (); +for ($i=0; $i < scalar(@sorted_pos)*$rowCount; $i++) +{ + $exonScores[$i] = ""; + $exonLabels[$i] = ""; +} + + +$hasDup = 0; + +foreach $exon (keys %exonList) +{ + $exon =~ m/^(\d+),(\d+)/; + ($exonStart, $exonEnd) = ($1, $2); + if (defined $exonIxs{"$exonStart,$exonEnd"}) + { + $exonIx = $exonIxs{"$exonStart,$exonEnd"}; + } + else + { + # shouldn't ever happen, since we just populated exonIxs from the same list + # that filled exonList + die "Unexpected exon in the bagging area - $exonStart,$exonEnd vs " . (join ";", keys %exonIxs); + } + + # now I need to populate the exonScores and exonLabels matrices. There's a fixed order for the destination AA, so + # Really I just have to calculate the position in the list as $i + AA_ix*exon_count + + $label = $exonList{$exon}; + if ($label =~ m/#\d+ (\S+): (\S+)/) + { + ($hgvs, $score) = ($1, $2); + } + else + { + die "unexpected label failure: $label"; + } + + if ($hgvs =~ m/\.(\w{3})\d+=$/) + { + $destAA = $1; + $sourceAA = $destAA; + } + elsif ($hgvs =~ m/(\w{3})\d+(\w{3})$/) + { + $destAA = $2; + $sourceAA = $1; + } + else + { + die "unexpected failure to find dest AA: $hgvs"; + } + $aaLetter = $aaNames{$destAA}; + if (!defined $aaIndex{$aaLetter}) + { + die "Don't have an index for AA: -$aaLetter- (from $destAA)"; + } + $aaIx = $aaIndex{$aaLetter}; + $ix = $exonIx + $aaIx*scalar(@sorted_pos); + + if ($destAA ne $sourceAA) + { + $exonScores[$ix] = $score; + $exonLabels[$ix] = $label; + + if (defined $exonDups{$exon}) + { + $hasDupe = 1; + if ($exonScores[$ix] !~ m/\|/) + { $exonScores[$ix] .= "|"; } + $exonLabels[$ix] .= $exonDups{$exon}; + } + } + else + { + # this is synonymous + $exonScores[$ix] = "#FFD700"; + $exonLabels[$ix] = "Wild type"; + + $aaIx = $aaIndex{"="}; + $ix = $exonIx + $aaIx*scalar(@sorted_pos); + + $exonScores[$ix] = $score; + $exonLabels[$ix] = $label; + + if (defined $exonDups{$exon}) + { + $hasDupe = 1; + if ($exonScores[$ix] !~ m/|/) + { + $exonCount = 1 + scalar(() = $exonDups{$exon} =~ m/
/g); # awkward, but gives the right count + $exonScores[$ix] .= "|$exonCount"; + } + $exonLabels[$ix] .= $exonDups{$exon}; + } + + } + +} + +if ($hasDupe) +{ + # just a san check, also sometimes useful for noting which datasets have dups for investigation + # of whether that bit of mouseover code is working properly + print STDERR "This one has dups\n"; +} + +# at this point, we have +# $sizeString : the exon sizes string +# $exonStarts: the exon starts string +# 21 (we know it's the number of rows) +# "A,R,N,D,C,E,Q,G,H,I,L,K,M,F,P,S,T,W,Y,V,*" (this is the row labels) +# @exonScores +# @exonLabels +# ID, where ID needs to be replaced with $urnID +# +# All of this is getting appended to $outstring + + +# sanity check +if (scalar(@exonScores) != $rowCount * scalar(@sorted_pos)) +{ + die "exonScores list is wrong, compare " . scalar(@exonScores) . " with " . ($rowCount*scalar(@sorted_pos)); +} +if (scalar(@exonScores) != scalar(@exonLabels)) +{ + die "mismatch in scores and labels lengths, compare " . scalar(@exonScores) . " with " . scalar(@exonLabels); +} + +$outstring .= "$sizeString\t$exonStarts"; + +# that's the end of the basic bed fields. Now for the extra heatmap-specific fields + +$outstring .= "\t$rowCount\t$rowLabels\t"; + +# now write out the lower bound, midpoint, and upper bound for the track +# These should be in ascending order + +$outstring .= join ",", (min(@exonScores), (min(@exonScores)+max(@exonScores))/2, max(@exonScores)); +$outstring .= ",\t"; + +# now the colors for each block + +$outstring .= join ",", ("blue", "silver", "red"); +$outstring .= ",\t"; + +# now for the cell scores themselves, along with the cell labels + +$outstring .= join ",", @exonScores; +$outstring .= ",\t"; +$outstring .= join ",", @exonLabels; +$outstring .= ",\t"; + +$metadata_ref = $$jsonObj{"metadata"}; +$shortDescription = $$metadata_ref{"shortDescription"}; +$shortDescription =~ s/\\u(.{4})/chr(hex($1))/ge; # thanks to https://www.perlmonks.org/?node_id=796799 + # this deals with unicode characters. +$shortDescription = encode_entities($shortDescription); # this HTML-encodes everything +$shortDescription = cleanWhitespace($shortDescription); +#$shortDescription =~ s/\s+/ /g; +$outstring .= "$shortDescription"; + +# everything above was mandatory fields for a heatmap (including a string for the heatmap title area) +# What follows next is metadata fields for hgc display + +$url = "ID"; +$url =~ s/ID/$urnID/g; +$outstring .= "\t$url"; + +$abstract = $$metadata_ref{"abstractText"}; +$abstract =~ s/\\u(.{4})/chr(hex($1))/ge; +$abstract = encode_entities($abstract); # this HTML-encodes everything +$abstract = cleanWhitespace($abstract); +#$abstract =~ s/\s+/ /g; +$outstring .= "\t$abstract"; + + +$methodText = $$metadata_ref{"methodText"}; +$methodText =~ s/\\u(.{4})/chr(hex($1))/ge; +$methodText = encode_entities($methodText); # this HTML-encodes everything +$methodText = cleanWhitespace($methodText); + +$expRef = $$metadata_ref{"experiment"}; +$expMethodText = $$expRef{"methodText"}; +if ($expMethodText =~ m/\w+/) +{ + $expMethodText =~ s/\\u(.{4})/chr(hex($1))/ge; + $expMethodText = encode_entities($expMethodText); # this HTML-encodes everything + $expMethodText = cleanWhitespace($expMethodText); + + $methodText .= "

$expMethodText"; +} + +$outstring .= "\t$methodText"; + +$outstring .= "\n"; +print "$outstring"; + +open($seqFile, ">>", "MaveDBSeqFile.fa"); +#metadata -> targetGenes (array) -> name, targetSequence -> sequence +$targetGenesRef = $$metadata_ref{"targetGenes"}; +foreach $targetGene_ref (@$targetGenesRef) +{ + $targetGene_name = $$targetGene_ref{"name"}; + $targetSeq_ref = $$targetGene_ref{"targetSequence"}; + $sequence = $$targetSeq_ref{"sequence"}; + print $seqFile ">$targetGene_name $urnID\n"; + print $seqFile "$sequence\n"; +} +close($seqFile);