5aa96722c0548d51c76a430c576d1e7823c30ef7 chmalee Wed Aug 25 11:32:44 2021 -0700 Get double sided gaps supported in building of LRG track, refs #24672 diff --git src/hg/utils/automation/parseLrgXml.pl src/hg/utils/automation/parseLrgXml.pl index 7052075..e250595 100755 --- src/hg/utils/automation/parseLrgXml.pl +++ src/hg/utils/automation/parseLrgXml.pl @@ -1,333 +1,383 @@ #!/usr/bin/env perl # DO NOT EDIT the /cluster/bin/scripts copy of this file -- # edit ~/kent/src/hg/utils/automation/parseLrgXml.pl instead. use warnings; use strict; use Encode; # Of the UTF-8 sort, not the ENCODE project. :) use XML::LibXML; use Data::Dumper; #print Data::Dumper->Dump([$dom], [qw(dom)]); my $assemblyPrefix = shift @ARGV; my $workdir = shift @ARGV; if (not defined $workdir) { $workdir = "./";} sub usage { my ($status) = @_; my $base = $0; $base =~ s/^(.*\/)?//; print STDERR " usage: $base assemblyPrefix pathToFiles Parses LRG_*.xml files in current directory into BED (LRG regions) and genePred+Cdna+Pep (transcripts in LRG coordinates.) assemblyPrefix is something like \"GRCh37\" to match GRCh37.p5 etc. If pathToFiles is present, look for pathToFiles/LRG_*.xml files. pathToFiles defaults to the current working directory. "; exit $status; } # usage my %gbAccToHg19Alt = ( gl000250 => 'chr6_apd_hap1', gl000251 => 'chr6_cox_hap2', gl000252 => 'chr6_dbb_hap3', gl000253 => 'chr6_mann_hap4', gl000254 => 'chr6_mcf_hap5', gl000255 => 'chr6_qbl_hap6', gl000256 => 'chr6_ssto_hap7', gl000257 => 'chr4_ctg9_hap1', gl000258 => 'chr17_ctg5_hap1', ); sub findAccession($$$$) { my ($annotationSets, $fixedId, $transType, $nodeId) = @_; my $transcript = ""; foreach my $s (@$annotationSets) { my $type = $s->findvalue('@type'); if ($type eq $transType) { my @transcripts = $s->findnodes($nodeId . '[@fixed_id="' . $fixedId . '"]'); foreach my $p (@transcripts) { $transcript = $p->findvalue('@accession'); } } } return $transcript; } # findAccession() sub findAssemblyMappings($$) { # Return the dom node(s) of the for the desired assembly (disregarding GRC patch suffix) my ($assemblyPrefix, $annotationSets) = @_; my @mappings = (); # There are usually several annotation sets; the one with source/name "LRG" has # the reference assembly mapping that we're looking for; my $lrgMapping; foreach my $s (@$annotationSets) { my $name = $s->findvalue("source/name"); if ($name eq "LRG") { my @mappingNodes = $s->findnodes("mapping"); foreach my $m (@mappingNodes) { # Check the assembly name my $assembly = $m->findvalue('@coord_system'); push @mappings, $m if ($assembly =~ /^$assemblyPrefix/); } } } return @mappings; } # findAssemblyMappings sub utf8ToHtml { my ($word) = @_; $word = decode_utf8($word, Encode::FB_CROAK); my @chars = split(//, $word); $word = ""; foreach my $c (@chars) { if (ord($c) > 127) { $c = sprintf "&#%d;", ord($c); } $word .= $c; } return $word; } # utf8ToHtml sub parseOneLrg { my ($xmlIn, $bedF, $gpF, $cdnaF, $pepF) = @_; my $dom = XML::LibXML->load_xml( location => $xmlIn ); my @annotationSets = $dom->findnodes("/lrg/updatable_annotation/annotation_set"); my @refMappings = findAssemblyMappings($assemblyPrefix, \@annotationSets); if (@refMappings == 0) { die "LRG mapping for $assemblyPrefix not found in $xmlIn."; } my $lrgName = $dom->findvalue('/lrg/fixed_annotation/id'); my $lrgSeq = $dom->findvalue('/lrg/fixed_annotation/sequence'); my $lrgSize = length($lrgSeq); # HGNC id & symbol my $hgncId = $dom->findvalue('/lrg/fixed_annotation/hgnc_id'); my $hgncSymbol = $dom->findvalue('/lrg/updatable_annotation/annotation_set/lrg_locus[@source="HGNC"]'); # GenBank reference sequence accession my $lrgNcbiAcc = $dom->findvalue('/lrg/fixed_annotation/sequence_source'); # LRG sequence source(s); for now, keeping only the first listed source because hgc.c # doesn't yet anticipate lists. my @lrgSources = $dom->findnodes('/lrg/fixed_annotation/source'); my ($lrgSource, $lrgSourceUrl) = ('', ''); while ((defined $lrgSources[0]) && (! $lrgSources[0]->findvalue('name'))) { shift @lrgSources; } if (defined $lrgSources[0]) { $lrgSource = utf8ToHtml($lrgSources[0]->findvalue('name')); $lrgSourceUrl = $lrgSources[0]->findvalue('url'); } # watch out for stray tab chars: $lrgSource =~ s/^\s*(.*?)\s*$/$1/; $lrgSourceUrl =~ s/^\s*(.*?)\s*$/$1/; # watch out for URLs that are just hostnames (no protocol) if ($lrgSourceUrl !~ /^\w+:\/\//) { $lrgSourceUrl = "http://" . $lrgSourceUrl; } my $creationDate = $dom->findvalue('/lrg/fixed_annotation/creation_date'); foreach my $refMapping (@refMappings) { # Find BED 12+ fields. my $mapType = $refMapping->findvalue('@type'); my $seq = $refMapping->findvalue('@other_name'); - if ($seq eq 'unlocalized') { + if ($seq eq 'unlocalized' || $seq eq 'Unknown') { $seq = "Un"; } - if ($mapType eq 'haplotype' || $mapType eq 'fix_patch' || $mapType eq 'novel_patch' || $mapType eq 'patch') { + if ($mapType eq 'haplotype' || $mapType eq 'fix_patch' || $mapType eq 'novel_patch' || $mapType eq 'patch' || $seq eq "Un") { my $gbAcc = $refMapping->findvalue('@other_id_syn'); $gbAcc =~ m/^[A-Z]+\d+\.\d+$/ || die "$xmlIn: $assemblyPrefix has $mapType mapping with " . "other_id_syn='$gbAcc', expecting versioned GenBank acc (e.g. 'KI270850.1')."; if ($assemblyPrefix eq 'GRCh37') { $gbAcc =~ s/\..*//; $gbAcc = lc $gbAcc; } else { $gbAcc =~ s/\./v/; } # Trim chromosome band stuff if present $seq =~ s/[pq].*//; if ($assemblyPrefix eq 'GRCh37' && exists $gbAccToHg19Alt{$gbAcc}) { $seq = $gbAccToHg19Alt{$gbAcc}; } elsif ($seq eq 'Un') { $seq .= "_$gbAcc"; } else { # NOTE: as of 5/30/18, there are no mappings to hg19 or hg38 seqs with the suffix _random, # so I'm not sure what those would look like in the XML. This could cause us to lose # mappings to the _random sequences, *if* any are added in the future. my $suffix = (($mapType eq 'haplotype' || $mapType eq 'novel_patch') ? 'alt' : 'fix'); $seq .= "_${gbAcc}_$suffix"; } } $seq = 'chr' . $seq unless ($seq =~ /^chr/); my $start = $refMapping->findvalue('@other_start') - 1; my $end = $refMapping->findvalue('@other_end'); my @mappingSpans = $refMapping->findnodes('mapping_span'); - die 'Unusual number of mapping_spans' if (@mappingSpans != 1); - my $span = $mappingSpans[0]; - my $lrgStart = $span->findvalue('@lrg_start') - 1; - my $lrgEnd = $span->findvalue('@lrg_end'); + #We can certainly have multiple mapping_span elements, see LRG_1321 on GRCh37.p13 + #This happens when there is a large gap in the alignment + my $firstSpan = $mappingSpans[0]; + my $finalSpan = $mappingSpans[$#mappingSpans]; + my $lrgStart = $firstSpan->findvalue('@lrg_start') - 1; + my $lrgEnd = $finalSpan->findvalue('@lrg_end'); if ($lrgSize < $lrgEnd) { die "$xmlIn: length of sequence is $lrgSize but $assemblyPrefix lrg_end is $lrgEnd"; } my $name = $lrgName; if ($lrgStart != 0 || $lrgEnd != $lrgSize) { $name .= ":". ($lrgStart+1) . "-$lrgEnd"; } - my $strand = $span->findvalue('@strand'); + # ensure strandedness is the same: + my $strand = $firstSpan->findvalue('@strand'); + foreach my $span (@mappingSpans) { + my $st = $span->findvalue('@strand'); + die "$xmlIn: Mismatched strands across mapping_span elements" if ($st != $strand); + } $strand = ($strand == 1 ? '+' : '-'); - # Sort out indels and mismatches when building block coords: + # Sort out indels, mismatches, and large gaps when building block coords: my @blockSizes = (); my @blockStarts = (0); my @mismatches = (); my @indels = (); - my @diffs = $refMapping->findnodes('mapping_span/diff'); + @mappingSpans = sort { $a->findvalue('@other_start') <=> $b->findvalue('@other_start') } @mappingSpans; + foreach my $ix (0 .. $#mappingSpans) { + my $span = $mappingSpans[$ix]; + my @diffs = $span->findnodes('diff'); + if (scalar(@diffs) > 0) { # Sort @diffs by ascending assembly start coords, so the loop below has sorted inputs: @diffs = sort { $a->findvalue('@other_start') <=> $b->findvalue('@other_start') } @diffs; foreach my $d (@diffs) { my $lrgStart = $d->findvalue('@lrg_start') - 1; my $lrgEnd = $d->findvalue('@lrg_end'); my $blkStart = $d->findvalue('@other_start') - 1 - $start; my $blkEnd = $d->findvalue('@other_end') - $start; my $lrgSeq = $d->findvalue('@lrg_sequence'); my $refSeq = $d->findvalue('@other_sequence'); my $type = $d->findvalue('@type'); my $lastBlkStart = $blockStarts[$#blockStarts]; if ($type eq 'mismatch') { push @mismatches, join(':', $blkStart, $lrgStart, $refSeq, $lrgSeq); } elsif ($type eq 'lrg_ins') { # LRG has sequence where assembly has none; adjust assembly coords to start=end $blkStart++; $blkEnd--; die "weird lrg_ins other_ coords" if ($blkStart != $blkEnd); push @blockSizes, ($blkStart - $lastBlkStart); push @blockStarts, $blkStart; push @indels, join(':', $blkStart, $lrgStart, $refSeq, $lrgSeq); } elsif ($type eq 'other_ins') { # assembly has sequence where LRG has none; adjust LRG coords to be start=end $lrgStart++; $lrgEnd--; die "weird other_ins lrg_ coords" if ($lrgStart != $lrgEnd); push @blockSizes, ($blkStart - $lastBlkStart); push @blockStarts, $blkEnd; push @indels, join(':', $blkStart, $lrgStart, $refSeq, $lrgSeq); } else { die "Unrecognized diff type '$type' in $xmlIn"; } } + } elsif (scalar(@mappingSpans) > 1) { + # at this point we have multiple mapping spans with no , which is + # supposed to represent large gaps. Downstream we need this represented + # as an indel so fake it here. As per the above code, this kind of + # element can be thought of as both the LRG and assembly having sequence + # where the other doesn't + my $lrgStart = $span->findvalue('@lrg_start') - 1; + my $lrgEnd = $span->findvalue('@lrg_end'); + my $blkStart = $span->findvalue('@other_start') - 1 - $start; + my $otherStart = $span->findvalue('@other_start'); + my $blkEnd = $span->findvalue('@other_end') - $start; + my $lastBlkStart = $blockStarts[$#blockStarts]; + push @blockSizes, ($blkEnd - $blkStart); + push @blockStarts, $blkStart if $blkStart != $lastBlkStart; + if ($ix > 0) { + # add a fake indel element representing the double sided gap, but we want one less + # than the number of total blocks in the bed + my $prevSpan = $mappingSpans[$ix-1]; + my $prevLrgStart = $prevSpan->findvalue('@lrg_start')-1; + my $prevLrgEnd = $prevSpan->findvalue('@lrg_end'); + my $prevBlkStart = $blockStarts[$ix-1]; + my $prevBlkSize = $blockSizes[$ix - 1]; + my $assemblyGapStart = $prevBlkSize; + my $assemblyGapSize = ($start + $blkStart) - ($start + $assemblyGapStart); + my $lrgGapSize; + my $lrgGapStart; + if ($strand eq '-') { + $lrgGapSize = $prevLrgStart - $lrgEnd; + $lrgGapStart = $lrgEnd; + } else { + $lrgGapSize = $lrgStart - $prevLrgEnd; + $lrgGapStart = $prevLrgEnd; + } + push @indels, join(':', $prevBlkSize, $prevBlkSize, "?" x $assemblyGapSize, substr $lrgSeq, $lrgGapStart, $lrgGapSize); + } + } + } + if (scalar(@mappingSpans) == 1) { push @blockSizes, ($end - $start - $blockStarts[$#blockStarts]); + } my $blockCount = @blockStarts; - die unless ($blockCount == @blockSizes); + die "$xmlIn blockCount $blockCount != blockSizes @blockSizes" unless ($blockCount == @blockSizes); my $blockSizesStr = join(',', @blockSizes) . ','; my $blockStartsStr = join(',', @blockStarts) . ','; my $mismatchesStr = join(',', @mismatches); my $indelsStr = join(',', @indels); print $bedF join("\t", $seq, $start, $end, $name, 0, $strand, $start, $end, 0, $blockCount, $blockSizesStr, $blockStartsStr, $mismatchesStr, $indelsStr, $lrgSize, $hgncId, $hgncSymbol, $lrgNcbiAcc, $lrgSource, $lrgSourceUrl, $creationDate) . "\n"; } # each refMapping # Now convert fixed transcript annotation to genePredExt and fasta for mrna and protein. my @fixedTs = $dom->findnodes('/lrg/fixed_annotation/transcript'); foreach my $t (@fixedTs) { my $txStart = $t->findvalue('coordinates/@start') - 1; my $txEnd = $t->findvalue('coordinates/@end'); my $txStrand = ($t->findvalue('coordinates/@strand') < 0) ? '-' : '+'; my $tN = $t->findvalue('@name'); my $ncbiTranscript = findAccession(\@annotationSets, $tN, "ncbi", "features/gene/transcript"); my $ensemblTranscript = findAccession(\@annotationSets, $tN, "ensembl", "features/gene/transcript"); my $txName = $lrgName . $tN; # There may be multiple coding regions for different transcripts (or no coding regions # for non-coding gene loci). LRG_321.xml has an interesting assortment of coding regions, # exons and proteins. We don't really have a way to represent one transcript with multiple # coding regions, so if there's a p# that matches the t#, use that; otherwise just use the # first given. my ($cdsStart, $cdsEnd); my $pN = $tN; $pN =~ s/^t/p/ || die; my $ncbiProtein = findAccession(\@annotationSets, $pN, "ncbi", "features/gene/transcript/protein_product"); my $ensemblProtein = findAccession(\@annotationSets, $pN, "ensembl", "features/gene/transcript/protein_product"); my @codingRegions = $t->findnodes('coding_region[translation/@name="' . $pN . '"]'); if (@codingRegions == 0) { @codingRegions = $t->findnodes('coding_region'); } if (@codingRegions == 0) { # non-coding $cdsStart = $cdsEnd = $txStart; } else { $cdsStart = $codingRegions[0]->findvalue('coordinates/@start') - 1; $cdsEnd = $codingRegions[0]->findvalue('coordinates/@end'); } my (@exonStarts, @exonEnds, @exonFrames); # In LRG xml, phase is assigned to introns and applies to the following exon. # There are no intron nodes between exon nodes outside the coding region. # So we can't process exons and introns separately; we have to go through # all in order, and remember the phase when it's given in an intron node. my $phase = -1; my @trons = $t->findnodes('exon | intron'); foreach my $ei (@trons) { if ($ei->nodeName eq 'exon') { my @lrgCoords = $ei->findnodes('coordinates[@coord_system="' . $lrgName . '"]'); my $eStart = $lrgCoords[0]->findvalue('@start') - 1; my $eEnd = $lrgCoords[0]->findvalue('@end'); push @exonStarts, $eStart; push @exonEnds, $eEnd; if ($phase == -1 && $cdsStart != $cdsEnd && $eStart <= $cdsStart && $eEnd > $cdsStart) { # First coding exon's phase is 0. $phase = 0; } push @exonFrames, $phase; # In case this is the last coding exon (no subsequent intron node): $phase = -1; } elsif ($ei->nodeName eq 'intron') { # Now we know the phase of the following exon. $phase = $ei->findvalue('@phase'); } } my $cmpl = ($cdsStart != $cdsEnd) ? "cmpl" : "none"; print $gpF join("\t", $txName, $lrgName, $txStrand, $txStart, $txEnd, $cdsStart, $cdsEnd, scalar(@exonStarts), join(",", @exonStarts).",", join(",", @exonEnds).",", 0, $hgncSymbol, $cmpl, $cmpl, join(",", @exonFrames).",", $ncbiTranscript, $ensemblTranscript, $ncbiProtein, $ensemblProtein) . "\n"; print $cdnaF "$txName\t" . $t->findvalue('cdna/sequence') . "\n"; if (@codingRegions) { my @ps = $codingRegions[0]->findnodes('translation'); # I expect one for protein-coding genes, none for non-coding: print $pepF "$txName\t" . $ps[0]->findvalue('sequence') . "\n"; } } # each transcript } # parseOneLrg ################################ MAIN ################################# if (! $assemblyPrefix) { usage(1); } my @xmlFiles = <"$workdir/LRG_*.xml">; if (@xmlFiles == 0) { warn "No LRG_*.xml files found in $workdir -- need to cd somewhere?\n"; } open(my $bedF, "| sort -k1,1 -k2n,2n >lrg.bed") || die "Can't open lrg.bed for (sorting and) writing): $!\n"; open(my $gpF, "| sort -k2,2 -k4n,4n >lrgTranscriptsUnmapped.gp") || die "Can't open lrgTranscriptsUnmapped.gp for (sorting and) writing: $!\n"; open(my $cdnaF, ">lrgCdna.tab") || die "Can't open lrgCdna.tab for writing: $!\n"; open(my $pepF, ">lrgPep.tab") || die "Can't open lrgPep.tab for writing: $!\n"; foreach my $file (@xmlFiles) { parseOneLrg($file, $bedF, $gpF, $cdnaF, $pepF); } close($bedF); close($gpF); close($cdnaF); close($pepF);