41ac5854aee6292610b1519e69d107116c8b9e22 hiram Wed Sep 16 08:27:15 2020 -0700 adding NCBI and Ensembl transcript and protein IDs to the genePred output refs #24672; diff --git src/hg/utils/automation/parseLrgXml.pl src/hg/utils/automation/parseLrgXml.pl index cff0370..d33dd84 100755 --- src/hg/utils/automation/parseLrgXml.pl +++ src/hg/utils/automation/parseLrgXml.pl @@ -25,39 +25,53 @@ "; exit $status; } # usage my %gbAccToHg19Alt = ( gl000250 => 'chr6_apd_hap1', gl000251 => 'chr6_cox_hap2', gl000252 => 'chr6_dbb_hap3', gl000253 => 'chr6_mann_hap4', gl000254 => 'chr6_mcf_hap5', gl000255 => 'chr6_qbl_hap6', gl000256 => 'chr6_ssto_hap7', gl000257 => 'chr4_ctg9_hap1', gl000258 => 'chr17_ctg5_hap1', ); -sub findAssemblyMappings { +sub findAccession($$$$) { + my ($annotationSets, $fixedId, $transType, $nodeId) = @_; + my $transcript = ""; + foreach my $s (@$annotationSets) { + my $type = $s->findvalue('@type'); + if ($type eq $transType) { + my @transcripts = $s->findnodes($nodeId . '[@fixed_id="' . $fixedId . '"]'); + foreach my $p (@transcripts) { + $transcript = $p->findvalue('@accession'); + } + } + } + return $transcript; +} # findAccession() + +sub findAssemblyMappings($$) { # Return the dom node(s) of the for the desired assembly (disregarding GRC patch suffix) - my ($dom, $assemblyPrefix) = @_; + my ($assemblyPrefix, $annotationSets) = @_; my @mappings = (); - my @annotationSets = $dom->findnodes("/lrg/updatable_annotation/annotation_set"); # There are usually several annotation sets; the one with source/name "LRG" has # the reference assembly mapping that we're looking for; my $lrgMapping; - foreach my $s (@annotationSets) { + foreach my $s (@$annotationSets) { my $name = $s->findvalue("source/name"); if ($name eq "LRG") { my @mappingNodes = $s->findnodes("mapping"); foreach my $m (@mappingNodes) { # Check the assembly name my $assembly = $m->findvalue('@coord_system'); push @mappings, $m if ($assembly =~ /^$assemblyPrefix/); } } } return @mappings; } # findAssemblyMappings sub utf8ToHtml { my ($word) = @_; @@ -66,31 +80,32 @@ $word = ""; foreach my $c (@chars) { if (ord($c) > 127) { $c = sprintf "&#%d;", ord($c); } $word .= $c; } return $word; } # utf8ToHtml sub parseOneLrg { my ($xmlIn, $bedF, $gpF, $cdnaF, $pepF) = @_; my $dom = XML::LibXML->load_xml( location => $xmlIn ); - my @refMappings = findAssemblyMappings($dom, $assemblyPrefix); + my @annotationSets = $dom->findnodes("/lrg/updatable_annotation/annotation_set"); + my @refMappings = findAssemblyMappings($assemblyPrefix, \@annotationSets); if (@refMappings == 0) { die "LRG mapping for $assemblyPrefix not found in $xmlIn."; } my $lrgName = $dom->findvalue('/lrg/fixed_annotation/id'); my $lrgSeq = $dom->findvalue('/lrg/fixed_annotation/sequence'); my $lrgSize = length($lrgSeq); # HGNC id & symbol my $hgncId = $dom->findvalue('/lrg/fixed_annotation/hgnc_id'); my $hgncSymbol = $dom->findvalue('/lrg/updatable_annotation/annotation_set/lrg_locus[@source="HGNC"]'); # GenBank reference sequence accession my $lrgNcbiAcc = $dom->findvalue('/lrg/fixed_annotation/sequence_source'); # LRG sequence source(s); for now, keeping only the first listed source because hgc.c # doesn't yet anticipate lists. my @lrgSources = $dom->findnodes('/lrg/fixed_annotation/source'); @@ -199,44 +214,49 @@ my $blockCount = @blockStarts; die unless ($blockCount == @blockSizes); my $blockSizesStr = join(',', @blockSizes) . ','; my $blockStartsStr = join(',', @blockStarts) . ','; my $mismatchesStr = join(',', @mismatches); my $indelsStr = join(',', @indels); print $bedF join("\t", $seq, $start, $end, $name, 0, $strand, $start, $end, 0, $blockCount, $blockSizesStr, $blockStartsStr, $mismatchesStr, $indelsStr, $lrgSize, $hgncId, $hgncSymbol, $lrgNcbiAcc, $lrgSource, $lrgSourceUrl, $creationDate) . "\n"; } # each refMapping # Now convert fixed transcript annotation to genePredExt and fasta for mrna and protein. + my @fixedTs = $dom->findnodes('/lrg/fixed_annotation/transcript'); foreach my $t (@fixedTs) { my $txStart = $t->findvalue('coordinates/@start') - 1; my $txEnd = $t->findvalue('coordinates/@end'); my $txStrand = ($t->findvalue('coordinates/@strand') < 0) ? '-' : '+'; my $tN = $t->findvalue('@name'); + my $ncbiTranscript = findAccession(\@annotationSets, $tN, "ncbi", "features/gene/transcript"); + my $ensemblTranscript = findAccession(\@annotationSets, $tN, "ensembl", "features/gene/transcript"); my $txName = $lrgName . $tN; # There may be multiple coding regions for different transcripts (or no coding regions # for non-coding gene loci). LRG_321.xml has an interesting assortment of coding regions, # exons and proteins. We don't really have a way to represent one transcript with multiple # coding regions, so if there's a p# that matches the t#, use that; otherwise just use the # first given. my ($cdsStart, $cdsEnd); my $pN = $tN; $pN =~ s/^t/p/ || die; + my $ncbiProtein = findAccession(\@annotationSets, $pN, "ncbi", "features/gene/transcript/protein_product"); + my $ensemblProtein = findAccession(\@annotationSets, $pN, "ensembl", "features/gene/transcript/protein_product"); my @codingRegions = $t->findnodes('coding_region[translation/@name="' . $pN . '"]'); if (@codingRegions == 0) { @codingRegions = $t->findnodes('coding_region'); } if (@codingRegions == 0) { # non-coding $cdsStart = $cdsEnd = $txStart; } else { $cdsStart = $codingRegions[0]->findvalue('coordinates/@start') - 1; $cdsEnd = $codingRegions[0]->findvalue('coordinates/@end'); } my (@exonStarts, @exonEnds, @exonFrames); # In LRG xml, phase is assigned to introns and applies to the following exon. # There are no intron nodes between exon nodes outside the coding region. # So we can't process exons and introns separately; we have to go through @@ -255,31 +275,32 @@ $eStart <= $cdsStart && $eEnd > $cdsStart) { # First coding exon's phase is 0. $phase = 0; } push @exonFrames, $phase; # In case this is the last coding exon (no subsequent intron node): $phase = -1; } elsif ($ei->nodeName eq 'intron') { # Now we know the phase of the following exon. $phase = $ei->findvalue('@phase'); } } my $cmpl = ($cdsStart != $cdsEnd) ? "cmpl" : "none"; print $gpF join("\t", $txName, $lrgName, $txStrand, $txStart, $txEnd, $cdsStart, $cdsEnd, scalar(@exonStarts), join(",", @exonStarts).",", join(",", @exonEnds).",", - 0, $hgncSymbol, $cmpl, $cmpl, join(",", @exonFrames).",") . "\n"; + 0, $hgncSymbol, $cmpl, $cmpl, join(",", @exonFrames).",", + $ncbiTranscript, $ensemblTranscript, $ncbiProtein, $ensemblProtein) . "\n"; print $cdnaF "$txName\t" . $t->findvalue('cdna/sequence') . "\n"; if (@codingRegions) { my @ps = $codingRegions[0]->findnodes('translation'); # I expect one for protein-coding genes, none for non-coding: print $pepF "$txName\t" . $ps[0]->findvalue('sequence') . "\n"; } } # each transcript } # parseOneLrg ################################ MAIN ################################# if (! $assemblyPrefix) { usage(1);