src/hg/utils/automation/parseLrgXml.pl 41ac5854aee6292610b1519e69d107116c8b9e22

41ac5854aee6292610b1519e69d107116c8b9e22
hiram
  Wed Sep 16 08:27:15 2020 -0700
adding NCBI and Ensembl transcript and protein IDs to the genePred output refs #24672;

diff --git src/hg/utils/automation/parseLrgXml.pl src/hg/utils/automation/parseLrgXml.pl
index cff0370..d33dd84 100755
--- src/hg/utils/automation/parseLrgXml.pl
+++ src/hg/utils/automation/parseLrgXml.pl
@@ -25,39 +25,53 @@
 ";
   exit $status;
 } # usage
 
 my %gbAccToHg19Alt = ( gl000250 => 'chr6_apd_hap1',
                        gl000251 => 'chr6_cox_hap2',
                        gl000252 => 'chr6_dbb_hap3',
                        gl000253 => 'chr6_mann_hap4',
                        gl000254 => 'chr6_mcf_hap5',
                        gl000255 => 'chr6_qbl_hap6',
                        gl000256 => 'chr6_ssto_hap7',
                        gl000257 => 'chr4_ctg9_hap1',
                        gl000258 => 'chr17_ctg5_hap1',
                      );
 
-sub findAssemblyMappings {
+sub findAccession($$$$) {
+  my ($annotationSets, $fixedId, $transType, $nodeId) = @_;
+  my $transcript = "";
+  foreach my $s (@$annotationSets) {
+    my $type = $s->findvalue('@type');
+    if ($type eq $transType) {
+    my @transcripts = $s->findnodes($nodeId . '[@fixed_id="' . $fixedId . '"]');
+       foreach my $p (@transcripts) {
+	  $transcript = $p->findvalue('@accession');
+       }
+    }
+  }
+  return $transcript;
+} # findAccession()
+
+sub findAssemblyMappings($$) {
   # Return the dom node(s) of the <mapping> for the desired assembly (disregarding GRC patch suffix)
-  my ($dom, $assemblyPrefix) = @_;
+  my ($assemblyPrefix, $annotationSets) = @_;
   my @mappings = ();
-  my @annotationSets = $dom->findnodes("/lrg/updatable_annotation/annotation_set");
 # There are usually several annotation sets; the one with source/name "LRG" has
   # the reference assembly mapping that we're looking for;
   my $lrgMapping;
-  foreach my $s (@annotationSets) {
+  foreach my $s (@$annotationSets) {
     my $name = $s->findvalue("source/name");
     if ($name eq "LRG") {
       my @mappingNodes = $s->findnodes("mapping");
       foreach my $m (@mappingNodes) {
 	# Check the assembly name
 	my $assembly = $m->findvalue('@coord_system');
 	push @mappings, $m if ($assembly =~ /^$assemblyPrefix/);
       }
     }
   }
   return @mappings;
 } # findAssemblyMappings
 
 sub utf8ToHtml {
   my ($word) = @_;
@@ -66,31 +80,32 @@
   $word = "";
   foreach my $c (@chars) {
     if (ord($c) > 127) {
       $c = sprintf "&#%d;", ord($c);
     }
     $word .= $c;
   }
   return $word;
 } # utf8ToHtml
 
 
 sub parseOneLrg {
   my ($xmlIn, $bedF, $gpF, $cdnaF, $pepF) = @_;
   my $dom = XML::LibXML->load_xml( location => $xmlIn );
 
-  my @refMappings = findAssemblyMappings($dom, $assemblyPrefix);
+  my @annotationSets = $dom->findnodes("/lrg/updatable_annotation/annotation_set");
+  my @refMappings = findAssemblyMappings($assemblyPrefix, \@annotationSets);
   if (@refMappings == 0) {
     die "LRG mapping for $assemblyPrefix not found in $xmlIn.";
   }
 
   my $lrgName = $dom->findvalue('/lrg/fixed_annotation/id');
   my $lrgSeq = $dom->findvalue('/lrg/fixed_annotation/sequence');
   my $lrgSize = length($lrgSeq);
   # HGNC id & symbol
   my $hgncId = $dom->findvalue('/lrg/fixed_annotation/hgnc_id');
   my $hgncSymbol = $dom->findvalue('/lrg/updatable_annotation/annotation_set/lrg_locus[@source="HGNC"]');
   # GenBank reference sequence accession
   my $lrgNcbiAcc = $dom->findvalue('/lrg/fixed_annotation/sequence_source');
   # LRG sequence source(s); for now, keeping only the first listed source because hgc.c
   # doesn't yet anticipate lists.
   my @lrgSources = $dom->findnodes('/lrg/fixed_annotation/source');
@@ -199,44 +214,49 @@
     my $blockCount = @blockStarts;
     die unless ($blockCount == @blockSizes);
     my $blockSizesStr = join(',', @blockSizes) . ',';
     my $blockStartsStr = join(',', @blockStarts) . ',';
     my $mismatchesStr = join(',', @mismatches);
     my $indelsStr = join(',', @indels);
 
     print $bedF join("\t", $seq, $start, $end, $name, 0, $strand, $start, $end, 0,
                      $blockCount, $blockSizesStr, $blockStartsStr,
                      $mismatchesStr, $indelsStr, $lrgSize,
                      $hgncId, $hgncSymbol, $lrgNcbiAcc,
                      $lrgSource, $lrgSourceUrl, $creationDate) . "\n";
   } # each refMapping
 
   # Now convert fixed transcript annotation to genePredExt and fasta for mrna and protein.
+
   my @fixedTs = $dom->findnodes('/lrg/fixed_annotation/transcript');
   foreach my $t (@fixedTs) {
     my $txStart = $t->findvalue('coordinates/@start') - 1;
     my $txEnd = $t->findvalue('coordinates/@end');
     my $txStrand = ($t->findvalue('coordinates/@strand') < 0) ? '-' : '+';
     my $tN = $t->findvalue('@name');
+    my $ncbiTranscript = findAccession(\@annotationSets, $tN, "ncbi", "features/gene/transcript");
+    my $ensemblTranscript = findAccession(\@annotationSets, $tN, "ensembl", "features/gene/transcript");
     my $txName = $lrgName . $tN;
     # There may be multiple coding regions for different transcripts (or no coding regions
     # for non-coding gene loci).  LRG_321.xml has an interesting assortment of coding regions,
     # exons and proteins.  We don't really have a way to represent one transcript with multiple
     # coding regions, so if there's a p# that matches the t#, use that; otherwise just use the
     # first given.
     my ($cdsStart, $cdsEnd);
     my $pN = $tN;  $pN =~ s/^t/p/ || die;
+    my $ncbiProtein = findAccession(\@annotationSets, $pN, "ncbi", "features/gene/transcript/protein_product");
+    my $ensemblProtein = findAccession(\@annotationSets, $pN, "ensembl", "features/gene/transcript/protein_product");
     my @codingRegions = $t->findnodes('coding_region[translation/@name="' . $pN . '"]');
     if (@codingRegions == 0) {
       @codingRegions = $t->findnodes('coding_region');
     }
     if (@codingRegions == 0) {
       # non-coding
       $cdsStart = $cdsEnd = $txStart;
     } else {
       $cdsStart = $codingRegions[0]->findvalue('coordinates/@start') - 1;
       $cdsEnd = $codingRegions[0]->findvalue('coordinates/@end');
     }
     my (@exonStarts, @exonEnds, @exonFrames);
     # In LRG xml, phase is assigned to introns and applies to the following exon.
     # There are no intron nodes between exon nodes outside the coding region.
     # So we can't process exons and introns separately; we have to go through
@@ -255,31 +275,32 @@
 	    $eStart <= $cdsStart && $eEnd > $cdsStart) {
 	  # First coding exon's phase is 0.
 	  $phase = 0;
 	}
 	push @exonFrames, $phase;
 	# In case this is the last coding exon (no subsequent intron node):
 	$phase = -1;
       } elsif ($ei->nodeName eq 'intron') {
 	# Now we know the phase of the following exon.
 	$phase = $ei->findvalue('@phase');
       }
     }
     my $cmpl = ($cdsStart != $cdsEnd) ? "cmpl" : "none";
     print $gpF join("\t", $txName, $lrgName, $txStrand, $txStart, $txEnd, $cdsStart, $cdsEnd,
 	scalar(@exonStarts), join(",", @exonStarts).",", join(",", @exonEnds).",",
-		    0, $hgncSymbol, $cmpl, $cmpl, join(",", @exonFrames).",") . "\n";
+	0, $hgncSymbol, $cmpl, $cmpl, join(",", @exonFrames).",",
+    $ncbiTranscript, $ensemblTranscript, $ncbiProtein, $ensemblProtein) . "\n";
 
     print $cdnaF "$txName\t" . $t->findvalue('cdna/sequence') . "\n";
 
     if (@codingRegions) {
       my @ps = $codingRegions[0]->findnodes('translation');
       # I expect one for protein-coding genes, none for non-coding:
       print $pepF "$txName\t" . $ps[0]->findvalue('sequence') . "\n";
     }
   } # each transcript
 } # parseOneLrg
 
 ################################ MAIN #################################
 
 if (! $assemblyPrefix) {
   usage(1);