5aa96722c0548d51c76a430c576d1e7823c30ef7
chmalee
  Wed Aug 25 11:32:44 2021 -0700
Get double sided gaps supported in building of LRG track, refs #24672

diff --git src/hg/utils/automation/parseLrgXml.pl src/hg/utils/automation/parseLrgXml.pl
index 7052075..e250595 100755
--- src/hg/utils/automation/parseLrgXml.pl
+++ src/hg/utils/automation/parseLrgXml.pl
@@ -123,113 +123,163 @@
     $lrgSourceUrl = $lrgSources[0]->findvalue('url');
   }
   # watch out for stray tab chars:
   $lrgSource =~ s/^\s*(.*?)\s*$/$1/;
   $lrgSourceUrl =~ s/^\s*(.*?)\s*$/$1/;
   # watch out for URLs that are just hostnames (no protocol)
   if ($lrgSourceUrl !~ /^\w+:\/\//) {
     $lrgSourceUrl = "http://" . $lrgSourceUrl;
   }
   my $creationDate = $dom->findvalue('/lrg/fixed_annotation/creation_date');
 
   foreach my $refMapping (@refMappings) {
     # Find BED 12+ fields.
     my $mapType = $refMapping->findvalue('@type');
     my $seq = $refMapping->findvalue('@other_name');
-    if ($seq eq 'unlocalized') {
+    if ($seq eq 'unlocalized' || $seq eq 'Unknown') {
       $seq = "Un";
     }
-    if ($mapType eq 'haplotype' || $mapType eq 'fix_patch' || $mapType eq 'novel_patch' || $mapType eq 'patch') {
+    if ($mapType eq 'haplotype' || $mapType eq 'fix_patch' || $mapType eq 'novel_patch' || $mapType eq 'patch' || $seq eq "Un") {
       my $gbAcc = $refMapping->findvalue('@other_id_syn');
       $gbAcc =~ m/^[A-Z]+\d+\.\d+$/ || die "$xmlIn: $assemblyPrefix has $mapType mapping with " .
         "other_id_syn='$gbAcc', expecting versioned GenBank acc (e.g. 'KI270850.1').";
       if ($assemblyPrefix eq 'GRCh37') {
         $gbAcc =~ s/\..*//;
         $gbAcc = lc $gbAcc;
       } else {
         $gbAcc =~ s/\./v/;
       }
       # Trim chromosome band stuff if present
       $seq =~ s/[pq].*//;
       if ($assemblyPrefix eq 'GRCh37' && exists $gbAccToHg19Alt{$gbAcc}) {
         $seq = $gbAccToHg19Alt{$gbAcc};
       } elsif ($seq eq 'Un') {
         $seq .= "_$gbAcc";
       } else {
         # NOTE: as of 5/30/18, there are no mappings to hg19 or hg38 seqs with the suffix _random,
         # so I'm not sure what those would look like in the XML.  This could cause us to lose
         # mappings to the _random sequences, *if* any are added in the future.
         my $suffix = (($mapType eq 'haplotype' || $mapType eq 'novel_patch') ? 'alt' : 'fix');
         $seq .= "_${gbAcc}_$suffix";
       }
     }
     $seq = 'chr' . $seq unless ($seq =~ /^chr/);
     my $start = $refMapping->findvalue('@other_start') - 1;
     my $end = $refMapping->findvalue('@other_end');
     my @mappingSpans = $refMapping->findnodes('mapping_span');
-    die 'Unusual number of mapping_spans' if (@mappingSpans != 1);
-    my $span = $mappingSpans[0];
-    my $lrgStart = $span->findvalue('@lrg_start') - 1;
-    my $lrgEnd = $span->findvalue('@lrg_end');
+    #We can certainly have multiple mapping_span elements, see LRG_1321 on GRCh37.p13
+    #This happens when there is a large gap in the alignment
+    my $firstSpan = $mappingSpans[0];
+    my $finalSpan = $mappingSpans[$#mappingSpans];
+    my $lrgStart = $firstSpan->findvalue('@lrg_start') - 1;
+    my $lrgEnd = $finalSpan->findvalue('@lrg_end');
     if ($lrgSize < $lrgEnd) {
       die "$xmlIn: length of sequence is $lrgSize but $assemblyPrefix lrg_end is $lrgEnd";
     }
     my $name = $lrgName;
     if ($lrgStart != 0 || $lrgEnd != $lrgSize) {
       $name .= ":". ($lrgStart+1) . "-$lrgEnd";
     }
-    my $strand = $span->findvalue('@strand');
+    # ensure strandedness is the same:
+    my $strand = $firstSpan->findvalue('@strand');
+    foreach my $span (@mappingSpans) {
+        my $st = $span->findvalue('@strand');
+        die "$xmlIn: Mismatched strands across mapping_span elements" if ($st != $strand);
+    }
     $strand = ($strand == 1 ? '+' : '-');
 
-    # Sort out indels and mismatches when building block coords:
+    # Sort out indels, mismatches, and large gaps  when building block coords:
     my @blockSizes = ();
     my @blockStarts = (0);
     my @mismatches = ();
     my @indels = ();
-    my @diffs = $refMapping->findnodes('mapping_span/diff');
+    @mappingSpans = sort { $a->findvalue('@other_start') <=> $b->findvalue('@other_start') } @mappingSpans;
+    foreach my $ix (0 .. $#mappingSpans) {
+      my $span = $mappingSpans[$ix];
+      my @diffs = $span->findnodes('diff');
+      if (scalar(@diffs) > 0) {
         # Sort @diffs by ascending assembly start coords, so the loop below has sorted inputs:
         @diffs = sort { $a->findvalue('@other_start') <=> $b->findvalue('@other_start') } @diffs;
         foreach my $d (@diffs) {
           my $lrgStart = $d->findvalue('@lrg_start') - 1;
           my $lrgEnd = $d->findvalue('@lrg_end');
           my $blkStart = $d->findvalue('@other_start') - 1 - $start;
           my $blkEnd = $d->findvalue('@other_end') - $start;
           my $lrgSeq = $d->findvalue('@lrg_sequence');
           my $refSeq = $d->findvalue('@other_sequence');
           my $type = $d->findvalue('@type');
           my $lastBlkStart = $blockStarts[$#blockStarts];
           if ($type eq 'mismatch') {
             push @mismatches, join(':', $blkStart, $lrgStart, $refSeq, $lrgSeq);
           } elsif ($type eq 'lrg_ins') {
             # LRG has sequence where assembly has none; adjust assembly coords to start=end
             $blkStart++;  $blkEnd--;  die "weird lrg_ins other_ coords" if ($blkStart != $blkEnd);
             push @blockSizes, ($blkStart - $lastBlkStart);
             push @blockStarts, $blkStart;
             push @indels, join(':', $blkStart, $lrgStart, $refSeq, $lrgSeq);
           } elsif ($type eq 'other_ins') {
             # assembly has sequence where LRG has none; adjust LRG coords to be start=end
             $lrgStart++;  $lrgEnd--;  die "weird other_ins lrg_ coords" if ($lrgStart != $lrgEnd);
             push @blockSizes, ($blkStart - $lastBlkStart);
             push @blockStarts, $blkEnd;
             push @indels, join(':', $blkStart, $lrgStart, $refSeq, $lrgSeq);
           } else {
             die "Unrecognized diff type '$type' in $xmlIn";
           }
         }
+      } elsif (scalar(@mappingSpans) > 1) {
+        # at this point we have multiple mapping spans with no <diff>, which is
+        # supposed to represent large gaps. Downstream we need this represented
+        # as an indel so fake it here. As per the above code, this kind of
+        # element can be thought of as both the LRG and assembly having sequence
+        # where the other doesn't
+        my $lrgStart = $span->findvalue('@lrg_start') - 1;
+        my $lrgEnd = $span->findvalue('@lrg_end');
+        my $blkStart = $span->findvalue('@other_start') - 1 - $start;
+        my $otherStart = $span->findvalue('@other_start');
+        my $blkEnd = $span->findvalue('@other_end') - $start;
+        my $lastBlkStart = $blockStarts[$#blockStarts];
+        push @blockSizes, ($blkEnd - $blkStart);
+        push @blockStarts, $blkStart if $blkStart != $lastBlkStart;
+        if ($ix > 0) {
+          # add a fake indel element representing the double sided gap, but we want one less
+          # than the number of total blocks in the bed
+          my $prevSpan = $mappingSpans[$ix-1];
+          my $prevLrgStart = $prevSpan->findvalue('@lrg_start')-1;
+          my $prevLrgEnd = $prevSpan->findvalue('@lrg_end');
+          my $prevBlkStart = $blockStarts[$ix-1];
+          my $prevBlkSize = $blockSizes[$ix - 1];
+          my $assemblyGapStart = $prevBlkSize;
+          my $assemblyGapSize = ($start + $blkStart) - ($start + $assemblyGapStart);
+          my $lrgGapSize;
+          my $lrgGapStart;
+          if ($strand eq '-') {
+            $lrgGapSize = $prevLrgStart - $lrgEnd;
+            $lrgGapStart = $lrgEnd;
+          } else {
+            $lrgGapSize = $lrgStart - $prevLrgEnd;
+            $lrgGapStart = $prevLrgEnd;
+          }
+          push @indels, join(':', $prevBlkSize, $prevBlkSize, "?" x $assemblyGapSize, substr $lrgSeq, $lrgGapStart, $lrgGapSize);
+        }
+      }
+    }
+    if (scalar(@mappingSpans) == 1) {
       push @blockSizes, ($end - $start - $blockStarts[$#blockStarts]);
+    }
     my $blockCount = @blockStarts;
-    die unless ($blockCount == @blockSizes);
+    die "$xmlIn blockCount $blockCount != blockSizes @blockSizes" unless ($blockCount == @blockSizes);
     my $blockSizesStr = join(',', @blockSizes) . ',';
     my $blockStartsStr = join(',', @blockStarts) . ',';
     my $mismatchesStr = join(',', @mismatches);
     my $indelsStr = join(',', @indels);
 
     print $bedF join("\t", $seq, $start, $end, $name, 0, $strand, $start, $end, 0,
                      $blockCount, $blockSizesStr, $blockStartsStr,
                      $mismatchesStr, $indelsStr, $lrgSize,
                      $hgncId, $hgncSymbol, $lrgNcbiAcc,
                      $lrgSource, $lrgSourceUrl, $creationDate) . "\n";
   } # each refMapping
 
   # Now convert fixed transcript annotation to genePredExt and fasta for mrna and protein.
 
   my @fixedTs = $dom->findnodes('/lrg/fixed_annotation/transcript');