fbd7a94213923088270a2bc3da63d0b8139451f0 angie Thu Oct 10 14:33:52 2019 -0700 LRG now distinguishes between fix_patch and novel_patch in their XML, so we can include mappings to fix & alt patch sequences. Overdue for an update anyway. refs #24285 diff --git src/hg/utils/automation/parseLrgXml.pl src/hg/utils/automation/parseLrgXml.pl index cce61b9..069df2c 100755 --- src/hg/utils/automation/parseLrgXml.pl +++ src/hg/utils/automation/parseLrgXml.pl @@ -102,51 +102,51 @@ $lrgSource = utf8ToHtml($lrgSources[0]->findvalue('name')); $lrgSourceUrl = $lrgSources[0]->findvalue('url'); } # watch out for stray tab chars: $lrgSource =~ s/^\s*(.*?)\s*$/$1/; $lrgSourceUrl =~ s/^\s*(.*?)\s*$/$1/; my $creationDate = $dom->findvalue('/lrg/fixed_annotation/creation_date'); foreach my $refMapping (@refMappings) { # Find BED 12+ fields. my $mapType = $refMapping->findvalue('@type'); my $seq = $refMapping->findvalue('@other_name'); if ($seq eq 'unlocalized') { $seq = "Un"; } - if ($mapType eq 'haplotype' || $mapType eq 'patch') { + if ($mapType eq 'haplotype' || $mapType eq 'fix_patch' || $mapType eq 'novel_patch') { my $gbAcc = $refMapping->findvalue('@other_id_syn'); $gbAcc =~ m/^[A-Z]+\d+\.\d+$/ || die "$xmlIn: $assemblyPrefix has $mapType mapping with " . "other_id_syn='$gbAcc', expecting versioned GenBank acc (e.g. 'KI270850.1')."; if ($assemblyPrefix eq 'GRCh37') { $gbAcc =~ s/\..*//; $gbAcc = lc $gbAcc; } else { $gbAcc =~ s/\./v/; } # Trim chromosome band stuff if present $seq =~ s/[pq].*//; if ($assemblyPrefix eq 'GRCh37' && exists $gbAccToHg19Alt{$gbAcc}) { $seq = $gbAccToHg19Alt{$gbAcc}; } elsif ($seq eq 'Un') { $seq .= "_$gbAcc"; } else { # NOTE: as of 5/30/18, there are no mappings to hg19 or hg38 seqs with the suffix _random, # so I'm not sure what those would look like in the XML. This could cause us to lose # mappings to the _random sequences, *if* any are added in the future. - my $suffix = ($mapType eq 'haplotype' ? 'alt' : 'fix'); + my $suffix = (($mapType eq 'haplotype' || $mapType eq 'novel_patch') ? 'alt' : 'fix'); $seq .= "_${gbAcc}_$suffix"; } } $seq = 'chr' . $seq unless ($seq =~ /^chr/); my $start = $refMapping->findvalue('@other_start') - 1; my $end = $refMapping->findvalue('@other_end'); my @mappingSpans = $refMapping->findnodes('mapping_span'); die 'Unusual number of mapping_spans' if (@mappingSpans != 1); my $span = $mappingSpans[0]; my $lrgStart = $span->findvalue('@lrg_start') - 1; my $lrgEnd = $span->findvalue('@lrg_end'); if ($lrgSize < $lrgEnd) { die "$xmlIn: length of sequence is $lrgSize but $assemblyPrefix lrg_end is $lrgEnd"; } my $name = $lrgName;