src/hg/utils/automation/gff3ToRefLink.pl 62ccb95d5c71a3bf7658327e6db2bc35c19d1d71

62ccb95d5c71a3bf7658327e6db2bc35c19d1d71
chmalee
  Wed Oct 11 09:57:36 2023 -0700
Adding back hgvs search support for historical refseq transcripts and
historical refSeq transcripts track. Should be more resilient code this
time.

diff --git src/hg/utils/automation/gff3ToRefLink.pl src/hg/utils/automation/gff3ToRefLink.pl
index 2983d06..9e703df 100755
--- src/hg/utils/automation/gff3ToRefLink.pl
+++ src/hg/utils/automation/gff3ToRefLink.pl
@@ -1,25 +1,26 @@
 #!/usr/bin/env perl
 
 use strict;
 use warnings;
 
 sub usage {
-  printf STDERR "usage: gff3ToRefLink.pl [raFile.txt] [ncbi.gff.gz] [pragmaLabels.txt] > refLink.tab\n";
+  printf STDERR "usage: gff3ToRefLink.pl [raFile.txt] [ncbi.gff.gz] [pragmaLabels.txt] [continueOnParentErrors] > refLink.tab\n";
   printf STDERR "- raFile.txt is the output of gbProcess \${asmId}_rna.gbff.gz\n";
   printf STDERR "- ncbi.gff.gz is \${asmId}_genomic.gff.gz\n";
   printf STDERR "- pragmaLabels.txt has words that appear before '::' within ##*-START/##*-END\n";
+  printf STDERR "- continueOnParentErrors is a flag to tell the script to continue when a child is defined before it's parent\n";
   printf STDERR "  text in \${asmId}_rna.gbff.gz, one label per line.\n";
   printf STDERR "output to stdout is the tab separated ncbiRefSeqLink data for each item\n";
   printf STDERR "This is intended to be called by doNcbiRefSeq.pl.\n";
   exit 255;
 } # usage
 
 # this order list of items will keep the tab output consistent so all
 #  'columns' of the data in the tab output will be the same order always.
 # there are two columns before these, the id (transcript_id) and the gene reviewed 'status',
 # and two columns after these items, the 'description' string from gbProcess
 # and the externalId for external URL outlinks
 my @tabOrderOutput = ( 'gene', 'product', 'transcript_id', 'protein_id', 'geneid',
    'mim', 'hgnc', 'genbank', 'pseudo', 'gbkey', 'source',
    'gene_biotype', 'gene_synonym', 'ncrna_class', 'note' );
 
@@ -54,75 +55,83 @@
 
 sub mustOpen ($) {
   # Open possibly gzipped file or "stdin" for reading, return filehandle or die with error.
   my ($fileName) = @_;
   my $fh;
   if ($fileName =~ m/.gz$/) {
     open ($fh, "zcat $fileName |") or die "can not read $fileName: $!";
   } elsif ($fileName =~ m/^stdin$/) {
     open ($fh, "</dev/stdin") or die "can not read $fileName: $!";
   } else {
     open ($fh, "<$fileName") or die "can not read $fileName: $!";
   }
   return $fh;
 } # mustOpen
 
-sub parseGff3($) {
+sub parseGff3($;$) {
 # Parse GFF3 into a tree structure using parent pointers.  Store only tags of interest.
 # Children store only tags that are not present or differ from the parent's tags.
-  my ($gffFile) = @_;
+  my ($gffFile, $continueOnParentErrors) = @_;
   my %gff;
   my @topLevelIds;
   my $fh = mustOpen($gffFile);
   while (my $line = <$fh>) {
     chomp $line;
     next if ($line =~ m/^#/);
     my @a = split('\t', $line);
     my ($source, $type, $attributes) = ($a[1], $a[2], $a[8]);
     next if ($type eq 'region' || $type eq 'cDNA_match' || $type eq 'match' ||
              $type eq 'sequence_feature');
     my %tags;
     foreach my $tagEqVal (split(';', $attributes)) {
       if ($tagEqVal =~ m/^(\w+)=(.*)/) {
         my ($tag, $val) = (lc($1), $2);
         if ($tagsOfInterest{$tag}) {
           $tags{$tag} = $val;
         }
+      } elsif ($tagEqVal eq "") {
+        # NCBI historical refseq transcripts can have multiple ';;' in a row
+        # for some reason, so skip those empty tags
+        next;
       } else {
         die "parse error: expecting tag=val, got '$tagEqVal'";
       }
     }
     die "ID not found" if (! exists($tags{id}));
     my $id = $tags{id};
     if (exists($gff{$id})) {
       # So weird that NCBI's GFF has unique IDs for exons but not CDS.
       # And non-uniq IDs for a few sequence_feature, even primary_transcript mir-8199 in ce11...
       warn "Encountered a second record with ID=$id" unless ($id =~ /^cds/);
       next;
     }
     $gff{$id} = \%tags;
     # Add source and type columns as pseudo-tags -- make sure there aren't actual tags w/those names.
     die "Tag name conflict for 'type'." if (exists($tags{type}));
     $tags{type} = $type;
     die "Tag name conflict for 'source'." if (exists($tags{source}));
     $tags{source} = $source;
     # If this has a parent, hook up parent to this child; otherwise, this is a top-level element.
     if (exists($tags{parent})) {
       my $parentId = $tags{parent};
       # We're counting on NCBI's GFF to define parents before children.
       if (! exists($gff{$parentId})) {
+        if (not defined  $continueOnParentErrors) {
           die "ID $id: Parent ID $parentId has not already been defined in GFF.";
+        } else {
+          push @topLevelIds, $id;
+        }
       }
       push @{$gff{$parentId}->{children}}, $id;
     } else {
       # No parent -- this is top level.
       push @topLevelIds, $id;
     }
   }
   close($fh);
   print STDERR "Processed " . scalar(keys %gff) . " elements.\n";
   print STDERR "Found " . scalar(@topLevelIds) . " top level IDs.\n";
   # print STDERR join(", ", @topLevelIds) . "\n";
   return \%gff, \@topLevelIds;
 } # parseGff3
 
 sub parseDbXref($) {
@@ -343,31 +352,31 @@
       $proteinId{$acc} = $protein;
     }
   }
   close ($fh);
   return (\%descriptionData, \%statusData, \%proteinId);
 } # parseRaFile
 
 sub printOutput($$$$) {
   # Output tab-separated attributes
   my ($outColumns, $descriptionData, $statusData, $proteinId)  = @_;
   foreach my $id (sort keys %{$outColumns}) {
     my $idDataPtr = $outColumns->{$id};
     my $missingData = "";
     printf "%s", $id;    # first column
     # second column is the gene reviewed 'status'
-    if (exists($statusData->{$id})) {
+    if (exists($statusData->{$id}) && defined $statusData->{$id}) {
       printf "\t%s", $statusData->{$id};
     } else { printf "\tUnknown"; }
     # Make sure we found the corresponding protein for each coding transcript.
     # GFF *almost* always has it (except in rare cases when only non-coding exon(s) of
     # a coding transcript can be mapped to the assembly, so no CDS records) -- use
     # the protein ID from raFile as a backup.
     if ($id =~ m/^[NXY][MP]/ && ! $idDataPtr->{protein_id}) {
       if ($proteinId->{$id}) {
         $idDataPtr->{protein_id} = $proteinId->{$id};
       } else {
         die "Missing protein_id for coding transcript $id\n";
       }
     }
     # the rest go in order according to tabOrderOutput
     foreach my $tag ( @tabOrderOutput ) {
@@ -388,30 +397,33 @@
       if (exists $idDataPtr->{$extDb} && $idDataPtr->{$extDb}) {
         print "\t" . $idDataPtr->{$extDb};
         $gotExt = 1;
         last;
       }
     }
     print "\t$missingData" if (! $gotExt);
     print "\n";
   }
 } # printOutput
 
 #################################################################################
 # MAIN
 
 my $argc = scalar(@ARGV);
-if ($argc != 3) {
+if ($argc != 3 && $argc != 4) {
+  &usage;
+}
+my ($raFile, $gffFile, $labelFile, $continueOnParentErrors) = @ARGV;
+if (defined $continueOnParentErrors && $continueOnParentErrors ne "continueOnParentErrors") {
   &usage;
 }
-my ($raFile, $gffFile, $labelFile) = @ARGV;
 
 printf STDERR "# raFile: %s\n", $raFile;
 printf STDERR "# gffFile: %s\n", $gffFile;
 printf STDERR "# labelFile: %s\n", $labelFile;
 
-my ($gff, $topLevelIds) = parseGff3($gffFile);
+my ($gff, $topLevelIds) = parseGff3($gffFile, $continueOnParentErrors);
 my ($outColumns) = collectColumns($gff, $topLevelIds);
 my ($descriptionData, $statusData, $proteinId) = parseRaFile($labelFile, $raFile);
 
 printOutput($outColumns, $descriptionData, $statusData, $proteinId);