62ccb95d5c71a3bf7658327e6db2bc35c19d1d71 chmalee Wed Oct 11 09:57:36 2023 -0700 Adding back hgvs search support for historical refseq transcripts and historical refSeq transcripts track. Should be more resilient code this time. diff --git src/hg/utils/automation/gff3ToRefLink.pl src/hg/utils/automation/gff3ToRefLink.pl index 2983d06..9e703df 100755 --- src/hg/utils/automation/gff3ToRefLink.pl +++ src/hg/utils/automation/gff3ToRefLink.pl @@ -1,25 +1,26 @@ #!/usr/bin/env perl use strict; use warnings; sub usage { - printf STDERR "usage: gff3ToRefLink.pl [raFile.txt] [ncbi.gff.gz] [pragmaLabels.txt] > refLink.tab\n"; + printf STDERR "usage: gff3ToRefLink.pl [raFile.txt] [ncbi.gff.gz] [pragmaLabels.txt] [continueOnParentErrors] > refLink.tab\n"; printf STDERR "- raFile.txt is the output of gbProcess \${asmId}_rna.gbff.gz\n"; printf STDERR "- ncbi.gff.gz is \${asmId}_genomic.gff.gz\n"; printf STDERR "- pragmaLabels.txt has words that appear before '::' within ##*-START/##*-END\n"; + printf STDERR "- continueOnParentErrors is a flag to tell the script to continue when a child is defined before it's parent\n"; printf STDERR " text in \${asmId}_rna.gbff.gz, one label per line.\n"; printf STDERR "output to stdout is the tab separated ncbiRefSeqLink data for each item\n"; printf STDERR "This is intended to be called by doNcbiRefSeq.pl.\n"; exit 255; } # usage # this order list of items will keep the tab output consistent so all # 'columns' of the data in the tab output will be the same order always. # there are two columns before these, the id (transcript_id) and the gene reviewed 'status', # and two columns after these items, the 'description' string from gbProcess # and the externalId for external URL outlinks my @tabOrderOutput = ( 'gene', 'product', 'transcript_id', 'protein_id', 'geneid', 'mim', 'hgnc', 'genbank', 'pseudo', 'gbkey', 'source', 'gene_biotype', 'gene_synonym', 'ncrna_class', 'note' ); @@ -54,75 +55,83 @@ sub mustOpen ($) { # Open possibly gzipped file or "stdin" for reading, return filehandle or die with error. my ($fileName) = @_; my $fh; if ($fileName =~ m/.gz$/) { open ($fh, "zcat $fileName |") or die "can not read $fileName: $!"; } elsif ($fileName =~ m/^stdin$/) { open ($fh, "</dev/stdin") or die "can not read $fileName: $!"; } else { open ($fh, "<$fileName") or die "can not read $fileName: $!"; } return $fh; } # mustOpen -sub parseGff3($) { +sub parseGff3($;$) { # Parse GFF3 into a tree structure using parent pointers. Store only tags of interest. # Children store only tags that are not present or differ from the parent's tags. - my ($gffFile) = @_; + my ($gffFile, $continueOnParentErrors) = @_; my %gff; my @topLevelIds; my $fh = mustOpen($gffFile); while (my $line = <$fh>) { chomp $line; next if ($line =~ m/^#/); my @a = split('\t', $line); my ($source, $type, $attributes) = ($a[1], $a[2], $a[8]); next if ($type eq 'region' || $type eq 'cDNA_match' || $type eq 'match' || $type eq 'sequence_feature'); my %tags; foreach my $tagEqVal (split(';', $attributes)) { if ($tagEqVal =~ m/^(\w+)=(.*)/) { my ($tag, $val) = (lc($1), $2); if ($tagsOfInterest{$tag}) { $tags{$tag} = $val; } + } elsif ($tagEqVal eq "") { + # NCBI historical refseq transcripts can have multiple ';;' in a row + # for some reason, so skip those empty tags + next; } else { die "parse error: expecting tag=val, got '$tagEqVal'"; } } die "ID not found" if (! exists($tags{id})); my $id = $tags{id}; if (exists($gff{$id})) { # So weird that NCBI's GFF has unique IDs for exons but not CDS. # And non-uniq IDs for a few sequence_feature, even primary_transcript mir-8199 in ce11... warn "Encountered a second record with ID=$id" unless ($id =~ /^cds/); next; } $gff{$id} = \%tags; # Add source and type columns as pseudo-tags -- make sure there aren't actual tags w/those names. die "Tag name conflict for 'type'." if (exists($tags{type})); $tags{type} = $type; die "Tag name conflict for 'source'." if (exists($tags{source})); $tags{source} = $source; # If this has a parent, hook up parent to this child; otherwise, this is a top-level element. if (exists($tags{parent})) { my $parentId = $tags{parent}; # We're counting on NCBI's GFF to define parents before children. if (! exists($gff{$parentId})) { + if (not defined $continueOnParentErrors) { die "ID $id: Parent ID $parentId has not already been defined in GFF."; + } else { + push @topLevelIds, $id; + } } push @{$gff{$parentId}->{children}}, $id; } else { # No parent -- this is top level. push @topLevelIds, $id; } } close($fh); print STDERR "Processed " . scalar(keys %gff) . " elements.\n"; print STDERR "Found " . scalar(@topLevelIds) . " top level IDs.\n"; # print STDERR join(", ", @topLevelIds) . "\n"; return \%gff, \@topLevelIds; } # parseGff3 sub parseDbXref($) { @@ -343,31 +352,31 @@ $proteinId{$acc} = $protein; } } close ($fh); return (\%descriptionData, \%statusData, \%proteinId); } # parseRaFile sub printOutput($$$$) { # Output tab-separated attributes my ($outColumns, $descriptionData, $statusData, $proteinId) = @_; foreach my $id (sort keys %{$outColumns}) { my $idDataPtr = $outColumns->{$id}; my $missingData = ""; printf "%s", $id; # first column # second column is the gene reviewed 'status' - if (exists($statusData->{$id})) { + if (exists($statusData->{$id}) && defined $statusData->{$id}) { printf "\t%s", $statusData->{$id}; } else { printf "\tUnknown"; } # Make sure we found the corresponding protein for each coding transcript. # GFF *almost* always has it (except in rare cases when only non-coding exon(s) of # a coding transcript can be mapped to the assembly, so no CDS records) -- use # the protein ID from raFile as a backup. if ($id =~ m/^[NXY][MP]/ && ! $idDataPtr->{protein_id}) { if ($proteinId->{$id}) { $idDataPtr->{protein_id} = $proteinId->{$id}; } else { die "Missing protein_id for coding transcript $id\n"; } } # the rest go in order according to tabOrderOutput foreach my $tag ( @tabOrderOutput ) { @@ -388,30 +397,33 @@ if (exists $idDataPtr->{$extDb} && $idDataPtr->{$extDb}) { print "\t" . $idDataPtr->{$extDb}; $gotExt = 1; last; } } print "\t$missingData" if (! $gotExt); print "\n"; } } # printOutput ################################################################################# # MAIN my $argc = scalar(@ARGV); -if ($argc != 3) { +if ($argc != 3 && $argc != 4) { + &usage; +} +my ($raFile, $gffFile, $labelFile, $continueOnParentErrors) = @ARGV; +if (defined $continueOnParentErrors && $continueOnParentErrors ne "continueOnParentErrors") { &usage; } -my ($raFile, $gffFile, $labelFile) = @ARGV; printf STDERR "# raFile: %s\n", $raFile; printf STDERR "# gffFile: %s\n", $gffFile; printf STDERR "# labelFile: %s\n", $labelFile; -my ($gff, $topLevelIds) = parseGff3($gffFile); +my ($gff, $topLevelIds) = parseGff3($gffFile, $continueOnParentErrors); my ($outColumns) = collectColumns($gff, $topLevelIds); my ($descriptionData, $statusData, $proteinId) = parseRaFile($labelFile, $raFile); printOutput($outColumns, $descriptionData, $statusData, $proteinId);