0f1c9a391299747bdac544ce016ddbf291ca75c7 hiram Wed Mar 12 13:38:03 2025 -0700 adding parent relation and description column refs #32704 diff --git src/hg/utils/automation/updateName2.pl src/hg/utils/automation/updateName2.pl index aa4d9cb3cf2..12532bbe5ff 100755 --- src/hg/utils/automation/updateName2.pl +++ src/hg/utils/automation/updateName2.pl @@ -1,83 +1,162 @@ #!/usr/bin/env perl use strict; use warnings; my $argc = scalar(@ARGV); if ($argc != 2) { printf STDERR "usage: updateName2.pl \n"; - printf STDERR "will read the file.gp converting it to a bigGenePred\n"; + printf STDERR "Will read the file.gp converting it to a bigGenePred\n"; printf STDERR " with column 18 of the bigGenePred outputting the\n"; printf STDERR " GeneID identifier when found in the file.attrs.txt\n"; - printf STDERR "output of the revised bigBigGenePred will be to stdout.\n"; + printf STDERR "Also fills in 'type' and 'geneType' columns when available. \n"; + printf STDERR "Output of the revised bigBigGenePred will be to stdout.\n"; printf STDERR "This function is used in the doNcbiRefSeq.pl script.\n"; exit 255; } my $attrs = shift; my $gpFile = shift; +my %parent; # key is gene name, value is parent name my %dupList; # key is gene name, placed on this list when duplicate IDs found # for the same named gene, avoid these. my %geneId; # key is gene name from colunn 1, value is GeneID number +my %bioType; # key is gene name from column 1, value is + # the string associated with gene_biotype when found +my %Type; # key is gene name from column 1, value is + # the string associated with Type when found +my %descr; # key is gene name, value is description string -open (my $fh, "-|", "grep GeneID $attrs") or die "can not grep $attrs"; +open (my $fh, "-|", "egrep -w 'Parent|GeneID|Type|gene_biotype|description' $attrs") or die "can not grep $attrs"; while (my $line = <$fh>) { chomp $line; - my @a = split(/\s+/, $line); + my @a = split(/\s+/, $line, 3); my $geneName = $a[0]; next if (defined($dupList{$geneName})); + if ($a[1] =~ /Dbxref/i) { my $geneId = $a[-1]; my @b = split(/,/, $geneId); # could be csv list of gene IDs here, looking for the 'GeneID' for my $maybeGeneId (@b) { if ($maybeGeneId =~ m/^GeneID/i) { $maybeGeneId =~ s/GeneID://i; if (defined($geneId{$geneName})) { if ($geneId{$geneName} ne $maybeGeneId) { $dupList{$geneName} = "$geneId{$geneName},${maybeGeneId}"; delete $geneId{$geneName}; next; } } else { $geneId{$geneName} = $maybeGeneId; } last; } } + } elsif ($a[1] =~ /description/i) { + if (defined($descr{$geneName})) { + if ($a[2] ne $descr{$geneName}) { + $dupList{$geneName} = "$geneName,$descr{$geneName}.$a[2]"; + delete $descr{$geneName}; + } + } else { + $descr{$geneName} = $a[2]; + } + } elsif ($a[1] =~ /Parent/i) { + if (defined($parent{$geneName})) { + if ($a[2] ne $parent{$geneName}) { + $dupList{$geneName} = "$geneName,$parent{$geneName}.$a[2]"; + delete $parent{$geneName}; + } + } else { + $parent{$geneName} = $a[2]; + } + + } elsif ($a[1] =~ /gene_biotype/i) { + if (defined($bioType{$geneName})) { + if ($a[2] ne $bioType{$geneName}) { + $dupList{$geneName} = "$geneName,$bioType{$geneName}.$a[2]"; + delete $bioType{$geneName}; + } + } else { + $bioType{$geneName} = $a[2]; + } + } elsif ($a[1] =~ /^Type$/i) { + if (defined($Type{$geneName})) { + if ($a[2] ne $Type{$geneName}) { + $dupList{$geneName} = "$geneName,$Type{$geneName}.$a[2]"; + delete $Type{$geneName}; + } + } else { + $Type{$geneName} = $a[2]; + } + } } close ($fh); my $updatedNames = 0; my $totalItems = 0; open ($fh, "-|", "genePredToBigGenePred $gpFile stdout") or die "can not read $gpFile"; while (my $line = <$fh>) { chomp $line; ++$totalItems; # the -1 keeps the trailing empty field at the end of the line my @a = split(/\t/, $line, -1); + my $sizeA = scalar(@a); # if name is equal to geneName see if geneName can be improved if ($a[3] eq $a[17]) { if (defined($geneId{$a[3]})) { $a[17] = $geneId{$a[3]}; ++$updatedNames; # if name2 is equal to geneName2 reproduce name in geneName2 if ($a[12] eq $a[18]) { $a[18] = $a[3]; } } } + # "Transcript type" + if (length($a[16]) < 1) { + if (defined($bioType{$a[3]})) { + $a[16] = $bioType{$a[3]}; + } elsif (defined($parent{$a[3]})) { + my $parent = $parent{$a[3]}; + if (defined($bioType{$parent})) { + $a[16] = $bioType{$parent}; + } + } + } + # "Gene type" + if (length($a[19]) < 1) { + if (defined($Type{$a[3]})) { + $a[19] = $Type{$a[3]}; + } elsif (defined($parent{$a[3]})) { + my $parent = $parent{$a[3]}; + if (defined($Type{$parent})) { + $a[19] = $Type{$parent}; + } + } + } + if (defined($descr{$a[3]})) { + $a[$sizeA] = $descr{$a[3]}; + } elsif (defined($parent{$a[3]})) { + my $parent = $parent{$a[3]}; + if (defined($descr{$parent})) { + $a[$sizeA] = $descr{$parent}; + } + } else { + $a[$sizeA] = ""; + } printf "%s\n", join("\t", @a); } close ($fh); printf STDERR "### updated %d items of total %d - %s\n", $updatedNames, $totalItems, $gpFile; __END__ 00 NW_027257890.1 01 99779 02 105382 03 XM_071090490.1 04 0 05 + 06 99779 07 105239