0f1c9a391299747bdac544ce016ddbf291ca75c7
hiram
  Wed Mar 12 13:38:03 2025 -0700
adding parent relation and description column refs #32704

diff --git src/hg/utils/automation/updateName2.pl src/hg/utils/automation/updateName2.pl
index aa4d9cb3cf2..12532bbe5ff 100755
--- src/hg/utils/automation/updateName2.pl
+++ src/hg/utils/automation/updateName2.pl
@@ -1,83 +1,162 @@
 #!/usr/bin/env perl
 
 use strict;
 use warnings;
 
 my $argc = scalar(@ARGV);
 if ($argc != 2) {
   printf STDERR "usage: updateName2.pl <file.attrs.txt> <file.gp>\n";
-  printf STDERR "will read the file.gp converting it to a bigGenePred\n";
+  printf STDERR "Will read the file.gp converting it to a bigGenePred\n";
   printf STDERR "    with column 18 of the bigGenePred outputting the\n";
   printf STDERR "    GeneID identifier when found in the file.attrs.txt\n";
-  printf STDERR "output of the revised bigBigGenePred will be to stdout.\n";
+  printf STDERR "Also fills in 'type' and 'geneType' columns when available. \n";
+  printf STDERR "Output of the revised bigBigGenePred will be to stdout.\n";
   printf STDERR "This function is used in the doNcbiRefSeq.pl script.\n";
   exit 255;
 }
 my $attrs = shift;
 my $gpFile = shift;
 
+my %parent;	# key is gene name, value is parent name
 my %dupList;	# key is gene name, placed on this list when duplicate IDs found
 		# for the same named gene, avoid these.
 my %geneId;	# key is gene name from colunn 1, value is GeneID number
+my %bioType;	# key is gene name from column 1, value is
+		#    the string associated with gene_biotype when found
+my %Type;	# key is gene name from column 1, value is
+		#    the string associated with Type when found
+my %descr;	# key is gene name, value is description string
 
-open (my $fh, "-|", "grep GeneID $attrs") or die "can not grep $attrs";
+open (my $fh, "-|", "egrep -w 'Parent|GeneID|Type|gene_biotype|description' $attrs") or die "can not grep $attrs";
 while (my $line = <$fh>) {
   chomp $line;
-  my @a = split(/\s+/, $line);
+  my @a = split(/\s+/, $line, 3);
   my $geneName = $a[0];
   next if (defined($dupList{$geneName}));
+  if ($a[1] =~ /Dbxref/i) {
       my $geneId = $a[-1];
       my @b = split(/,/, $geneId);
   # could be csv list of gene IDs here, looking for the 'GeneID'
       for my $maybeGeneId (@b) {
          if ($maybeGeneId =~ m/^GeneID/i) {
              $maybeGeneId =~ s/GeneID://i;
              if (defined($geneId{$geneName})) {
                  if ($geneId{$geneName} ne $maybeGeneId) {
                     $dupList{$geneName} = "$geneId{$geneName},${maybeGeneId}";
                     delete $geneId{$geneName};
                     next;
                  }
              } else {
                  $geneId{$geneName} = $maybeGeneId;
              }
              last;
          }
       }
+  } elsif ($a[1] =~ /description/i) {
+      if (defined($descr{$geneName})) {
+         if ($a[2] ne $descr{$geneName}) {
+            $dupList{$geneName} = "$geneName,$descr{$geneName}.$a[2]";
+            delete $descr{$geneName};
+         }
+      } else {
+         $descr{$geneName} = $a[2];
+      }
+  } elsif ($a[1] =~ /Parent/i) {
+      if (defined($parent{$geneName})) {
+         if ($a[2] ne $parent{$geneName}) {
+            $dupList{$geneName} = "$geneName,$parent{$geneName}.$a[2]";
+            delete $parent{$geneName};
+         }
+      } else {
+         $parent{$geneName} = $a[2];
+      }
+
+  } elsif ($a[1] =~ /gene_biotype/i) {
+      if (defined($bioType{$geneName})) {
+         if ($a[2] ne $bioType{$geneName}) {
+            $dupList{$geneName} = "$geneName,$bioType{$geneName}.$a[2]";
+            delete $bioType{$geneName};
+         }
+      } else {
+         $bioType{$geneName} = $a[2];
+      }
+  } elsif ($a[1] =~ /^Type$/i) {
+      if (defined($Type{$geneName})) {
+         if ($a[2] ne $Type{$geneName}) {
+            $dupList{$geneName} = "$geneName,$Type{$geneName}.$a[2]";
+            delete $Type{$geneName};
+         }
+      } else {
+         $Type{$geneName} = $a[2];
+      }
+  }
 }
 close ($fh);
 
 my $updatedNames = 0;
 my $totalItems = 0;
 open ($fh, "-|", "genePredToBigGenePred $gpFile stdout") or die "can not read $gpFile";
 while (my $line = <$fh>) {
   chomp $line;
   ++$totalItems;
   # the -1 keeps the trailing empty field at the end of the line
   my @a = split(/\t/, $line, -1);
+  my $sizeA = scalar(@a);
   # if name is equal to geneName see if geneName can be improved
   if ($a[3] eq $a[17]) {
      if (defined($geneId{$a[3]})) {
         $a[17] = $geneId{$a[3]};
         ++$updatedNames;
         # if name2 is equal to geneName2 reproduce name in geneName2
         if ($a[12] eq $a[18]) {
              $a[18] = $a[3];
         }
      }
   }
+  #  "Transcript type"
+  if (length($a[16]) < 1) {
+     if (defined($bioType{$a[3]})) {
+         $a[16] = $bioType{$a[3]};
+     } elsif (defined($parent{$a[3]})) {
+        my $parent = $parent{$a[3]};
+        if (defined($bioType{$parent})) {
+           $a[16] = $bioType{$parent};
+        }
+     }
+  }
+  #  "Gene type"
+  if (length($a[19]) < 1) {
+     if (defined($Type{$a[3]})) {
+         $a[19] = $Type{$a[3]};
+     } elsif (defined($parent{$a[3]})) {
+        my $parent = $parent{$a[3]};
+        if (defined($Type{$parent})) {
+           $a[19] = $Type{$parent};
+        }
+     }
+  }
+  if (defined($descr{$a[3]})) {
+     $a[$sizeA] = $descr{$a[3]};
+  } elsif (defined($parent{$a[3]})) {
+     my $parent = $parent{$a[3]};
+     if (defined($descr{$parent})) {
+        $a[$sizeA] = $descr{$parent};
+     }
+  } else {
+     $a[$sizeA] = "";
+  }
   printf "%s\n", join("\t", @a);
 }
 close ($fh);
 printf STDERR "### updated %d items of total %d - %s\n", $updatedNames, $totalItems, $gpFile;
 
 __END__
 
 00      NW_027257890.1
 01      99779
 02      105382
 03      XM_071090490.1
 04      0
 05      +
 06      99779
 07      105239