src/hg/utils/automation/updateName2.pl b4924336d330068ac4c39222f6cd730d33f486a1

b4924336d330068ac4c39222f6cd730d33f486a1
hiram
  Thu Mar 6 15:35:53 2025 -0800
can also fixup geneName2 when transition takes place refs #32704

diff --git src/hg/utils/automation/updateName2.pl src/hg/utils/automation/updateName2.pl
index bc0a8188702..aa4d9cb3cf2 100755
--- src/hg/utils/automation/updateName2.pl
+++ src/hg/utils/automation/updateName2.pl
@@ -1,131 +1,135 @@
 #!/usr/bin/env perl
 
 use strict;
 use warnings;
 
 my $argc = scalar(@ARGV);
 if ($argc != 2) {
   printf STDERR "usage: updateName2.pl <file.attrs.txt> <file.gp>\n";
   printf STDERR "will read the file.gp converting it to a bigGenePred\n";
   printf STDERR "with column 18 of the bigGenePred outputting the\n";
   printf STDERR "GeneID identifier when found in the file.attrs.txt\n";
   printf STDERR "output of the revised bigBigGenePred will be to stdout.\n";
   printf STDERR "This function is used in the doNcbiRefSeq.pl script.\n";
   exit 255;
 }
 my $attrs = shift;
 my $gpFile = shift;
 
 my %dupList;	# key is gene name, placed on this list when duplicate IDs found
 		# for the same named gene, avoid these.
 my %geneId;	# key is gene name from colunn 1, value is GeneID number
 
 open (my $fh, "-|", "grep GeneID $attrs") or die "can not grep $attrs";
 while (my $line = <$fh>) {
   chomp $line;
   my @a = split(/\s+/, $line);
   my $geneName = $a[0];
   next if (defined($dupList{$geneName}));
   my $geneId = $a[-1];
   my @b = split(/,/, $geneId);
   # could be csv list of gene IDs here, looking for the 'GeneID'
   for my $maybeGeneId (@b) {
      if ($maybeGeneId =~ m/^GeneID/i) {
          $maybeGeneId =~ s/GeneID://i;
          if (defined($geneId{$geneName})) {
              if ($geneId{$geneName} ne $maybeGeneId) {
                 $dupList{$geneName} = "$geneId{$geneName},${maybeGeneId}";
                 delete $geneId{$geneName};
                 next;
              }
          } else {
              $geneId{$geneName} = $maybeGeneId;
          }
          last;
      }
   }
 }
 close ($fh);
 
 my $updatedNames = 0;
 my $totalItems = 0;
 open ($fh, "-|", "genePredToBigGenePred $gpFile stdout") or die "can not read $gpFile";
 while (my $line = <$fh>) {
   chomp $line;
   ++$totalItems;
   # the -1 keeps the trailing empty field at the end of the line
   my @a = split(/\t/, $line, -1);
-  # if name is equal to name2 see if name2 can be improved
+  # if name is equal to geneName see if geneName can be improved
   if ($a[3] eq $a[17]) {
      if (defined($geneId{$a[3]})) {
         $a[17] = $geneId{$a[3]};
         ++$updatedNames;
+        # if name2 is equal to geneName2 reproduce name in geneName2
+        if ($a[12] eq $a[18]) {
+             $a[18] = $a[3];
+        }
      }
   }
   printf "%s\n", join("\t", @a);
 }
 close ($fh);
 printf STDERR "### updated %d items of total %d - %s\n", $updatedNames, $totalItems, $gpFile;
 
 __END__
 
 00      NW_027257890.1
 01      99779
 02      105382
 03      XM_071090490.1
 04      0
 05      +
 06      99779
 07      105239
 08      0
 09      2
 10      58,349,
 11      0,5254,
 12      LOC139361723
 13      cmpl
 14      cmpl
 15      0,1,
 16
 17      XM_071090490.1
 18      LOC139361723
 19
 
 bigGenePredFields:
 
      1     string chrom;       "Reference sequence chromosome or scaffold"
      2     uint   chromStart;  "Start position in chromosome"
      3     uint   chromEnd;    "End position in chromosome"
      4     string name;        "Name or ID of item, ideally both human readable and unique"
      5     uint score;         "Score (0-1000)"
      6     char[1] strand;     "+ or - for strand"
      7     uint thickStart;    "Start of where display should be thick (start codon)"
      8     uint thickEnd;      "End of where display should be thick (stop codon)"
      9     uint reserved;       "RGB value (use R,G,B string in input file)"
     10     int blockCount;     "Number of blocks"
     11     int[blockCount] blockSizes; "Comma separated list of block sizes"
     12     int[blockCount] chromStarts; "Start positions relative to chromStart"
     13     string name2;       "Alternative/human readable name"
     14     string cdsStartStat; "Status of CDS start annotation (none, unknown, incomplete, or complete)"
     15     string cdsEndStat;   "Status of CDS end annotation (none, unknown, incomplete, or complete)"
     16     int[blockCount] exonFrames; "Reading frame of the start of the CDS region of the exon, in the direction of transcription (0,1,2), or -1 if there is no CDS region."
     17     string type;        "Transcript type"
     18     string geneName;    "Primary identifier for gene"
     19     string geneName2;   "Alternative/human readable gene name"
     20     string geneType;    "Gene type"
 
 
 grep LOC139361723 GCF_043159975.1_mMacNem.hap1.attrs.txt
 gene-LOC139361723       ID      gene-LOC139361723
 gene-LOC139361723       Dbxref  GeneID:139361723
 gene-LOC139361723       Name    LOC139361723
 gene-LOC139361723       description     beta-defensin 109
 gene-LOC139361723       gbkey   Gene
 gene-LOC139361723       gene    LOC139361723
 gene-LOC139361723       gene_biotype    protein_coding
 gene-LOC139361723       Seqid   NW_027257890.1
 gene-LOC139361723       Source  Gnomon
 gene-LOC139361723       Type    gene
 gene-LOC139361723       Start   99779
 gene-LOC139361723       End     105382
 gene-LOC139361723       Strand  +
 XM_071090490.1  Parent  gene-LOC139361723