e7dfb28b932077142b9e5db6f43a21ddbd32e7fa
hiram
  Thu Mar 6 09:36:52 2025 -0800
function to update column 18 of bigGenePred to be GeneID for link to NCBI refs #34704

diff --git src/hg/utils/automation/updateName2.pl src/hg/utils/automation/updateName2.pl
new file mode 100755
index 00000000000..bc0a8188702
--- /dev/null
+++ src/hg/utils/automation/updateName2.pl
@@ -0,0 +1,131 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+my $argc = scalar(@ARGV);
+if ($argc != 2) {
+  printf STDERR "usage: updateName2.pl <file.attrs.txt> <file.gp>\n";
+  printf STDERR "will read the file.gp converting it to a bigGenePred\n";
+  printf STDERR "with column 18 of the bigGenePred outputting the\n";
+  printf STDERR "GeneID identifier when found in the file.attrs.txt\n";
+  printf STDERR "output of the revised bigBigGenePred will be to stdout.\n";
+  printf STDERR "This function is used in the doNcbiRefSeq.pl script.\n";
+  exit 255;
+}
+my $attrs = shift;
+my $gpFile = shift;
+
+my %dupList;	# key is gene name, placed on this list when duplicate IDs found
+		# for the same named gene, avoid these.
+my %geneId;	# key is gene name from colunn 1, value is GeneID number
+
+open (my $fh, "-|", "grep GeneID $attrs") or die "can not grep $attrs";
+while (my $line = <$fh>) {
+  chomp $line;
+  my @a = split(/\s+/, $line);
+  my $geneName = $a[0];
+  next if (defined($dupList{$geneName}));
+  my $geneId = $a[-1];
+  my @b = split(/,/, $geneId);
+  # could be csv list of gene IDs here, looking for the 'GeneID'
+  for my $maybeGeneId (@b) {
+     if ($maybeGeneId =~ m/^GeneID/i) {
+         $maybeGeneId =~ s/GeneID://i;
+         if (defined($geneId{$geneName})) {
+             if ($geneId{$geneName} ne $maybeGeneId) {
+                $dupList{$geneName} = "$geneId{$geneName},${maybeGeneId}";
+                delete $geneId{$geneName};
+                next;
+             }
+         } else {
+             $geneId{$geneName} = $maybeGeneId;
+         }
+         last;
+     }
+  }
+}
+close ($fh);
+
+my $updatedNames = 0;
+my $totalItems = 0;
+open ($fh, "-|", "genePredToBigGenePred $gpFile stdout") or die "can not read $gpFile";
+while (my $line = <$fh>) {
+  chomp $line;
+  ++$totalItems;
+  # the -1 keeps the trailing empty field at the end of the line
+  my @a = split(/\t/, $line, -1);
+  # if name is equal to name2 see if name2 can be improved
+  if ($a[3] eq $a[17]) {
+     if (defined($geneId{$a[3]})) {
+        $a[17] = $geneId{$a[3]};
+        ++$updatedNames;
+     }
+  }
+  printf "%s\n", join("\t", @a);
+}
+close ($fh);
+printf STDERR "### updated %d items of total %d - %s\n", $updatedNames, $totalItems, $gpFile;
+
+__END__
+
+00      NW_027257890.1
+01      99779
+02      105382
+03      XM_071090490.1
+04      0
+05      +
+06      99779
+07      105239
+08      0
+09      2
+10      58,349,
+11      0,5254,
+12      LOC139361723
+13      cmpl
+14      cmpl
+15      0,1,
+16
+17      XM_071090490.1
+18      LOC139361723
+19
+
+bigGenePredFields:
+
+     1     string chrom;       "Reference sequence chromosome or scaffold"
+     2     uint   chromStart;  "Start position in chromosome"
+     3     uint   chromEnd;    "End position in chromosome"
+     4     string name;        "Name or ID of item, ideally both human readable and unique"
+     5     uint score;         "Score (0-1000)"
+     6     char[1] strand;     "+ or - for strand"
+     7     uint thickStart;    "Start of where display should be thick (start codon)"
+     8     uint thickEnd;      "End of where display should be thick (stop codon)"
+     9     uint reserved;       "RGB value (use R,G,B string in input file)"
+    10     int blockCount;     "Number of blocks"
+    11     int[blockCount] blockSizes; "Comma separated list of block sizes"
+    12     int[blockCount] chromStarts; "Start positions relative to chromStart"
+    13     string name2;       "Alternative/human readable name"
+    14     string cdsStartStat; "Status of CDS start annotation (none, unknown, incomplete, or complete)"
+    15     string cdsEndStat;   "Status of CDS end annotation (none, unknown, incomplete, or complete)"
+    16     int[blockCount] exonFrames; "Reading frame of the start of the CDS region of the exon, in the direction of transcription (0,1,2), or -1 if there is no CDS region."
+    17     string type;        "Transcript type"
+    18     string geneName;    "Primary identifier for gene"
+    19     string geneName2;   "Alternative/human readable gene name"
+    20     string geneType;    "Gene type"
+
+
+grep LOC139361723 GCF_043159975.1_mMacNem.hap1.attrs.txt
+gene-LOC139361723       ID      gene-LOC139361723
+gene-LOC139361723       Dbxref  GeneID:139361723
+gene-LOC139361723       Name    LOC139361723
+gene-LOC139361723       description     beta-defensin 109
+gene-LOC139361723       gbkey   Gene
+gene-LOC139361723       gene    LOC139361723
+gene-LOC139361723       gene_biotype    protein_coding
+gene-LOC139361723       Seqid   NW_027257890.1
+gene-LOC139361723       Source  Gnomon
+gene-LOC139361723       Type    gene
+gene-LOC139361723       Start   99779
+gene-LOC139361723       End     105382
+gene-LOC139361723       Strand  +
+XM_071090490.1  Parent  gene-LOC139361723