776952d3d5912ea1400d9213f3d0dfef4b10f4f4
hiram
  Mon Aug 18 16:02:54 2025 -0700
initial version to identify and load up new entries into liftOverChain from the hasChainNets.txt file refs #35575

diff --git src/hg/makeDb/doc/asmHubs/updateLiftOverChain.pl src/hg/makeDb/doc/asmHubs/updateLiftOverChain.pl
new file mode 100755
index 00000000000..4b7a6b6ae3e
--- /dev/null
+++ src/hg/makeDb/doc/asmHubs/updateLiftOverChain.pl
@@ -0,0 +1,182 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+my $argc = scalar(@ARGV);
+
+if ($argc != 1) {
+  printf STDERR "usage: updateLiftOverChain.pl <hasChainNets.txt>\n";
+  exit 255;
+}
+
+my $hasChainNets = shift;
+
+my %validDb;	# key is db or GCx name, value is 1
+my $genArkCount = 0;
+
+my $genArkList = "https://hgdownload.soe.ucsc.edu/hubs/UCSC_GI.assemblyHubList.txt";
+open (my $fh, "-|", "curl $genArkList 2> /dev/null") or die "can not curl $genArkList";
+while (my $line = <$fh>) {
+  next if ($line =~ m/^#/);
+  my @a = split('\t', $line);
+  $validDb{$a[0]} = 1;
+  ++$genArkCount;
+}
+close ($fh);
+
+my $rrActiveCount = 0;
+
+open ($fh, "-|", "hgsql -h genome-centdb -N -e 'select name from dbDb where active=1' hgcentral") or die "can not select from hgcentral.liftOverChain";
+while (my $db = <$fh>) {
+  chomp $db;
+  $validDb{$db} = 1;
+  ++$rrActiveCount;
+}
+close ($fh);
+
+printf STDERR "## %d GenArk hubs, %d RR active db\n", $genArkCount, $rrActiveCount;
+
+my %liftOverFiles;	# key is "fromDb|toDb" value is path name
+my $liftOverCount = 0;
+my $invalidCount = 0;
+
+open ($fh, "-|", "zcat /hive/data/inside/GenArk/liftOver/hgwdev.liftOverList.txt.gz") or die "can not zcat hgwdev.liftOverList.txt.gz";
+while (my $line = <$fh>) {
+  chomp $line;
+  my ($md5, $filePath) = split('\t', $line);
+  my @a = split('/', $filePath);
+  my $loChain = $a[-1];
+  $loChain =~ s/.over.chain.gz//;
+  my @b = split('To', $loChain);
+  if (defined($b[0]) && defined($b[1])) {
+    my $fromDb = $b[0];
+    my $toDb = $b[1];
+    if ($toDb !~ m/^GC/) {
+       $toDb = lcfirst($toDb);
+    }
+    if ( defined($validDb{$toDb}) && defined($validDb{$fromDb}) ) {
+      my $key = sprintf("%s|%s", $fromDb, $toDb);
+      printf STDERR "# %s\n", $key if ($liftOverCount < 10);
+      $liftOverFiles{$key} = $filePath;
+      ++$liftOverCount;
+    } else {
+      printf STDERR "# invalid Db: %s or %s\n", $fromDb, $toDb if ($invalidCount < 10);
+      ++$invalidCount;
+    }
+  } else {
+      printf STDERR "# no db name foundin:\n%s\n"< $filePath if ($invalidCount < 10);
+      ++$invalidCount;
+#   printf STDERR "warning: no db name found in liftOver file name:\n%s\n", $filePath;
+  }
+}
+close ($fh);
+
+open ($fh, "-|", "zgrep 'over.chain.gz' /hive/data/inside/GenArk/liftOver/gbdb.genark.liftOver.file.list") or die "can not read gbdb/genark.liftOver.file.list";
+while (my $filePath = <$fh>) {
+  chomp $filePath;
+  my @a = split('/', $filePath);
+  my $loChain = $a[-1];
+  $loChain =~ s/.over.chain.gz//;
+  my @b = split('To', $loChain);
+  if (defined($b[0]) && defined($b[1])) {
+    my $fromDb = $b[0];
+    my $toDb = $b[1];
+    if ($toDb !~ m/^GC/) {
+       $toDb = lcfirst($toDb);
+    }
+    if ( defined($validDb{$toDb}) && defined($validDb{$fromDb}) ) {
+      my $key = sprintf("%s|%s", $fromDb, $toDb);
+      printf STDERR "# %s\n", $key if ($liftOverCount < 10);
+      $liftOverFiles{$key} = $filePath;
+      ++$liftOverCount;
+    } else {
+      printf STDERR "# invalid Db: %s or %s\n", $fromDb, $toDb if ($invalidCount < 10);
+      ++$invalidCount;
+    }
+  } else {
+      printf STDERR "# no db name foundin:\n%s\n"< $filePath if ($invalidCount < 10);
+      ++$invalidCount;
+#   printf STDERR "warning: no db name found in liftOver file name:\n%s\n", $filePath;
+  }
+}
+close ($fh);
+
+printf STDERR "# %d liftOverFiles, %d invalid files\n", $liftOverCount, $invalidCount;
+
+my %fromToToday;	# from the liftOverChain table, key is fromDb|toDb
+			# value is the file path
+my $todayLiftCount = 0;
+my $pathMissing = 0;
+my $invalidToFrom = 0;
+
+open ($fh, "-|", "hgsql -N -e 'select fromDb,toDb,path from liftOverChain;' hgcentraltest") or die "can not select from hgcentraltest.liftOverChain";
+while (my $line = <$fh>) {
+  chomp $line;
+  my ($fromDb, $toDb, $path) = split('\t', $line);
+  my $key = sprintf("%s|%s", $fromDb, $toDb);
+  if ( -s "${path}" ) {
+    if ( defined($validDb{$toDb}) && defined($validDb{$fromDb}) ) {
+      $fromToToday{$key} = $path;
+      ++$todayLiftCount;
+    } else {
+      ++$invalidToFrom;
+    }
+  } else {
+    ++$pathMissing;
+  }
+}
+close ($fh);
+
+printf STDERR "# liftOverCount table: %d valid, %d path missing, %d invalidToFrom\n", $todayLiftCount, $pathMissing, $invalidToFrom;
+
+my $toAddCount = 0;
+my $missingPath = 0;
+
+printf STDERR "# reading: '%s'\n", $hasChainNets;
+my $asmId = "";
+open ($fh, "<", $hasChainNets) or die "can not read $hasChainNets";
+while (my $line = <$fh>) {
+  chomp $line;
+  if ($line =~ m/^\t/) {
+     my $overChain = $line;
+     $overChain =~ s/^\t//;
+     $overChain =~ s/.over.chain.gz//;
+     my @b = split('To', $overChain);
+     if (defined($b[0]) && defined($b[1])) {
+       my $fromDb = $b[0];
+       my $toDb = $b[1];
+       if ($toDb !~ m/^GC/) {
+          $toDb = lcfirst($toDb);
+       }
+       if ( defined($validDb{$toDb}) && defined($validDb{$fromDb}) ) {
+          my $key = sprintf("%s|%s", $fromDb, $toDb);
+          if (defined($liftOverFiles{$key})) {
+            printf STDERR "# to add: %s\n", $key if ($toAddCount < 10);
+            printf "%s\t%s\t%s\t0.1\t0\t0\tY\t1\tN\n", $fromDb, $toDb, $liftOverFiles{$key};
+            ++$toAddCount;
+          } else {
+            printf STDERR "# missing path: %s\n", $key if ($missingPath < 10);
+            ++$missingPath;
+          }
+       }
+     }
+  } else {
+     $asmId = $line;
+  }
+}
+close ($fh);
+printf STDERR "# %d entries to add to table, %d missing path file\n", $toAddCount, $missingPath;
+
+# head hasChainNets.txt  | cat -A
+# GCA_028885625.2_NHGRI_mPonPyg2-v2.0_pri^I10$
+# ^IGCA_028885625.2ToGCA_028858775.2.over.chain.gz$
+# ^IGCA_028885625.2ToGCA_028878055.2.over.chain.gz$
+# ^IGCA_028885625.2ToGCA_028885655.2.over.chain.gz$
+# ^IGCA_028885625.2ToGCA_029281585.2.over.chain.gz$
+# ^IGCA_028885625.2ToGCA_029289425.2.over.chain.gz$
+# ^IGCA_028885625.2ToGCF_011100555.1.over.chain.gz$
+# ^IGCA_028885625.2ToGCF_020740605.2.over.chain.gz$
+# ^IGCA_028885625.2ToGCF_027406575.1.over.chain.gz$
+# ^IGCA_028885625.2ToHg38.over.chain.gz$
+