2c28295e6b1f47a483282b3790208679ba39a5fe
hiram
  Mon Aug 18 17:51:33 2025 -0700
checking for reciprocal both directions refs #35575

diff --git src/hg/makeDb/doc/asmHubs/updateLiftOverChain.pl src/hg/makeDb/doc/asmHubs/updateLiftOverChain.pl
index 4b7a6b6ae3e..065590c63ee 100755
--- src/hg/makeDb/doc/asmHubs/updateLiftOverChain.pl
+++ src/hg/makeDb/doc/asmHubs/updateLiftOverChain.pl
@@ -45,61 +45,61 @@
 while (my $line = <$fh>) {
   chomp $line;
   my ($md5, $filePath) = split('\t', $line);
   my @a = split('/', $filePath);
   my $loChain = $a[-1];
   $loChain =~ s/.over.chain.gz//;
   my @b = split('To', $loChain);
   if (defined($b[0]) && defined($b[1])) {
     my $fromDb = $b[0];
     my $toDb = $b[1];
     if ($toDb !~ m/^GC/) {
        $toDb = lcfirst($toDb);
     }
     if ( defined($validDb{$toDb}) && defined($validDb{$fromDb}) ) {
       my $key = sprintf("%s|%s", $fromDb, $toDb);
-      printf STDERR "# %s\n", $key if ($liftOverCount < 10);
+#     printf STDERR "# %s\n", $key if ($liftOverCount < 10);
       $liftOverFiles{$key} = $filePath;
       ++$liftOverCount;
     } else {
-      printf STDERR "# invalid Db: %s or %s\n", $fromDb, $toDb if ($invalidCount < 10);
+#     printf STDERR "# invalid Db: %s or %s\n", $fromDb, $toDb if ($invalidCount < 10);
       ++$invalidCount;
     }
   } else {
       printf STDERR "# no db name found in:\n%s\n"< $filePath if ($invalidCount < 10);
       ++$invalidCount;
 #   printf STDERR "warning: no db name found in liftOver file name:\n%s\n", $filePath;
   }
 }
 close ($fh);
 
 open ($fh, "-|", "zgrep 'over.chain.gz' /hive/data/inside/GenArk/liftOver/gbdb.genark.liftOver.file.list") or die "can not read gbdb/genark.liftOver.file.list";
 while (my $filePath = <$fh>) {
   chomp $filePath;
   my @a = split('/', $filePath);
   my $loChain = $a[-1];
   $loChain =~ s/.over.chain.gz//;
   my @b = split('To', $loChain);
   if (defined($b[0]) && defined($b[1])) {
     my $fromDb = $b[0];
     my $toDb = $b[1];
     if ($toDb !~ m/^GC/) {
        $toDb = lcfirst($toDb);
     }
     if ( defined($validDb{$toDb}) && defined($validDb{$fromDb}) ) {
       my $key = sprintf("%s|%s", $fromDb, $toDb);
-      printf STDERR "# %s\n", $key if ($liftOverCount < 10);
+#     printf STDERR "# %s\n", $key if ($liftOverCount < 10);
       $liftOverFiles{$key} = $filePath;
       ++$liftOverCount;
     } else {
       printf STDERR "# invalid Db: %s or %s\n", $fromDb, $toDb if ($invalidCount < 10);
       ++$invalidCount;
     }
   } else {
       printf STDERR "# no db name foundin:\n%s\n"< $filePath if ($invalidCount < 10);
       ++$invalidCount;
 #   printf STDERR "warning: no db name found in liftOver file name:\n%s\n", $filePath;
   }
 }
 close ($fh);
 
 printf STDERR "# %d liftOverFiles, %d invalid files\n", $liftOverCount, $invalidCount;
@@ -120,63 +120,93 @@
       $fromToToday{$key} = $path;
       ++$todayLiftCount;
     } else {
       ++$invalidToFrom;
     }
   } else {
     ++$pathMissing;
   }
 }
 close ($fh);
 
 printf STDERR "# liftOverCount table: %d valid, %d path missing, %d invalidToFrom\n", $todayLiftCount, $pathMissing, $invalidToFrom;
 
 my $toAddCount = 0;
 my $missingPath = 0;
+my $alreadyExisting = 0;
 
 printf STDERR "# reading: '%s'\n", $hasChainNets;
 my $asmId = "";
 open ($fh, "<", $hasChainNets) or die "can not read $hasChainNets";
 while (my $line = <$fh>) {
   chomp $line;
   if ($line =~ m/^\t/) {
      my $overChain = $line;
      $overChain =~ s/^\t//;
      $overChain =~ s/.over.chain.gz//;
      my @b = split('To', $overChain);
      if (defined($b[0]) && defined($b[1])) {
        my $fromDb = $b[0];
        my $toDb = $b[1];
        if ($toDb !~ m/^GC/) {
           $toDb = lcfirst($toDb);
        }
        if ( defined($validDb{$toDb}) && defined($validDb{$fromDb}) ) {
           my $key = sprintf("%s|%s", $fromDb, $toDb);
           if (defined($liftOverFiles{$key})) {
+            if (defined($fromToToday{$key})) {
+              ++$alreadyExisting;
+            } else {
               printf STDERR "# to add: %s\n", $key if ($toAddCount < 10);
               printf "%s\t%s\t%s\t0.1\t0\t0\tY\t1\tN\n", $fromDb, $toDb, $liftOverFiles{$key};
               ++$toAddCount;
+            }
           } else {
             printf STDERR "# missing path: %s\n", $key if ($missingPath < 10);
             ++$missingPath;
           }
        }
      }
   } else {
      $asmId = $line;
   }
 }
 close ($fh);
-printf STDERR "# %d entries to add to table, %d missing path file\n", $toAddCount, $missingPath;
+printf STDERR "# %d already exist, %d entries to add to table, %d missing path file\n", $alreadyExisting, $toAddCount, $missingPath;
+
+my %checked;	# key is fromDb|toDb value is 1 for been examined
+my $hasRecipCount = 0;
+my $needRecip = 0;
+
+### check for reciprocals
+printf STDERR "### checking for reciprocals\n";
+my $totalEntries = 0;
+for my $fromTo (sort keys %fromToToday) {
+   ++$totalEntries;
+   next if (defined($checked{$fromTo}));
+   my ($from, $to) = split('\|', $fromTo);
+   $checked{$fromTo} = 1;
+   my $key = sprintf("%s|%s", $to, $from);
+   next if (defined($checked{$key}));
+   if (defined($fromToToday{$key})) {
+     ++$hasRecipCount;
+   } else {
+     ++$needRecip;
+     printf STDERR "# need recip %s %s\n", $to, $from if ($needRecip < 10);
+   }
+   $checked{$key} = 1;
+}
+
+printf STDERR "# %d totals, %d expected half, %d(%d) have reciprocals, %d(%d) need reciprocals\n", $totalEntries, int($totalEntries / 2), $hasRecipCount, 2 * $hasRecipCount, $needRecip, int($needRecip / 2);
 
 # head hasChainNets.txt  | cat -A
 # GCA_028885625.2_NHGRI_mPonPyg2-v2.0_pri^I10$
 # ^IGCA_028885625.2ToGCA_028858775.2.over.chain.gz$
 # ^IGCA_028885625.2ToGCA_028878055.2.over.chain.gz$
 # ^IGCA_028885625.2ToGCA_028885655.2.over.chain.gz$
 # ^IGCA_028885625.2ToGCA_029281585.2.over.chain.gz$
 # ^IGCA_028885625.2ToGCA_029289425.2.over.chain.gz$
 # ^IGCA_028885625.2ToGCF_011100555.1.over.chain.gz$
 # ^IGCA_028885625.2ToGCF_020740605.2.over.chain.gz$
 # ^IGCA_028885625.2ToGCF_027406575.1.over.chain.gz$
 # ^IGCA_028885625.2ToHg38.over.chain.gz$