2c28295e6b1f47a483282b3790208679ba39a5fe hiram Mon Aug 18 17:51:33 2025 -0700 checking for reciprocal both directions refs #35575 diff --git src/hg/makeDb/doc/asmHubs/updateLiftOverChain.pl src/hg/makeDb/doc/asmHubs/updateLiftOverChain.pl index 4b7a6b6ae3e..065590c63ee 100755 --- src/hg/makeDb/doc/asmHubs/updateLiftOverChain.pl +++ src/hg/makeDb/doc/asmHubs/updateLiftOverChain.pl @@ -45,61 +45,61 @@ while (my $line = <$fh>) { chomp $line; my ($md5, $filePath) = split('\t', $line); my @a = split('/', $filePath); my $loChain = $a[-1]; $loChain =~ s/.over.chain.gz//; my @b = split('To', $loChain); if (defined($b[0]) && defined($b[1])) { my $fromDb = $b[0]; my $toDb = $b[1]; if ($toDb !~ m/^GC/) { $toDb = lcfirst($toDb); } if ( defined($validDb{$toDb}) && defined($validDb{$fromDb}) ) { my $key = sprintf("%s|%s", $fromDb, $toDb); - printf STDERR "# %s\n", $key if ($liftOverCount < 10); +# printf STDERR "# %s\n", $key if ($liftOverCount < 10); $liftOverFiles{$key} = $filePath; ++$liftOverCount; } else { - printf STDERR "# invalid Db: %s or %s\n", $fromDb, $toDb if ($invalidCount < 10); +# printf STDERR "# invalid Db: %s or %s\n", $fromDb, $toDb if ($invalidCount < 10); ++$invalidCount; } } else { printf STDERR "# no db name found in:\n%s\n"< $filePath if ($invalidCount < 10); ++$invalidCount; # printf STDERR "warning: no db name found in liftOver file name:\n%s\n", $filePath; } } close ($fh); open ($fh, "-|", "zgrep 'over.chain.gz' /hive/data/inside/GenArk/liftOver/gbdb.genark.liftOver.file.list") or die "can not read gbdb/genark.liftOver.file.list"; while (my $filePath = <$fh>) { chomp $filePath; my @a = split('/', $filePath); my $loChain = $a[-1]; $loChain =~ s/.over.chain.gz//; my @b = split('To', $loChain); if (defined($b[0]) && defined($b[1])) { my $fromDb = $b[0]; my $toDb = $b[1]; if ($toDb !~ m/^GC/) { $toDb = lcfirst($toDb); } if ( defined($validDb{$toDb}) && defined($validDb{$fromDb}) ) { my $key = sprintf("%s|%s", $fromDb, $toDb); - printf STDERR "# %s\n", $key if ($liftOverCount < 10); +# printf STDERR "# %s\n", $key if ($liftOverCount < 10); $liftOverFiles{$key} = $filePath; ++$liftOverCount; } else { printf STDERR "# invalid Db: %s or %s\n", $fromDb, $toDb if ($invalidCount < 10); ++$invalidCount; } } else { printf STDERR "# no db name foundin:\n%s\n"< $filePath if ($invalidCount < 10); ++$invalidCount; # printf STDERR "warning: no db name found in liftOver file name:\n%s\n", $filePath; } } close ($fh); printf STDERR "# %d liftOverFiles, %d invalid files\n", $liftOverCount, $invalidCount; @@ -120,63 +120,93 @@ $fromToToday{$key} = $path; ++$todayLiftCount; } else { ++$invalidToFrom; } } else { ++$pathMissing; } } close ($fh); printf STDERR "# liftOverCount table: %d valid, %d path missing, %d invalidToFrom\n", $todayLiftCount, $pathMissing, $invalidToFrom; my $toAddCount = 0; my $missingPath = 0; +my $alreadyExisting = 0; printf STDERR "# reading: '%s'\n", $hasChainNets; my $asmId = ""; open ($fh, "<", $hasChainNets) or die "can not read $hasChainNets"; while (my $line = <$fh>) { chomp $line; if ($line =~ m/^\t/) { my $overChain = $line; $overChain =~ s/^\t//; $overChain =~ s/.over.chain.gz//; my @b = split('To', $overChain); if (defined($b[0]) && defined($b[1])) { my $fromDb = $b[0]; my $toDb = $b[1]; if ($toDb !~ m/^GC/) { $toDb = lcfirst($toDb); } if ( defined($validDb{$toDb}) && defined($validDb{$fromDb}) ) { my $key = sprintf("%s|%s", $fromDb, $toDb); if (defined($liftOverFiles{$key})) { + if (defined($fromToToday{$key})) { + ++$alreadyExisting; + } else { printf STDERR "# to add: %s\n", $key if ($toAddCount < 10); printf "%s\t%s\t%s\t0.1\t0\t0\tY\t1\tN\n", $fromDb, $toDb, $liftOverFiles{$key}; ++$toAddCount; + } } else { printf STDERR "# missing path: %s\n", $key if ($missingPath < 10); ++$missingPath; } } } } else { $asmId = $line; } } close ($fh); -printf STDERR "# %d entries to add to table, %d missing path file\n", $toAddCount, $missingPath; +printf STDERR "# %d already exist, %d entries to add to table, %d missing path file\n", $alreadyExisting, $toAddCount, $missingPath; + +my %checked; # key is fromDb|toDb value is 1 for been examined +my $hasRecipCount = 0; +my $needRecip = 0; + +### check for reciprocals +printf STDERR "### checking for reciprocals\n"; +my $totalEntries = 0; +for my $fromTo (sort keys %fromToToday) { + ++$totalEntries; + next if (defined($checked{$fromTo})); + my ($from, $to) = split('\|', $fromTo); + $checked{$fromTo} = 1; + my $key = sprintf("%s|%s", $to, $from); + next if (defined($checked{$key})); + if (defined($fromToToday{$key})) { + ++$hasRecipCount; + } else { + ++$needRecip; + printf STDERR "# need recip %s %s\n", $to, $from if ($needRecip < 10); + } + $checked{$key} = 1; +} + +printf STDERR "# %d totals, %d expected half, %d(%d) have reciprocals, %d(%d) need reciprocals\n", $totalEntries, int($totalEntries / 2), $hasRecipCount, 2 * $hasRecipCount, $needRecip, int($needRecip / 2); # head hasChainNets.txt | cat -A # GCA_028885625.2_NHGRI_mPonPyg2-v2.0_pri^I10$ # ^IGCA_028885625.2ToGCA_028858775.2.over.chain.gz$ # ^IGCA_028885625.2ToGCA_028878055.2.over.chain.gz$ # ^IGCA_028885625.2ToGCA_028885655.2.over.chain.gz$ # ^IGCA_028885625.2ToGCA_029281585.2.over.chain.gz$ # ^IGCA_028885625.2ToGCA_029289425.2.over.chain.gz$ # ^IGCA_028885625.2ToGCF_011100555.1.over.chain.gz$ # ^IGCA_028885625.2ToGCF_020740605.2.over.chain.gz$ # ^IGCA_028885625.2ToGCF_027406575.1.over.chain.gz$ # ^IGCA_028885625.2ToHg38.over.chain.gz$