fe164c83850e2adba662df57142d37321d581236 hiram Tue Sep 9 22:16:59 2025 -0700 helper script for runBuild to find a reasonable RepeatMasker species to work with diff --git src/hg/makeDb/doc/asmHubs/findTaxon.pl src/hg/makeDb/doc/asmHubs/findTaxon.pl new file mode 100755 index 00000000000..dc335177df9 --- /dev/null +++ src/hg/makeDb/doc/asmHubs/findTaxon.pl @@ -0,0 +1,82 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + + +my $argc = scalar(@ARGV); + +if ($argc != 1) { + printf STDERR "usage: findTaxon.pl Scientific_name\n"; + exit 255; +} + +my $RMSK="/hive/data/outside/RepeatMasker/RepeatMasker-4.2.1/RepeatMasker"; + +sub testOne($) { + my $return = 1; + my ($s) = @_; + my $rmSpecies = $s; + $rmSpecies =~ s/_/ /g; + printf STDERR "# checking '%s'\n", $rmSpecies; + open (YN, "${RMSK} -engine rmblast -pa 1 -s -species \"${rmSpecies}\" /dev/null 2>&1 | egrep \"families in|may not be any|No results were found for that\" | tr '\\n' ' '|") or die "can not RepeatMasker\n"; + my $line = <YN>; + chomp $line; + if ($line =~ m/may not be any|for that name/) { + printf STDERR "%s - is not known\n", $rmSpecies; + $return = 0; + } else { + printf STDERR "%s - %s\n", ${rmSpecies}, ${line}; + } + close (YN); + return $return; +} + +my $TOP="/hive/data/genomes/asmHubs/allBuild/rmCheck"; +# my $path = "/cluster/software/bin:$ENV{'PATH'}"; +# $ENV{'PATH'} = $path; +# print `env`; + +my $findMe = shift; +my $thisProcess = $$; +my $tmpDir = "/dev/shm/findRm$thisProcess"; +# printf STDERR "# tmpDir: %s\n", $tmpDir; +mkdir("$tmpDir"); +chdir("$tmpDir"); + +my $yesNo = testOne($findMe); +if (1 == $yesNo) { + chdir($TOP); + rmdir("$tmpDir"); + printf "%s\n", $findMe; + exit 0; +} + +`rm -fr $tmpDir/RM_*`; +printf STDERR "# checking for other names %s\n", $findMe; + +open (FH, "cut -f4,5 /hive/data/outside/ncbi/genomes/reports/refseq/allAssemblies.taxonomy.tsv /hive/data/outside/ncbi/genomes/reports/genbank/allAssemblies.taxonomy.tsv | tr ' ' '_' | sort -k1,1 -u|") or die "can not read /hive/data/outside/ncbi/genomes/reports/refseq/allAssemblies.taxonomy.tsv"; +while (my $line = <FH>) { + chomp $line; + my ($sciName, $taxonomy) = split('\t', $line); + if ($sciName eq $findMe) { + $taxonomy =~ s/;$//; + printf STDERR "# %s - %s\n", $sciName, $taxonomy; + my @a = split(';', $taxonomy); + my $termCount = scalar(@a); + for (my $i = $termCount-1; $i > 0; --$i) { + my $yesNo = testOne($a[$i]); + if (1 == $yesNo) { + chdir($TOP); + rmdir("$tmpDir"); + printf "%s\n", $a[$i]; + last; + } else { + `rm -fr $tmpDir/RM_*`; + } + } + } +} +close (FH); +chdir($TOP); +rmdir("$tmpDir");