45e061bdd61a8fdf439157af29822df1589f6c43 hiram Thu Apr 7 13:25:31 2022 -0700 add a hubDateName function and allow newdev to be used as a cluster refs #29203 diff --git src/hg/utils/automation/HgAutomate.pm src/hg/utils/automation/HgAutomate.pm index 3bba2a2..8df1468 100755 --- src/hg/utils/automation/HgAutomate.pm +++ src/hg/utils/automation/HgAutomate.pm @@ -20,33 +20,33 @@ # This is a listing of the public methods and variables (which should be # treated as constants) exported by this module: @EXPORT_OK = ( # Support for common command line options: qw( getCommonOptionHelp processCommonOptions @commonOptionVars @commonOptionSpec ), # Some basic smarts about our compute infrastructure: qw( choosePermanentStorage chooseWorkhorse chooseFileServer chooseClusterByBandwidth chooseSmallClusterByBandwidth chooseFilesystemsForCluster checkClusterPath ), # General-purpose utility routines: qw( checkCleanSlate checkExistsUnlessDebug closeStdin - getAssemblyInfo getSpecies gensub2 machineHasFile databaseExists - makeGsub mustMkdir asmHubBuildDir asmHubDownloadDir mustOpen - nfsNoodge paraRun run verbose + getAssemblyInfo getSpecies hubDateName gensub2 machineHasFile + databaseExists makeGsub mustMkdir asmHubBuildDir asmHubDownloadDir + mustOpen nfsNoodge paraRun run verbose ), # Hardcoded paths/commands/constants: qw( $centralDbSql $git $clusterData $trackBuild $goldenPath $images $gbdb $splitThreshold $runSSH $setMachtype ), ); ######################################################################### # A simple model of our local compute environment with some subroutines # for checking the validity of path+machine combos and for suggesting # appropriate storage and machines. 
use vars qw( %cluster %clusterFilesystem $defaultDbHost );

sub readMainCluster(); # forward declaration to keep code order

# the name of the cluster is in a separate text file, so it's easier to
# use from bash scripts

# Clusters currently known to this module, keyed by cluster name.  Each
# value is a hash ref of per-cluster settings: enabled, gigaHz, ram and
# hostCount.
%cluster =
    ( readMainCluster() =>
        { 'enabled' => 1, 'gigaHz' => 1.4, 'ram' => 8,
          'hostCount' => 992, },
      'hgwdev-101' =>
        { 'enabled' => 1, 'gigaHz' => 2.1, 'ram' => 1,
          'hostCount' => 32, },
      'hgwdev' =>
        { 'enabled' => 1, 'gigaHz' => 2.1, 'ram' => 1,
          'hostCount' => 32, },
      'newdev' =>
        { 'enabled' => 1, 'gigaHz' => 2.1, 'ram' => 1,
          'hostCount' => 32, },
    );

# Same layout as %cluster, for clusters that are no longer in service.
my %obsoleteCluster =
    ('swarm' =>
        { 'enabled' => 1, 'gigaHz' => 2.33, 'ram' => 8,
          'hostCount' => 1024, },
     'memk' =>
        { 'enabled' => 1, 'gigaHz' => 1.0, 'ram' => 32,
          'hostCount' => 32, },
     'encodek' =>
        { 'enabled' => 1, 'gigaHz' => 2.0, 'ram' => 16,
          'hostCount' => 48, },
    );

my @allClusters = (keys %cluster);

# [excerpt gap: the lines of the module between here and the end of
#  getSpecies are not part of this excerpt]

  return ($scientificName);
} # getSpecies

# Fetch the 'organism' column of the dbDb row whose name is $db.
#   $dbHost - host on which the query is run (via $HgAutomate::runSSH)
#   $db     - database name to look up
# Returns the first tab-separated field of the command output; this is
# undef when the command prints nothing.
# confess()es unless called with exactly two arguments.
sub getOrganism {
  # fetch organism from dbDb
  my ($dbHost, $db) = @_;
  confess "Must have exactly 2 arguments" if (scalar(@_) != 2);
  my $query = "select organism from dbDb " .
              "where name = \"$db\";";
  # NOTE(review): $db is interpolated into both the SQL text and a
  # single-quoted shell word, so callers must pass trusted names only.
  my $line = `echo '$query' | $HgAutomate::runSSH $dbHost $centralDbSql`;
  chomp $line;
  my ($organism) = split("\t", $line);
  return ($organism);
} # getOrganism

# try to find the date and assembly name for a hub given just the accession
#
#   $accession - a database name (anything that does not begin with "GC"),
#                or a GenArk accession such as GCA_009914755.4
#
# Returns the two element list ($date, $asmName):
#   * database name: the assembly date reported by getAssemblyInfo() and
#     the database name itself
#   * GenArk accession: the value of the first "# Date:" line of the
#     assembly report, and the assembly name portion of the source
#     directory name with a leading underscore (e.g. "_CHM13_T2T_v2.0")
# When something can not be determined the defaults "some date" and ""
# are returned for the respective element.
sub hubDateName($) {
  my ($accession) = @_;
  my $returnDate = "some date";
  my $returnAsmName = "";

  if ($accession !~ m/^GC/) {	# not a GenArk hub, database assembly
    my (undef, $asmDate) = getAssemblyInfo("hgwdev", $accession);
    # keep the placeholder rather than handing undef back to the caller
    $returnDate = $asmDate if (defined($asmDate) && length($asmDate));
    $returnAsmName = $accession;
    return ($returnDate, $returnAsmName);
  }

  # GCx_123456789.N lives under .../GCx/123/456/789/ ; an accession that
  # does not have that shape can not be located, answer with the defaults
  my ($gcX, $d0, $d1, $d2) =
     $accession =~ m/^(GC[AF])_(\d{3})(\d{3})(\d{3})/;
  return ($returnDate, $returnAsmName) if (! defined($d2));

  # a couple assemblies unfortunately have two different assembly names
  # can't work with those with just an accession
  # special case the CHM13 assembly
  my $betterId = $accession;
  if ($accession =~ m/^GCA_009914755\.4(?!\d)/) {
    $betterId = "GCA_009914755.4_CHM13_T2T_v2.0";
  }

  # Find the directory entries beginning with the identifier.  This is
  # done in perl instead of 'ls | wc -l' so the accession never reaches
  # a shell.
  my $asmDir = "/hive/data/outside/ncbi/genomes/$gcX/$d0/$d1/$d2";
  opendir(my $dh, $asmDir) or return ($returnDate, $returnAsmName);
  my @matches = grep { index($_, $betterId) == 0 } readdir($dh);
  closedir($dh);
  # the answer must be unambiguous: exactly one matching entry
  return ($returnDate, $returnAsmName) if (1 != scalar(@matches));

  my $asmId = $matches[0];
  my $srcDir = "$asmDir/$asmId";
  return ($returnDate, $returnAsmName) if (! -d $srcDir);
  my $asmRpt = "$srcDir/${asmId}_assembly_report.txt";
  return ($returnDate, $returnAsmName) if (! -s $asmRpt);

  # directory name is GCx_nnnnnnnnn.N_asmName, keep the asmName part
  (undef, undef, $returnAsmName) = split('_', $asmId, 3);
  if (defined($returnAsmName)) {
    $returnAsmName =~ s/\r//g;
    $returnAsmName = "_$returnAsmName";
  } else {
    $returnAsmName = "";
  }

  # only the first "# Date:" line of the report is consulted
  my $tDate = "";
  if (open(my $fh, '<', $asmRpt)) {
    while (my $line = <$fh>) {
      if ($line =~ m/^#\s*Date:\s*(.*?)\s*$/i) {
        $tDate = $1;
        last;
      }
    }
    close($fh);
  }
  $returnDate = $tDate if (length($tDate));

  return ($returnDate, $returnAsmName);
} # sub hubDateName($)

sub machineHasFile {
  # Return a positive integer if $mach appears to have $file or 0 if it
  # does not.
  # The integer is the number of lines printed by 'ls -1 $file' run on
  # $mach via $HgAutomate::runSSH; ls error output is discarded.
  # confess()es unless called with exactly two defined arguments.
  my ($mach, $file) = @_;
  confess "Must have exactly 2 arguments" if (scalar(@_) != 2);
  confess "undef input" if (! defined $mach || ! defined $file);
  my $count = `$HgAutomate::runSSH $mach ls -1 $file 2>>/dev/null | wc -l`;
  chomp $count;
  return $count + 0;
}

sub databaseExists {
  my ($dbHost, $db) = @_;
  return 0 if ($dbHost =~ m/nohost/i);
  confess "Must have exactly 2 arguments" if (scalar(@_) != 2);