24c443b10b97e5b1e81df9ee5cb393131e7b873a hiram Wed May 5 11:40:43 2021 -0700 better handling of assembly hub chain net tracks refs #26988 diff --git src/hg/utils/automation/HgAutomate.pm src/hg/utils/automation/HgAutomate.pm index 4a1b1a5..3bba2a2 100755 --- src/hg/utils/automation/HgAutomate.pm +++ src/hg/utils/automation/HgAutomate.pm @@ -21,31 +21,32 @@ # treated as constants) exported by this module: @EXPORT_OK = ( # Support for common command line options: qw( getCommonOptionHelp processCommonOptions @commonOptionVars @commonOptionSpec ), # Some basic smarts about our compute infrastructure: qw( choosePermanentStorage chooseWorkhorse chooseFileServer chooseClusterByBandwidth chooseSmallClusterByBandwidth chooseFilesystemsForCluster checkClusterPath ), # General-purpose utility routines: qw( checkCleanSlate checkExistsUnlessDebug closeStdin getAssemblyInfo getSpecies gensub2 machineHasFile databaseExists - makeGsub mustMkdir mustOpen nfsNoodge paraRun run verbose + makeGsub mustMkdir asmHubBuildDir asmHubDownloadDir mustOpen + nfsNoodge paraRun run verbose ), # Hardcoded paths/commands/constants: qw( $centralDbSql $git $clusterData $trackBuild $goldenPath $images $gbdb $splitThreshold $runSSH $setMachtype ), ); ######################################################################### # A simple model of our local compute environment with some subroutines # for checking the validity of path+machine combos and for suggesting # appropriate storage and machines. use vars qw( %cluster %clusterFilesystem $defaultDbHost ); @@ -616,41 +617,104 @@ # allow PATH to find the gensub2 command $answer = "gensub2"; } return $answer; } sub closeStdin { # If we don't do this, the script can hang ("Suspended (tty input)") # when it is run backgrounded (&) and then something is typed into the # terminal... or something like that. Anyway, doesn't hurt. It does not # prevent hanging on ssh prompts, however. 
close(STDIN); open(STDIN, '/dev/null'); } +sub asmHubDownloadDir { + # return path to assembly hub download directory under $goldenPath + my ($asmId) = @_; + confess "Must have exactly 1 argument" if (scalar(@_) != 1); + confess "must supply GC[AF]_... assembly ID" if ($asmId !~ m/^GC/); + my $gcX = substr($asmId,0,3); + my $d0 = substr($asmId,4,3); + my $d1 = substr($asmId,7,3); + my $d2 = substr($asmId,10,3); + my $downloadDir = $goldenPath . "/$gcX/$d0/$d1/$d2"; + return $downloadDir; +} + +sub asmHubBuildDir { + # return path to assembly hub build directory + my ($asmId) = @_; + confess "Must have exactly 1 argument" if (scalar(@_) != 1); + confess "must supply GC[AF]_... assembly ID" if ($asmId !~ m/^GC/); + my $gcX = substr($asmId,0,3); + my $d0 = substr($asmId,4,3); + my $d1 = substr($asmId,7,3); + my $d2 = substr($asmId,10,3); + my $buildDir = "/hive/data/genomes/asmHubs/allBuild/$gcX/$d0/$d1/$d2/$asmId"; + return $buildDir; +} + +sub asmHubSubmitter { + # submitter is the text following 'submitter:' on the first matching assembly_report line + my ($asmReport) = @_; + my $submitter = `grep -i "submitter" $asmReport | head -1 | tr -d '\r'`; + chomp $submitter; + $submitter =~ s/.*ubmitter:\s+//i; + return $submitter; +} + +sub asmHubDate { + # date is the text following 'date:' on the first matching assembly_report line + my ($asmReport) = @_; + my $date = `grep -i "date:" $asmReport | head -1 | tr -d '\r'`; + chomp $date; + $date =~ s/.*ate:\s+//i; + return $date; +} + +sub asmHubCommonName { + # common name is in (parens) in the assembly_report 'Organism name:' line + my ($asmReport) = @_; + my $names = `grep -i "organism name:" $asmReport | head -1 | tr -d '\r'`; + chomp $names; + $names =~ s/.*\(//; + $names =~ s/\).*//; + return $names; +} + sub getAssemblyInfo { # Do a quick dbDb lookup to get assembly descriptive info for README.txt. my ($dbHost, $db) = @_; confess "Must have exactly 2 arguments" if (scalar(@_) != 2); + if ($db =~ m/^GC/) { + my $asmReport = asmHubBuildDir($db) . 
"/download/${db}_assembly_report.txt"; + confess "Can not find $asmReport" if ( ! -s "${asmReport}" ); + my $genome = asmHubCommonName($asmReport); + my $date = asmHubDate($asmReport); + my $source = asmHubSubmitter($asmReport); + return ($genome, $date, $source); + } else { my $query = "select genome,description,sourceName from dbDb " . "where name = \"$db\";"; my $line = `echo '$query' | $HgAutomate::runSSH $dbHost $centralDbSql`; chomp $line; my ($genome, $date, $source) = split("\t", $line); return ($genome, $date, $source); } +} sub getSpecies { # fetch scientificName from dbDb my ($dbHost, $db) = @_; confess "Must have exactly 2 arguments" if (scalar(@_) != 2); my $query = "select scientificName from dbDb " . "where name = \"$db\";"; my $line = `echo '$query' | $HgAutomate::runSSH $dbHost $centralDbSql`; chomp $line; my ($scientificName) = split("\t", $line); if (length($scientificName) < 1) { if ( -s "$HgAutomate::clusterData/$db/species.name.txt" ) { $scientificName = `cat $HgAutomate::clusterData/$db/species.name.txt`; chomp $scientificName; } else {