dd27449e2ac4790137f84c232f6f3ff3f79e1556 hiram Tue Feb 4 14:07:15 2020 -0800 now using the new GCF FTP hierarchy layout refs #23891 diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index 05d1cb2..6132439 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -14,30 +14,31 @@ use File::stat; use warnings; use strict; use FindBin qw($Bin); use lib "$Bin"; use HgAutomate; use HgRemoteScript; use HgStepManager; # Option variable names, both common and peculiar to this script: use vars @HgAutomate::commonOptionVars; use vars @HgStepManager::optionVars; use vars qw/ $opt_buildDir $opt_sourceDir + $opt_species $opt_rmskSpecies $opt_augustusSpecies $opt_xenoRefSeq $opt_ucscNames $opt_asmHubName /; # Specify the steps supported with -continue / -stop: my $stepper = new HgStepManager( [ { name => 'download', func => \&doDownload }, { name => 'sequence', func => \&doSequence }, { name => 'assemblyGap', func => \&doAssemblyGap }, { name => 'gatewayPage', func => \&doGatewayPage }, { name => 'cytoBand', func => \&doCytoBand }, { name => 'gc5Base', func => \&doGc5Base }, @@ -50,90 +51,91 @@ { name => 'gapOverlap', func => \&doGapOverlap }, { name => 'tandemDups', func => \&doTandemDups }, { name => 'cpgIslands', func => \&doCpgIslands }, { name => 'ncbiGene', func => \&doNcbiGene }, { name => 'ncbiRefSeq', func => \&doNcbiRefSeq }, { name => 'xenoRefGene', func => \&doXenoRefGene }, { name => 'augustus', func => \&doAugustus }, { name => 'trackDb', func => \&doTrackDb }, { name => 'cleanup', func => \&doCleanup }, ] ); # Option defaults: my $dbHost = 'hgwdev'; my $sourceDir = "/hive/data/outside/ncbi/genomes"; +my $species = ""; # usually found in asmId_assembly_report.txt +my $ftpDir = ""; # will be determined from given asmId my $rmskSpecies = ""; my $augustusSpecies = "human"; my $xenoRefSeq = "/hive/data/genomes/asmHubs/VGP/xenoRefSeq"; my $ucscNames = 0; # default 'FALSE' (== 0) my $asmHubName = "n/a"; # directory name in: /gbdb/hubs/asmHubName my $workhorse = "hgwdev"; # default workhorse when none chosen my $fileServer = "hgwdev"; # default when none chosen my $bigClusterHub = "ku"; # default when none chosen my $smallClusterHub = "ku"; # default when none chosen my $base = $0; $base =~ s/^(.*\/)?//; # key is original accession name from the remove.dups.list, value is 1 my %dupAccessionList = {}; sub usage { # Usage / help / self-documentation: my ($status, $detailed) = @_; # Basic help (for incorrect usage): print STDERR " -usage: $base [options] genbank|refseq subGroup species asmId +usage: $base [options] asmId required arguments: - genbank|refseq - specify either genbank or refseq hierarchy source - subGroup - specify subGroup at NCBI FTP site, examples: - - vertebrate_mammalian vertebrate_other plant etc... - species - species directory at NCBI FTP site, examples: - - Homo_sapiens Mus_musculus etc... asmId - assembly identifier at NCBI FTP site, examples: - GCF_000001405.32_GRCh38.p6 GCF_000001635.24_GRCm38.p4 etc.. options: "; print STDERR $stepper->getOptionHelp(); print STDERR <<_EOF_ -buildDir dir Construct assembly hub in dir instead of default - $HgAutomate::clusterData/asmHubs/{genbank|refseq}/subGroup/species/asmId/ + $HgAutomate::clusterData/asmHubs/refseqBuild/GC[AF]/123/456/789/asmId/ -sourceDir dir Find assembly in dir instead of default: - $sourceDir/<genbank|refseq>/subGroup/species/all_assembly_versions/asmId + $sourceDir/GC[AF]/123/456/789/asmId -ucscNames Translate NCBI/INSDC/RefSeq names to UCSC names default is to use the given NCBI/INSDC/RefSeq names -asmHubName <name> directory name in: /gbdb/hubs/asmHubName - -rmskSpecies to override command line 'species' name for repeat masker + -species <name> use this species designation if there is no + asmId_assembly_report.txt with an + 'Organism name:' entry to obtain species + -rmskSpecies <name> to override default 'species' name for repeat masker + the default is found in the asmId_asssembly_report.txt e.g. -rmskSpecies=viruses -augustusSpecies <human|chicken|zebrafish> default 'human' -xenoRefSeq </path/to/xenoRefSeqMrna> - location of xenoRefMrna.fa.gz expanded directory of mrnas/ and xenoRefMrna.sizes, default $xenoRefSeq _EOF_ ; print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost, 'workhorse' => $workhorse, 'fileServer' => $fileServer, 'bigClusterHub' => $bigClusterHub, 'smallClusterHub' => $smallClusterHub); print STDERR " Automates build of assembly hub. Steps: download: sets up sym link working hierarchy from already mirrored files from NCBI in: - $sourceDir/{genbank|refseq}/ + $sourceDir/GC[AF]/123/456/789/asmId sequence: establish AGP and 2bit file from NCBI directory assemblyGap: create assembly and gap bigBed files and indexes for assembly track names gatewayPage: create html/asmId.description.html contents (USE: asmHubName) cytoBand: create cytoBand track and navigation ideogram gc5Base: create bigWig file for gc5Base track repeatMasker: run repeat masker cluster run and create bigBed files for the composite track categories of repeats simpleRepeat: run trf cluster run and create bigBed file for simple repeats allGaps: calculate all actual real gaps due to N's in sequence, can be more than were specified in the AGP file idKeys: calculate md5sum for each sequence in the assembly to be used to find identical sequences in similar assemblies windowMasker: run windowMasker cluster run, create windowMasker bigBed file and compute intersection with repeatMasker results @@ -161,32 +163,32 @@ Assumptions: 1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for database/assembly \$db. 2. $HgAutomate::clusterData/\$db/chrom.sizes contains all sequence names and sizes from \$db.2bit. 3. The \$db.2bit files have already been distributed to cluster-scratch (/scratch/hg or /iscratch, /san etc). 4. template? " if ($detailed); print "\n"; exit $status; } # Globals: -# Command line args: genbankRefseq subGroup species asmId -my ($genbankRefseq, $subGroup, $species, $asmId); +# Command line args: asmId +my ( $asmId); # Other: my ($buildDir, $secondsStart, $secondsEnd, $assemblySource); sub checkOptions { # Make sure command line options are valid/supported. my $ok = GetOptions(@HgStepManager::optionSpec, 'buildDir=s', 'sourceDir=s', 'rmskSpecies=s', 'augustusSpecies=s', 'xenoRefSeq=s', 'asmHubName=s', 'ucscNames', @HgAutomate::commonOptionSpec, ); @@ -1496,31 +1498,30 @@ $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId="$asmId" export buildDir="$buildDir" export liftFile="\$buildDir/sequence/\$asmId.ncbiToUcsc.lift" export target2bit="\$buildDir/\$asmId.2bit" if [ $buildDir/\$asmId.2bit -nt \$asmId.ncbiRefSeq.bb ]; then ~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -toGpWarnOnly -buildDir=`pwd` \\ -bigClusterHub=$bigClusterHub -dbHost=$dbHost \\ -liftFile="\$liftFile" \\ -target2bit="\$target2bit" \\ -stop=load -fileServer=$fileServer -smallClusterHub=$smallClusterHub -workhorse=$workhorse \\ - $genbankRefseq $subGroup $species \\ \$asmId \$asmId else printf "# ncbiRefSeq previously completed\\n" 1>&2 fi _EOF_ ); $bossScript->execute(); } # ncbiRefSeq ######################################################################### # * step: augustus [workhorse] sub doAugustus { my $runDir = "$buildDir/trackData/augustus"; &HgAutomate::mustMkdir($runDir); @@ -1579,87 +1580,113 @@ } # doXenoRefGene ######################################################################### # * step: trackDb [workhorse] sub doTrackDb { my $runDir = "$buildDir"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "construct asmId.trackDb.txt file"; my $bossScript = newBash HgRemoteScript("$runDir/doTrackDb.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId=$asmId -\$HOME/kent/src/hg/utils/automation/asmHubTrackDb.sh $genbankRefseq \$asmId $runDir \\ +\$HOME/kent/src/hg/utils/automation/doHubTrackDb.sh \$asmId $runDir \\ > \$asmId.trackDb.txt _EOF_ ); $bossScript->execute(); } # trackDb ######################################################################### # * step: cleanup [fileServer] sub doCleanup { my $runDir = "$buildDir"; my $whatItDoes = "clean up or compresses intermediate files."; my $bossScript = newBash HgRemoteScript("$runDir/doCleanup.bash", $fileServer, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ printf "to be done\\n" 1>&2 _EOF_ ); $bossScript->execute(); } # doCleanup ######################################################################### # main # Prevent "Suspended (tty input)" hanging: &HgAutomate::closeStdin(); # Make sure we have valid options and exactly 1 argument: &checkOptions(); -&usage(1) if (scalar(@ARGV) != 4); +&usage(1) if (scalar(@ARGV) != 1); $secondsStart = `date "+%s"`; chomp $secondsStart; # expected command line arguments after options are processed -($genbankRefseq, $subGroup, $species, $asmId) = @ARGV; +($asmId) = @ARGV; +# yes, there can be more than two fields separated by _ +# but in this case, we only care about the first two: +# GC[AF]_123456789.3_assembly_Name +# 0 1 2 3 .... +my @partNames = split('_', $asmId); +$ftpDir = sprintf("%s/%s/%s/%s/%s", $partNames[0], + substr($partNames[1],0,3), substr($partNames[1],3,3), + substr($partNames[1],6,3), $asmId); # Force debug and verbose until this is looking pretty solid: # $opt_debug = 1; # $opt_verbose = 3 if ($opt_verbose < 3); # Establish what directory we will work in. $buildDir = $opt_buildDir ? $opt_buildDir : - "$HgAutomate::clusterData/asmHubs/$genbankRefseq/$subGroup/$species/$asmId"; + "$HgAutomate::clusterData/asmHubs/refseqBuild/$ftpDir"; + +$assemblySource = $opt_sourceDir ? "$sourceDir" : "$sourceDir/$ftpDir"; +my $asmReport = "$assemblySource/${asmId}_assembly_report.txt"; + +$species = $opt_species ? $opt_species : $species; + +if (length($species) < 1) { + if (-s "$asmReport") { + $species = `grep -i "organism name:" $asmReport`; + chomp $species; + $species =~ s/.*organism\s+name:\s+//i; + $species =~ s/\s+\(.*//; + } else { + die "no -species specified and can not find $asmReport"; + } + if (length($species) < 1) { + die "no -species specified and can not find Organism name: in $asmReport"; + } +} $sourceDir = $opt_sourceDir ? $opt_sourceDir : $sourceDir; $rmskSpecies = $opt_rmskSpecies ? $opt_rmskSpecies : $species; $augustusSpecies = $opt_augustusSpecies ? $opt_augustusSpecies : $augustusSpecies; $xenoRefSeq = $opt_xenoRefSeq ? $opt_xenoRefSeq : $xenoRefSeq; $ucscNames = $opt_ucscNames ? 1 : $ucscNames; # '1' == 'TRUE' $workhorse = $opt_workhorse ? $opt_workhorse : $workhorse; $bigClusterHub = $opt_bigClusterHub ? $opt_bigClusterHub : $bigClusterHub; $smallClusterHub = $opt_smallClusterHub ? $opt_smallClusterHub : $smallClusterHub; $fileServer = $opt_fileServer ? $opt_fileServer : $fileServer; $asmHubName = $opt_asmHubName ? $opt_asmHubName : $asmHubName; -$assemblySource = $opt_sourceDir ? "$sourceDir" : "$sourceDir/$genbankRefseq/$subGroup/$species/all_assembly_versions/$asmId"; die "can not find assembly source directory\n$assemblySource" if ( ! -d $assemblySource); printf STDERR "# buildDir: %s\n", $buildDir; printf STDERR "# sourceDir %s\n", $sourceDir; printf STDERR "# augustusSpecies %s\n", $augustusSpecies; printf STDERR "# xenoRefSeq %s\n", $xenoRefSeq; printf STDERR "# assemblySource: %s\n", $assemblySource; # Do everything. $stepper->execute(); # Tell the user anything they should know. my $stopStep = $stepper->getStopStep(); my $upThrough = ($stopStep eq 'cleanup') ? "" : " (through the '$stopStep' step)";