dd27449e2ac4790137f84c232f6f3ff3f79e1556 hiram Tue Feb 4 14:07:15 2020 -0800 now using the new GCF FTP hierarchy layout refs #23891 diff --git src/hg/utils/automation/doNcbiRefSeq.pl src/hg/utils/automation/doNcbiRefSeq.pl index a8ce0d3f..5303e5a 100755 --- src/hg/utils/automation/doNcbiRefSeq.pl +++ src/hg/utils/automation/doNcbiRefSeq.pl @@ -17,33 +17,30 @@ use HgAutomate; use HgRemoteScript; use HgStepManager; my $doIdKeys = "$Bin/doIdKeys.pl"; my $gff3ToRefLink = "$Bin/gff3ToRefLink.pl"; my $gbffToCds = "$Bin/gbffToCds.pl"; my $ncbiRefSeqOtherIxIxx = "$Bin/ncbiRefSeqOtherIxIxx.pl"; my $ncbiRefSeqOtherAttrs = "$Bin/ncbiRefSeqOtherAttrs.pl"; # Option variable names, both common and peculiar to this script: use vars @HgAutomate::commonOptionVars; use vars @HgStepManager::optionVars; use vars qw/ $opt_buildDir - $opt_genbank - $opt_subgroup - $opt_species $opt_liftFile $opt_target2bit $opt_toGpWarnOnly /; # Specify the steps supported with -continue / -stop: my $stepper = new HgStepManager( [ { name => 'download', func => \&doDownload }, { name => 'process', func => \&doProcess }, { name => 'load', func => \&doLoad }, { name => 'cleanup', func => \&doCleanup }, ] ); # Option defaults: @@ -51,37 +48,32 @@ my $bigClusterHub = 'ku'; my $smallClusterHub = 'ku'; my $workhorse = 'hgwdev'; my $defaultWorkhorse = 'hgwdev'; my $defaultFileServer = 'hgwdev'; my $fileServer = 'hgwdev'; my $base = $0; $base =~ s/^(.*\/)?//; sub usage { # Usage / help / self-documentation: my ($status, $detailed) = @_; # Basic help (for incorrect usage): print STDERR " -usage: $base [options] genbank|refseq subGroup species asmId db +usage: $base [options] asmId db required arguments: - genbank|refseq - specify either genbank or refseq hierarchy source - subGroup - specify subGroup at NCBI FTP site, examples: - - vertebrate_mammalian vertebrate_other plant etc... - species - species directory at NCBI FTP site, examples: - - Homo_sapiens Mus_musculus etc... asmId - assembly identifier at NCBI FTP site, examples: - GCF_000001405.32_GRCh38.p6 GCF_000001635.24_GRCm38.p4 etc.. db - database to load with track tables options: "; print STDERR $stepper->getOptionHelp(); print STDERR <<_EOF_ -buildDir dir Use dir instead of default $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/ncbiRefSeq.\$date (necessary when continuing at a later date). -toGpWarnOnly add -warnAndContinue to the gff3ToGenePred operation to avoid gene definitions that will not convert -liftFile pathName a lift file to translate NCBI names to local genome names @@ -110,128 +102,130 @@ # Detailed help (-help): print STDERR " Assumptions: 1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for database/assembly \$db. 2. $HgAutomate::clusterData/\$db/chrom.sizes contains all sequence names and sizes from \$db.2bit. NOTE: Override these assumptions with the -target2Bit option " if ($detailed); print "\n"; exit $status; } # Globals: -# Command line args: genbankRefseq subGroup species asmId db -my ($genbankRefseq, $subGroup, $species, $asmId, $db, $ftpDir); +# Command line args: asmId db +my ($asmId, $db); # Other: -my ($buildDir, $toGpWarnOnly, $dbExists, $liftFile, $target2bit); +my ($ftpDir, $buildDir, $toGpWarnOnly, $dbExists, $liftFile, $target2bit); my ($secondsStart, $secondsEnd); sub checkOptions { # Make sure command line options are valid/supported. my $ok = GetOptions(@HgStepManager::optionSpec, 'buildDir=s', 'liftFile=s', 'target2bit=s', 'toGpWarnOnly', @HgAutomate::commonOptionSpec, ); &usage(1) if (!$ok); &usage(0, 1) if ($opt_help); &HgAutomate::processCommonOptions(); my $err = $stepper->processOptions(); usage(1) if ($err); $dbHost = $opt_dbHost if ($opt_dbHost); $workhorse = $opt_workhorse if ($opt_workhorse); $bigClusterHub = $opt_bigClusterHub if ($opt_bigClusterHub); $smallClusterHub = $opt_smallClusterHub if ($opt_smallClusterHub); $fileServer = $opt_fileServer if ($opt_fileServer); } ######################################################################### # * step: download [workhorse] sub doDownload { my $filesFound = 0; my @requiredFiles = qw( genomic.gff.gz rna.fna.gz rna.gbff.gz protein.faa.gz ); my $filesExpected = scalar(@requiredFiles); foreach my $expectFile (@requiredFiles) { - if ( -s "/hive/data/outside/ncbi/${asmId}_${expectFile}" ) { + if ( -s "/hive/data/outside/ncbi/genomes/$ftpDir/${asmId}_${expectFile}" ) { ++$filesFound; } else { printf STDERR "# doNcbiRefSeq.pl: missing required file /hive/data/outside/ncbi/${asmId}_${expectFile}\n"; } } if ($filesFound < $filesExpected) { printf STDERR "# doNcbiRefSeq.pl download: can not find all files required\n"; exit 0; } my $runDir = "$buildDir/download"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "download required set of files from NCBI."; my $bossScript = newBash HgRemoteScript("$runDir/doDownload.bash", $workhorse, $runDir, $whatItDoes); - my $outsideCopy = "/hive/data/outside/ncbi/$ftpDir"; - my $localData = "/hive/data/inside/ncbi/$ftpDir"; - $localData =~ s/all_assembly_versions/latest_assembly_versions/; - my $local2Bit = "$localData/$asmId.ncbi.2bit"; + my $outsideCopy = "/hive/data/outside/ncbi/genomes/$ftpDir"; + # might already have the NCBI 2bit file here: + my $localData = $buildDir; + $localData =~ s#trackData/ncbiRefSeq#download#; + my $local2Bit = "$localData/$asmId.2bit"; # establish variables $bossScript->add(<<_EOF_ # establish all potential variables to use here, not all may be used export outsideCopy=$outsideCopy export asmId=$asmId export ftpDir=$ftpDir export runDir=$runDir export db=$db _EOF_ ); printf STDERR "# checking $outsideCopy\n"; +printf STDERR "# checking $local2Bit\n"; # see if local symLinks can be made with copies already here from NCBI: if ( -d "$outsideCopy" ) { $bossScript->add(<<_EOF_ # local file copies exist, use symlinks ln -f -s \$outsideCopy/\${asmId}_genomic.gff.gz . ln -f -s \$outsideCopy/\${asmId}_rna.fna.gz . ln -f -s \$outsideCopy/\${asmId}_rna.gbff.gz . ln -f -s \$outsideCopy/\${asmId}_protein.faa.gz . _EOF_ ); } else { $bossScript->add(<<_EOF_ # local file copies do not exist, download from NCBI: for F in _rna.gbff _rna.fna _protein.faa _genomic.gff do rsync -a -P \\ rsync://ftp.ncbi.nlm.nih.gov/\$ftpDir/\$asmId\${F}.gz ./ done _EOF_ ); } if ( -s $local2Bit ) { $bossScript->add(<<_EOF_ -ln -f -s $local2Bit . +ln -f -s $local2Bit \${asmId}.ncbi.2bit _EOF_ ); } elsif ( -s "$outsideCopy/${asmId}_genomic.fna.gz") { $bossScript->add(<<_EOF_ # build \$asmId.ncbi.2bit from local copy of genomic fasta faToTwoBit \$outsideCopy/\${asmId}_genomic.fna.gz \${asmId}.ncbi.2bit _EOF_ ); } else { $bossScript->add(<<_EOF_ # download genomic fasta and build \${asmId}.ncbi.2bit rsync -a -P \\ rsync://ftp.ncbi.nlm.nih.gov/\$ftpDir/\${asmId}_genomic.fna.gz ./ faToTwoBit \${asmId}_genomic.fna.gz \${asmId}.ncbi.2bit @@ -747,43 +741,50 @@ gunzip process/ncbiRefSeqVersion.txt.gz _EOF_ ); $bossScript->execute(); } # doCleanup ######################################################################### # main # Prevent "Suspended (tty input)" hanging: &HgAutomate::closeStdin(); # Make sure we have valid options and exactly 1 argument: &checkOptions(); -&usage(1) if (scalar(@ARGV) != 5); +&usage(1) if (scalar(@ARGV) != 2); $toGpWarnOnly = 0; $toGpWarnOnly = 1 if ($opt_toGpWarnOnly); $liftFile = $opt_liftFile ? $opt_liftFile : ""; $target2bit = $opt_target2bit ? $opt_target2bit : ""; $secondsStart = `date "+%s"`; chomp $secondsStart; # expected command line arguments after options are processed -($genbankRefseq, $subGroup, $species, $asmId, $db) = @ARGV; -$ftpDir = "genomes/$genbankRefseq/$subGroup/$species/all_assembly_versions/$asmId"; +($asmId, $db) = @ARGV; +# yes, there can be more than two fields separated by _ +# but in this case, we only care about the first two: +# GC[AF]_123456789.3_assembly_Name +# 0 1 2 3 .... +my @partNames = split('_', $asmId); +$ftpDir = sprintf("%s/%s/%s/%s/%s", $partNames[0], + substr($partNames[1],0,3), substr($partNames[1],3,3), + substr($partNames[1],6,3), $asmId); if ( -z "$liftFile" && ! -s "/hive/data/genomes/$db/bed/idKeys/$db.idKeys.txt") { die "ERROR: can not find /hive/data/genomes/$db/bed/idKeys/$db.idKeys.txt\n\t need to run doIdKeys.pl for $db before this procedure."; } # Force debug and verbose until this is looking pretty solid: # $opt_debug = 1; # $opt_verbose = 3 if ($opt_verbose < 3); # Establish what directory we will work in. my $date = `date +%Y-%m-%d`; chomp $date; $buildDir = $opt_buildDir ? $opt_buildDir : "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/ncbiRefSeq.$date";