dd27449e2ac4790137f84c232f6f3ff3f79e1556
hiram
  Tue Feb 4 14:07:15 2020 -0800
now using the new GCF FTP hierarchy layout refs #23891
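
The FTP directory is now derived from the accession digits in the asmId
rather than from the genbank|refseq, subGroup and species arguments.  A
rough example of the new two-argument invocation and the resulting ftpDir
values (hg38 is assumed here as the UCSC database matching GRCh38):

    doNcbiRefSeq.pl GCF_000001405.32_GRCh38.p6 hg38

    old ftpDir: genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.32_GRCh38.p6
    new ftpDir: GCF/000/001/405/GCF_000001405.32_GRCh38.p6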

diff --git src/hg/utils/automation/doNcbiRefSeq.pl src/hg/utils/automation/doNcbiRefSeq.pl
index a8ce0d3f..5303e5a 100755
--- src/hg/utils/automation/doNcbiRefSeq.pl
+++ src/hg/utils/automation/doNcbiRefSeq.pl
@@ -17,33 +17,30 @@
 use HgAutomate;
 use HgRemoteScript;
 use HgStepManager;
 
 my $doIdKeys = "$Bin/doIdKeys.pl";
 my $gff3ToRefLink = "$Bin/gff3ToRefLink.pl";
 my $gbffToCds = "$Bin/gbffToCds.pl";
 my $ncbiRefSeqOtherIxIxx = "$Bin/ncbiRefSeqOtherIxIxx.pl";
 my $ncbiRefSeqOtherAttrs = "$Bin/ncbiRefSeqOtherAttrs.pl";
 
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_buildDir
-    $opt_genbank
-    $opt_subgroup
-    $opt_species
     $opt_liftFile
     $opt_target2bit
     $opt_toGpWarnOnly
     /;
 
 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
     [ { name => 'download', func => \&doDownload },
       { name => 'process', func => \&doProcess },
       { name => 'load', func => \&doLoad },
       { name => 'cleanup', func => \&doCleanup },
     ]
 				);
 
 # Option defaults:
@@ -51,37 +48,32 @@
 my $bigClusterHub = 'ku';
 my $smallClusterHub = 'ku';
 my $workhorse = 'hgwdev';
 my $defaultWorkhorse = 'hgwdev';
 my $defaultFileServer = 'hgwdev';
 my $fileServer = 'hgwdev';
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
   print STDERR "
-usage: $base [options] genbank|refseq subGroup species asmId db
+usage: $base [options] asmId db
 required arguments:
-    genbank|refseq - specify either genbank or refseq hierarchy source
-    subGroup       - specify subGroup at NCBI FTP site, examples:
-                   - vertebrate_mammalian vertebrate_other plant etc...
-    species        - species directory at NCBI FTP site, examples:
-                   - Homo_sapiens Mus_musculus etc...
     asmId          - assembly identifier at NCBI FTP site, examples:
                    - GCF_000001405.32_GRCh38.p6 GCF_000001635.24_GRCm38.p4 etc..
     db             - database to load with track tables
 
 options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
     -buildDir dir         Use dir instead of default
                           $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/ncbiRefSeq.\$date
                           (necessary when continuing at a later date).
     -toGpWarnOnly         add -warnAndContinue to the gff3ToGenePred operation
                           to avoid gene definitions that will not convert
     -liftFile pathName    a lift file to translate NCBI names to local genome
                           names
@@ -110,128 +102,130 @@
   # Detailed help (-help):
   print STDERR "
 Assumptions:
 1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for
    database/assembly \$db.
 2. $HgAutomate::clusterData/\$db/chrom.sizes contains all sequence names and sizes from
    \$db.2bit.
 NOTE: Override these assumptions with the -target2Bit option
 " if ($detailed);
   print "\n";
   exit $status;
 }
 
 
 # Globals:
-# Command line args: genbankRefseq subGroup species asmId db
-my ($genbankRefseq, $subGroup, $species, $asmId, $db, $ftpDir);
+# Command line args: asmId db
+my ($asmId, $db);
 # Other:
-my ($buildDir, $toGpWarnOnly, $dbExists, $liftFile, $target2bit);
+my ($ftpDir, $buildDir, $toGpWarnOnly, $dbExists, $liftFile, $target2bit);
 my ($secondsStart, $secondsEnd);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
 		      'buildDir=s',
 		      'liftFile=s',
 		      'target2bit=s',
 		      'toGpWarnOnly',
 		      @HgAutomate::commonOptionSpec,
 		      );
   &usage(1) if (!$ok);
   &usage(0, 1) if ($opt_help);
   &HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $dbHost = $opt_dbHost if ($opt_dbHost);
   $workhorse = $opt_workhorse if ($opt_workhorse);
   $bigClusterHub = $opt_bigClusterHub if ($opt_bigClusterHub);
   $smallClusterHub = $opt_smallClusterHub if ($opt_smallClusterHub);
   $fileServer = $opt_fileServer if ($opt_fileServer);
 }
 
 #########################################################################
 # * step: download [workhorse]
 sub doDownload {
   my $filesFound = 0;
   my @requiredFiles = qw( genomic.gff.gz rna.fna.gz rna.gbff.gz protein.faa.gz );
   my $filesExpected = scalar(@requiredFiles);
   foreach my $expectFile (@requiredFiles) {
-    if ( -s "/hive/data/outside/ncbi/${asmId}_${expectFile}" ) {
+    if ( -s "/hive/data/outside/ncbi/genomes/$ftpDir/${asmId}_${expectFile}" ) {
       ++$filesFound;
     } else {
       printf STDERR "# doNcbiRefSeq.pl: missing required file /hive/data/outside/ncbi/${asmId}_${expectFile}\n";
     }
   }
 
   if ($filesFound < $filesExpected) {
     printf STDERR "# doNcbiRefSeq.pl download: can not find all files required\n";
     exit 0;
   }
   my $runDir = "$buildDir/download";
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "download required set of files from NCBI.";
   my $bossScript = newBash HgRemoteScript("$runDir/doDownload.bash", $workhorse,
 				      $runDir, $whatItDoes);
-  my $outsideCopy = "/hive/data/outside/ncbi/$ftpDir";
-  my $localData = "/hive/data/inside/ncbi/$ftpDir";
-  $localData =~ s/all_assembly_versions/latest_assembly_versions/;
-  my $local2Bit = "$localData/$asmId.ncbi.2bit";
+  my $outsideCopy = "/hive/data/outside/ncbi/genomes/$ftpDir";
+  # might already have the NCBI 2bit file here:
+  my $localData = $buildDir;
+  $localData =~ s#trackData/ncbiRefSeq#download#;
+  my $local2Bit = "$localData/$asmId.2bit";
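+  # e.g. (illustrative) a buildDir of .../trackData/ncbiRefSeq maps to a
+  # sibling .../download directory that may already contain $asmId.2bit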
 
   # establish variables
   $bossScript->add(<<_EOF_
 # establish all potential variables to use here, not all may be used
 
 export outsideCopy=$outsideCopy
 export asmId=$asmId
 export ftpDir=$ftpDir
 export runDir=$runDir
 export db=$db
 
 _EOF_
     );
 
 printf STDERR "# checking $outsideCopy\n";
+printf STDERR "# checking $local2Bit\n";
 
   # see if local symLinks can be made with copies already here from NCBI:
   if ( -d "$outsideCopy" ) {
     $bossScript->add(<<_EOF_
 # local file copies exist, use symlinks
 
 ln -f -s \$outsideCopy/\${asmId}_genomic.gff.gz .
 ln -f -s \$outsideCopy/\${asmId}_rna.fna.gz .
 ln -f -s \$outsideCopy/\${asmId}_rna.gbff.gz .
 ln -f -s \$outsideCopy/\${asmId}_protein.faa.gz .
 _EOF_
     );
   } else {
     $bossScript->add(<<_EOF_
 # local file copies do not exist, download from NCBI:
 
 for F in _rna.gbff _rna.fna _protein.faa _genomic.gff
 do
    rsync -a -P \\
        rsync://ftp.ncbi.nlm.nih.gov/\$ftpDir/\$asmId\${F}.gz ./
 done
 _EOF_
     );
   }
 
   if ( -s $local2Bit ) {
     $bossScript->add(<<_EOF_
-ln -f -s $local2Bit .
+ln -f -s $local2Bit \${asmId}.ncbi.2bit
 _EOF_
     );
   } elsif ( -s "$outsideCopy/${asmId}_genomic.fna.gz") {
     $bossScript->add(<<_EOF_
 # build \$asmId.ncbi.2bit from local copy of genomic fasta
 
 faToTwoBit \$outsideCopy/\${asmId}_genomic.fna.gz \${asmId}.ncbi.2bit
 _EOF_
     );
   } else {
     $bossScript->add(<<_EOF_
 # download genomic fasta and build \${asmId}.ncbi.2bit
 rsync -a -P \\
        rsync://ftp.ncbi.nlm.nih.gov/\$ftpDir/\${asmId}_genomic.fna.gz ./
 faToTwoBit \${asmId}_genomic.fna.gz \${asmId}.ncbi.2bit
@@ -747,43 +741,50 @@
 gunzip process/ncbiRefSeqVersion.txt.gz
 _EOF_
   );
   $bossScript->execute();
 } # doCleanup
 
 
 #########################################################################
 # main
 
 # Prevent "Suspended (tty input)" hanging:
 &HgAutomate::closeStdin();
 
-# Make sure we have valid options and exactly 1 argument:
+# Make sure we have valid options and exactly 2 arguments:
 &checkOptions();
-&usage(1) if (scalar(@ARGV) != 5);
+&usage(1) if (scalar(@ARGV) != 2);
 
 $toGpWarnOnly = 0;
 $toGpWarnOnly = 1 if ($opt_toGpWarnOnly);
 $liftFile = $opt_liftFile ? $opt_liftFile : "";
 $target2bit = $opt_target2bit ? $opt_target2bit : "";
 
 $secondsStart = `date "+%s"`;
 chomp $secondsStart;
 
 # expected command line arguments after options are processed
-($genbankRefseq, $subGroup, $species, $asmId, $db) = @ARGV;
-$ftpDir = "genomes/$genbankRefseq/$subGroup/$species/all_assembly_versions/$asmId";
+($asmId, $db) = @ARGV;
+# there can be more than two fields separated by _,
+# but only the first two matter for building the path:
+# GC[AF]_123456789.3_assembly_Name
+#   0         1         2      3 ....
+my @partNames = split('_', $asmId);
+$ftpDir = sprintf("%s/%s/%s/%s/%s", $partNames[0],
+   substr($partNames[1],0,3), substr($partNames[1],3,3),
+   substr($partNames[1],6,3), $asmId);
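+# worked example: GCF_000001405.32_GRCh38.p6 becomes
+#   GCF/000/001/405/GCF_000001405.32_GRCh38.p6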
 
 if ( -z "$liftFile" && ! -s "/hive/data/genomes/$db/bed/idKeys/$db.idKeys.txt") {
   die "ERROR: can not find /hive/data/genomes/$db/bed/idKeys/$db.idKeys.txt\n\t  need to run doIdKeys.pl for $db before this procedure.";
 }
 
 # Force debug and verbose until this is looking pretty solid:
 # $opt_debug = 1;
 # $opt_verbose = 3 if ($opt_verbose < 3);
 
 # Establish what directory we will work in.
 my $date = `date +%Y-%m-%d`;
 chomp $date;
 $buildDir = $opt_buildDir ? $opt_buildDir :
   "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/ncbiRefSeq.$date";