src/hg/utils/automation/doAssemblyHub.pl dd27449e2ac4790137f84c232f6f3ff3f79e1556

dd27449e2ac4790137f84c232f6f3ff3f79e1556
hiram
  Tue Feb 4 14:07:15 2020 -0800
now using the new GCF FTP hierarchy layout refs #23891

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index 05d1cb2..6132439 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -14,30 +14,31 @@
 use File::stat;
 use warnings;
 use strict;
 use FindBin qw($Bin);
 use lib "$Bin";
 use HgAutomate;
 use HgRemoteScript;
 use HgStepManager;
 
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_buildDir
     $opt_sourceDir
+    $opt_species
     $opt_rmskSpecies
     $opt_augustusSpecies
     $opt_xenoRefSeq
     $opt_ucscNames
     $opt_asmHubName
     /;
 
 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
     [ { name => 'download',   func => \&doDownload },
       { name => 'sequence',   func => \&doSequence },
       { name => 'assemblyGap',   func => \&doAssemblyGap },
       { name => 'gatewayPage',   func => \&doGatewayPage },
       { name => 'cytoBand',   func => \&doCytoBand },
       { name => 'gc5Base',   func => \&doGc5Base },
@@ -50,90 +51,91 @@
       { name => 'gapOverlap',   func => \&doGapOverlap },
       { name => 'tandemDups',   func => \&doTandemDups },
       { name => 'cpgIslands',   func => \&doCpgIslands },
       { name => 'ncbiGene',   func => \&doNcbiGene },
       { name => 'ncbiRefSeq',   func => \&doNcbiRefSeq },
       { name => 'xenoRefGene',   func => \&doXenoRefGene },
       { name => 'augustus',   func => \&doAugustus },
       { name => 'trackDb',   func => \&doTrackDb },
       { name => 'cleanup', func => \&doCleanup },
     ]
 				);
 
 # Option defaults:
 my $dbHost = 'hgwdev';
 my $sourceDir = "/hive/data/outside/ncbi/genomes";
+my $species = "";       # usually found in asmId_assembly_report.txt
+my $ftpDir = "";	# will be determined from given asmId
 my $rmskSpecies = "";
 my $augustusSpecies = "human";
 my $xenoRefSeq = "/hive/data/genomes/asmHubs/VGP/xenoRefSeq";
 my $ucscNames = 0;  # default 'FALSE' (== 0)
 my $asmHubName = "n/a";  # directory name in: /gbdb/hubs/asmHubName
 my $workhorse = "hgwdev";  # default workhorse when none chosen
 my $fileServer = "hgwdev";  # default when none chosen
 my $bigClusterHub = "ku";  # default when none chosen
 my $smallClusterHub = "ku";  # default when none chosen
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 # key is original accession name from the remove.dups.list, value is 1
 my %dupAccessionList = {};
 
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
   print STDERR "
-usage: $base [options] genbank|refseq subGroup species asmId
+usage: $base [options] asmId
 required arguments:
-    genbank|refseq - specify either genbank or refseq hierarchy source
-    subGroup       - specify subGroup at NCBI FTP site, examples:
-                   - vertebrate_mammalian vertebrate_other plant etc...
-    species        - species directory at NCBI FTP site, examples:
-                   - Homo_sapiens Mus_musculus etc...
     asmId          - assembly identifier at NCBI FTP site, examples:
                    - GCF_000001405.32_GRCh38.p6 GCF_000001635.24_GRCm38.p4 etc..
 
 options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
     -buildDir dir     Construct assembly hub in dir instead of default
-       $HgAutomate::clusterData/asmHubs/{genbank|refseq}/subGroup/species/asmId/
+       $HgAutomate::clusterData/asmHubs/refseqBuild/GC[AF]/123/456/789/asmId/
     -sourceDir dir    Find assembly in dir instead of default:
-       $sourceDir/<genbank|refseq>/subGroup/species/all_assembly_versions/asmId
+       $sourceDir/GC[AF]/123/456/789/asmId
     -ucscNames        Translate NCBI/INSDC/RefSeq names to UCSC names
                       default is to use the given NCBI/INSDC/RefSeq names
     -asmHubName <name>  directory name in: /gbdb/hubs/asmHubName
-    -rmskSpecies to override command line 'species' name for repeat masker
+    -species <name>   use this species designation if there is no
+                      asmId_assembly_report.txt with an
+                      'Organism name:' entry to obtain species
+    -rmskSpecies <name> to override default 'species' name for repeat masker
+                      the default is found in the asmId_asssembly_report.txt
                       e.g. -rmskSpecies=viruses
     -augustusSpecies <human|chicken|zebrafish> default 'human'
     -xenoRefSeq </path/to/xenoRefSeqMrna> - location of xenoRefMrna.fa.gz
                 expanded directory of mrnas/ and xenoRefMrna.sizes, default
                 $xenoRefSeq
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
 						'workhorse' => $workhorse,
 						'fileServer' => $fileServer,
 						'bigClusterHub' => $bigClusterHub,
 						'smallClusterHub' => $smallClusterHub);
   print STDERR "
 Automates build of assembly hub.  Steps:
     download: sets up sym link working hierarchy from already mirrored
                 files from NCBI in:
-                      $sourceDir/{genbank|refseq}/
+                      $sourceDir/GC[AF]/123/456/789/asmId
     sequence: establish AGP and 2bit file from NCBI directory
     assemblyGap: create assembly and gap bigBed files and indexes
                  for assembly track names
     gatewayPage: create html/asmId.description.html contents (USE: asmHubName)
     cytoBand: create cytoBand track and navigation ideogram
     gc5Base: create bigWig file for gc5Base track
     repeatMasker: run repeat masker cluster run and create bigBed files for
                   the composite track categories of repeats
     simpleRepeat: run trf cluster run and create bigBed file for simple repeats
     allGaps: calculate all actual real gaps due to N's in sequence, can be
                   more than were specified in the AGP file
     idKeys: calculate md5sum for each sequence in the assembly to be used to
             find identical sequences in similar assemblies
     windowMasker: run windowMasker cluster run, create windowMasker bigBed file
                   and compute intersection with repeatMasker results
@@ -161,32 +163,32 @@
 Assumptions:
 1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for
    database/assembly \$db.
 2. $HgAutomate::clusterData/\$db/chrom.sizes contains all sequence names and sizes from
    \$db.2bit.
 3. The \$db.2bit files have already been distributed to cluster-scratch
    (/scratch/hg or /iscratch, /san etc).
 4. template?
 " if ($detailed);
   print "\n";
   exit $status;
 }
 
 
 # Globals:
-# Command line args: genbankRefseq subGroup species asmId
-my ($genbankRefseq, $subGroup, $species, $asmId);
+# Command line args: asmId
+my ( $asmId);
 # Other:
 my ($buildDir, $secondsStart, $secondsEnd, $assemblySource);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
 		      'buildDir=s',
 		      'sourceDir=s',
 		      'rmskSpecies=s',
 		      'augustusSpecies=s',
 		      'xenoRefSeq=s',
 		      'asmHubName=s',
 		      'ucscNames',
 		      @HgAutomate::commonOptionSpec,
 		      );
@@ -1496,31 +1498,30 @@
                     $workhorse, $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export asmId="$asmId"
 export buildDir="$buildDir"
 export liftFile="\$buildDir/sequence/\$asmId.ncbiToUcsc.lift"
 export target2bit="\$buildDir/\$asmId.2bit"
 
 if [ $buildDir/\$asmId.2bit -nt \$asmId.ncbiRefSeq.bb ]; then
 
 ~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -toGpWarnOnly -buildDir=`pwd` \\
       -bigClusterHub=$bigClusterHub -dbHost=$dbHost \\
       -liftFile="\$liftFile" \\
       -target2bit="\$target2bit" \\
       -stop=load -fileServer=$fileServer -smallClusterHub=$smallClusterHub -workhorse=$workhorse \\
-      $genbankRefseq $subGroup $species \\
       \$asmId \$asmId
 else
   printf "# ncbiRefSeq previously completed\\n" 1>&2
 fi
 _EOF_
   );
   $bossScript->execute();
 } # ncbiRefSeq
 
 #########################################################################
 # * step: augustus [workhorse]
 sub doAugustus {
   my $runDir = "$buildDir/trackData/augustus";
 
   &HgAutomate::mustMkdir($runDir);
@@ -1579,87 +1580,113 @@
 } # doXenoRefGene
 
 #########################################################################
 # * step: trackDb [workhorse]
 sub doTrackDb {
   my $runDir = "$buildDir";
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "construct asmId.trackDb.txt file";
   my $bossScript = newBash HgRemoteScript("$runDir/doTrackDb.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 
-\$HOME/kent/src/hg/utils/automation/asmHubTrackDb.sh $genbankRefseq \$asmId $runDir \\
+\$HOME/kent/src/hg/utils/automation/doHubTrackDb.sh \$asmId $runDir \\
    > \$asmId.trackDb.txt
 
 _EOF_
   );
   $bossScript->execute();
 } # trackDb
 
 #########################################################################
 # * step: cleanup [fileServer]
 sub doCleanup {
   my $runDir = "$buildDir";
   my $whatItDoes = "clean up or compresses intermediate files.";
   my $bossScript = newBash HgRemoteScript("$runDir/doCleanup.bash", $fileServer,
 				      $runDir, $whatItDoes);
   $bossScript->add(<<_EOF_
 printf "to be done\\n" 1>&2
 _EOF_
   );
   $bossScript->execute();
 } # doCleanup
 
 #########################################################################
 # main
 
 # Prevent "Suspended (tty input)" hanging:
 &HgAutomate::closeStdin();
 
 # Make sure we have valid options and exactly 1 argument:
 &checkOptions();
-&usage(1) if (scalar(@ARGV) != 4);
+&usage(1) if (scalar(@ARGV) != 1);
 $secondsStart = `date "+%s"`;
 chomp $secondsStart;
 
 # expected command line arguments after options are processed
-($genbankRefseq, $subGroup, $species, $asmId) = @ARGV;
+($asmId) = @ARGV;
+# yes, there can be more than two fields separated by _
+# but in this case, we only care about the first two:
+# GC[AF]_123456789.3_assembly_Name
+#   0         1         2      3 ....
+my @partNames = split('_', $asmId);
+$ftpDir = sprintf("%s/%s/%s/%s/%s", $partNames[0],
+   substr($partNames[1],0,3), substr($partNames[1],3,3),
+   substr($partNames[1],6,3), $asmId);
 
 # Force debug and verbose until this is looking pretty solid:
 # $opt_debug = 1;
 # $opt_verbose = 3 if ($opt_verbose < 3);
 
 # Establish what directory we will work in.
 $buildDir = $opt_buildDir ? $opt_buildDir :
-  "$HgAutomate::clusterData/asmHubs/$genbankRefseq/$subGroup/$species/$asmId";
+  "$HgAutomate::clusterData/asmHubs/refseqBuild/$ftpDir";
+
+$assemblySource = $opt_sourceDir ? "$sourceDir" : "$sourceDir/$ftpDir";
+my $asmReport = "$assemblySource/${asmId}_assembly_report.txt";
+
+$species = $opt_species ? $opt_species : $species;
+
+if (length($species) < 1) {
+  if (-s "$asmReport") {
+     $species = `grep -i "organism name:" $asmReport`;
+     chomp $species;
+     $species =~ s/.*organism\s+name:\s+//i;
+     $species =~ s/\s+\(.*//;
+  } else {
+     die "no -species specified and can not find $asmReport";
+  }
+  if (length($species) < 1) {
+     die "no -species specified and can not find Organism name: in $asmReport";
+  }
+}
 
 $sourceDir = $opt_sourceDir ? $opt_sourceDir : $sourceDir;
 $rmskSpecies = $opt_rmskSpecies ? $opt_rmskSpecies : $species;
 $augustusSpecies = $opt_augustusSpecies ? $opt_augustusSpecies : $augustusSpecies;
 $xenoRefSeq = $opt_xenoRefSeq ? $opt_xenoRefSeq : $xenoRefSeq;
 $ucscNames = $opt_ucscNames ? 1 : $ucscNames;   # '1' == 'TRUE'
 $workhorse = $opt_workhorse ? $opt_workhorse : $workhorse;
 $bigClusterHub = $opt_bigClusterHub ? $opt_bigClusterHub : $bigClusterHub;
 $smallClusterHub = $opt_smallClusterHub ? $opt_smallClusterHub : $smallClusterHub;
 $fileServer = $opt_fileServer ? $opt_fileServer : $fileServer;
 $asmHubName = $opt_asmHubName ? $opt_asmHubName : $asmHubName;
 
-$assemblySource = $opt_sourceDir ? "$sourceDir" : "$sourceDir/$genbankRefseq/$subGroup/$species/all_assembly_versions/$asmId";
 
 die "can not find assembly source directory\n$assemblySource" if ( ! -d $assemblySource);
 printf STDERR "# buildDir: %s\n", $buildDir;
 printf STDERR "# sourceDir %s\n", $sourceDir;
 printf STDERR "# augustusSpecies %s\n", $augustusSpecies;
 printf STDERR "# xenoRefSeq %s\n", $xenoRefSeq;
 printf STDERR "# assemblySource: %s\n", $assemblySource;
 
 # Do everything.
 $stepper->execute();
 
 # Tell the user anything they should know.
 my $stopStep = $stepper->getStopStep();
 my $upThrough = ($stopStep eq 'cleanup') ? "" :
   "  (through the '$stopStep' step)";