940dbe5d9aa8d7d58f364d2e152324f2ec4ec331
hiram
  Tue Jan 14 11:38:33 2020 -0800
now running up ncbiRefSeq gene tracks refs #24748

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index b39de80..fe6735a 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -38,30 +38,31 @@
       { name => 'sequence',   func => \&doSequence },
       { name => 'assemblyGap',   func => \&doAssemblyGap },
       { name => 'gatewayPage',   func => \&doGatewayPage },
       { name => 'cytoBand',   func => \&doCytoBand },
       { name => 'gc5Base',   func => \&doGc5Base },
       { name => 'repeatMasker',   func => \&doRepeatMasker },
       { name => 'simpleRepeat',   func => \&doSimpleRepeat },
       { name => 'allGaps',   func => \&doAllGaps },
       { name => 'idKeys',   func => \&doIdKeys },
       { name => 'windowMasker',   func => \&doWindowMasker },
       { name => 'addMask',   func => \&doAddMask },
       { name => 'gapOverlap',   func => \&doGapOverlap },
       { name => 'tandemDups',   func => \&doTandemDups },
       { name => 'cpgIslands',   func => \&doCpgIslands },
       { name => 'ncbiGene',   func => \&doNcbiGene },
+      { name => 'ncbiRefSeq',   func => \&doNcbiRefSeq },
       { name => 'xenoRefGene',   func => \&doXenoRefGene },
       { name => 'augustus',   func => \&doAugustus },
       { name => 'trackDb',   func => \&doTrackDb },
       { name => 'cleanup', func => \&doCleanup },
     ]
 				);
 
 # Option defaults:
 my $dbHost = 'hgwdev';
 my $sourceDir = "/hive/data/outside/ncbi/genomes";
 my $augustusSpecies = "human";
 my $xenoRefSeq = "/hive/data/genomes/asmHubs/VGP/xenoRefSeq";
 my $ucscNames = 0;  # default 'FALSE' (== 0)
 my $asmHubName = "n/a";  # directory name in: /gbdb/hubs/asmHubName
 my $workhorse = "hgwdev";  # default workhorse when none chosen
@@ -128,30 +129,32 @@
     simpleRepeat: run trf cluster run and create bigBed file for simple repeats
     allGaps: calculate all actual real gaps due to N's in sequence, can be
                   more than were specified in the AGP file
     idKeys: calculate md5sum for each sequence in the assembly to be used to
             find identical sequences in similar assemblies
     windowMasker: run windowMasker cluster run, create windowMasker bigBed file
                   and compute intersection with repeatMasker results
     addMask: combine the higher masking of (windowMasker or repeatMasker) with
                   trf simpleRepeats into one 2bit file
     gapOverlap: find duplicated sequence on each side of a gap
     tandemDups: annotate all pairs of duplicated sequence with some gap between
     cpgIslands: run CpG islands cluster runs for both masked and unmasked
                 sequences and create bigBed files for this composite track
     ncbiGene: on RefSeq assemblies, construct a gene track from the
               NCBI gff3 predictions
+    ncbiRefSeq: on RefSeq assemblies, construct a gene track from the
+                NCBI gff3 predictions
     xenoRefSeq: map RefSeq mRNAs to the assembly to construct a 'xeno'
                 gene prediction track
     augustus: run the augustus gene prediction on the assembly
     trackDb: create trackDb.txt file for assembly hub to include all constructed
              bigBed and bigWig tracks
     cleanup: Removes or compresses intermediate files. (NOOP at this time !)
 All operations are performed in the build directory which is
 $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/template.\$date unless -buildDir is given.
 ";
   # Detailed help (-help):
   print STDERR "
 Assumptions:
 1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for
    database/assembly \$db.
 2. $HgAutomate::clusterData/\$db/chrom.sizes contains all sequence names and sizes from
@@ -1458,30 +1461,69 @@
     exit 255
   fi
   touch -r\$gffFile \$asmId.ncbiGene.bb
   bigBedInfo \$asmId.ncbiGene.bb | egrep "^itemCount:|^basesCovered:" \\
     | sed -e 's/,//g' > \$asmId.ncbiGene.stats.txt
   LC_NUMERIC=en_US /usr/bin/printf "# ncbiGene %s %'d %s %'d\\n" `cat \$asmId.ncbiGene.stats.txt` | xargs echo
 else
   printf "# ncbiGene previously completed\\n" 1>&2
 fi
 _EOF_
   );
   $bossScript->execute();
 } # doNcbiGene
 
 #########################################################################
+# * step: ncbiRefSeq [workhorse]
+sub doNcbiRefSeq {
+  my $runDir = "$buildDir/trackData/ncbiRefSeq";
+  my $gffFile = "$assemblySource/${asmId}_genomic.gff.gz";
+  # no (or empty) NCBI gff file: report on STDERR and skip the step, not fatal
+  if ( ! -s "${gffFile}" ) {
+    printf STDERR "# step ncbiRefSeq no gff file found at:\n#  %s\n", $gffFile;
+    return;
+  }
+  &HgAutomate::mustMkdir($runDir);
+  my $whatItDoes = "run NCBI RefSeq gene procedures";
+  my $bossScript = newBash HgRemoteScript("$runDir/doNcbiRefSeq.bash",
+                    $workhorse, $runDir, $whatItDoes);
+  # script runs doNcbiRefSeq.pl (with -stop=load) only when the target 2bit is
+  # newer than asmId.ncbiRefSeq.bb, or that .bb file does not exist yet
+  $bossScript->add(<<_EOF_
+export asmId="$asmId"
+export buildDir="$buildDir"
+export liftFile="\$buildDir/sequence/\$asmId.ncbiToUcsc.lift"
+export target2bit="\$buildDir/\$asmId.2bit"
+
+if [ "\$target2bit" -nt "\$asmId.ncbiRefSeq.bb" ]; then
+
+~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \\
+      -bigClusterHub=$bigClusterHub -dbHost=$dbHost \\
+      -liftFile="\$liftFile" \\
+      -target2bit="\$target2bit" \\
+      -stop=load -fileServer=$fileServer -smallClusterHub=$smallClusterHub -workhorse=$workhorse \\
+      $genbankRefseq $subGroup $species \\
+      \$asmId \$asmId
+else
+  printf "# ncbiRefSeq previously completed\\n" 1>&2
+fi
+_EOF_
+  );
+  $bossScript->execute();
+} # doNcbiRefSeq
+
+#########################################################################
 # * step: augustus [workhorse]
 sub doAugustus {
   my $runDir = "$buildDir/trackData/augustus";
 
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "run Augustus gene prediction procedures";
   my $bossScript = newBash HgRemoteScript("$runDir/doAugustus.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 
 if [ $buildDir/\$asmId.2bit -nt \$asmId.augustus.bb ]; then
   time (~/kent/src/hg/utils/automation/doAugustus.pl -stop=makeGp -buildDir=`pwd` -dbHost=$dbHost \\