src/hg/utils/automation/doNcbiRefSeq.pl 653eaa635409806f4885b0486125efd4390c0931

653eaa635409806f4885b0486125efd4390c0931
hiram
  Thu Aug 27 14:00:07 2020 -0700
fix scripts to behave properly when not using UCSC sequence names refs #23734

diff --git src/hg/utils/automation/doNcbiRefSeq.pl src/hg/utils/automation/doNcbiRefSeq.pl
index 177d402..702fa7d 100755
--- src/hg/utils/automation/doNcbiRefSeq.pl
+++ src/hg/utils/automation/doNcbiRefSeq.pl
@@ -1,47 +1,42 @@
 #!/usr/bin/env perl
 
 # DO NOT EDIT the /cluster/bin/scripts copy of this file --
 # edit ~/kent/src/hg/utils/automation/doNcbiRefSeq.pl instead.
 
-# HOW TO USE THIS TEMPLATE:
-# 1. Global-replace doNcbiRefSeq.pl with your actual script name.
-# 2. Search for template and replace each instance with something appropriate.
-#    Add steps and subroutines as needed.  Other do*.pl or make*.pl may have
-#    useful example code -- this is just a skeleton.
-
 use Getopt::Long;
 use warnings;
 use strict;
 use FindBin qw($Bin);
 use lib "$Bin";
 use HgAutomate;
 use HgRemoteScript;
 use HgStepManager;
 
 my $doIdKeys = "$Bin/doIdKeys.pl";
 my $gff3ToRefLink = "$Bin/gff3ToRefLink.pl";
 my $gbffToCds = "$Bin/gbffToCds.pl";
 my $ncbiRefSeqOtherIxIxx = "$Bin/ncbiRefSeqOtherIxIxx.pl";
 my $ncbiRefSeqOtherAttrs = "$Bin/ncbiRefSeqOtherAttrs.pl";
 
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_buildDir
     $opt_liftFile
+    $opt_assemblyHub
     $opt_target2bit
     $opt_toGpWarnOnly
     /;
 
 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
     [ { name => 'download', func => \&doDownload },
       { name => 'process', func => \&doProcess },
       { name => 'load', func => \&doLoad },
       { name => 'cleanup', func => \&doCleanup },
     ]
 				);
 
 # Option defaults:
 my $dbHost = 'hgwdev';
@@ -61,30 +56,32 @@
   # Basic help (for incorrect usage):
   print STDERR "
 usage: $base [options] asmId db
 required arguments:
     asmId          - assembly identifier at NCBI FTP site, examples:
                    - GCF_000001405.32_GRCh38.p6 GCF_000001635.24_GRCm38.p4 etc..
     db             - database to load with track tables
 
 options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
     -buildDir dir         Use dir instead of default
                           $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/ncbiRefSeq.\$date
                           (necessary when continuing at a later date).
+    -assemblyHub          the processing is taking place in an assembly hub
+                          build
     -toGpWarnOnly         add -warnAndContinue to the gff3ToGenePred operation
                           to avoid gene definitions that will not convert
     -liftFile pathName    a lift file to translate NCBI names to local genome
                           names
     -target2bit pathName  when not a local UCSC genome, use this 2bit file for
                           the target genome sequence
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
 					'workhorse' => $defaultWorkhorse,
 					'fileServer' => $defaultFileServer,
 					'bigClusterHub' => $bigClusterHub,
 					'smallClusterHub' => $smallClusterHub);
   print STDERR "
 Automates UCSC's ncbiRefSeq track build.  Steps:
@@ -105,39 +102,40 @@
 1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for
    database/assembly \$db.
 2. $HgAutomate::clusterData/\$db/chrom.sizes contains all sequence names and sizes from
    \$db.2bit.
 NOTE: Override these assumptions with the -target2Bit option
 " if ($detailed);
   print "\n";
   exit $status;
 }
 
 
 # Globals:
 # Command line args: asmId db
 my ($asmId, $db);
 # Other:
-my ($ftpDir, $buildDir, $toGpWarnOnly, $dbExists, $liftFile, $target2bit);
+my ($ftpDir, $buildDir, $assemblyHub, $toGpWarnOnly, $dbExists, $liftFile, $target2bit);
 my ($secondsStart, $secondsEnd);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
 		      'buildDir=s',
 		      'liftFile=s',
 		      'target2bit=s',
+		      'assemblyHub',
 		      'toGpWarnOnly',
 		      @HgAutomate::commonOptionSpec,
 		      );
   &usage(1) if (!$ok);
   &usage(0, 1) if ($opt_help);
   &HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $dbHost = $opt_dbHost if ($opt_dbHost);
   $workhorse = $opt_workhorse if ($opt_workhorse);
   $bigClusterHub = $opt_bigClusterHub if ($opt_bigClusterHub);
   $smallClusterHub = $opt_smallClusterHub if ($opt_smallClusterHub);
   $fileServer = $opt_fileServer if ($opt_fileServer);
 }
 
@@ -246,50 +244,54 @@
 ### the sort -k2,2 -u eliminates the problem of duplicate sequences existing
 ### in the target $db assembly
        $bossScript->add(<<_EOF_
 cd \$runDir
 hgsql -N -e 'select 0,name,chromEnd,chrom,chromEnd from ucscToRefSeq;' \${db} \\
      | sort -k2,2 -u > \${asmId}To\${db}.lift
 _EOF_
        );
     }
   }
 
   my $dbTwoBit = "$HgAutomate::clusterData/$db/$db.2bit";
   $dbTwoBit = $target2bit if (-s "$target2bit");
 
   if (! $haveLiftFile ) {
+     # if this is an assembly hub and it did NOT supply a lift file,
+     # then we don't need one, do not attempt to build
+     if (! $opt_assemblyHub) {
        $bossScript->add(<<_EOF_
 # generate idKeys for the NCBI sequence to translate names to UCSC equivalents
 
 rm -rf \$runDir/idKeys
 mkdir -p \$runDir/idKeys
 cd \$runDir/idKeys
 time ($doIdKeys -buildDir=\$runDir/idKeys -twoBit=\$runDir/\$asmId.ncbi.2bit \$db) > idKeys.log 2>&1
 cd \$runDir
 twoBitInfo $dbTwoBit stdout | sort -k2,2nr > \$db.chrom.sizes
 ln -f -s idKeys/\$db.idKeys.txt ./ncbi.\$asmId.idKeys.txt
 ln -f -s /hive/data/genomes/\$db/bed/idKeys/\$db.idKeys.txt ./ucsc.\$db.idKeys.txt
 # joining the idKeys establishes a lift file to translate chrom names
 join -t\$'\\t' ucsc.\$db.idKeys.txt ncbi.\$asmId.idKeys.txt \\
    | cut -f2-3 | sort \\
      | join -t\$'\\t' - <(sort \$db.chrom.sizes) \\
        | awk -F\$'\\t' '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$2, \$3, \$1, \$3}' \\
           | sort -k3nr > \${asmId}To\${db}.lift
 _EOF_
        );
      }
+  }
 
   $bossScript->add(<<_EOF_
 cd \$runDir
 twoBitInfo \$asmId.ncbi.2bit stdout | sort -k2nr > \$asmId.ncbi.chrom.sizes
 zcat \${asmId}_rna.fna.gz | sed -e 's/ .*//;' | gzip -c > \$asmId.rna.fa.gz
 faSize -detailed \${asmId}.rna.fa.gz | sort -k2nr > rna.sizes
 
 # genbank processor extracts infomation about the RNAs
 /hive/data/outside/genbank/bin/x86_64/gbProcess \\
     -inclXMs /dev/null \$asmId.raFile.txt \${asmId}_rna.gbff.gz
 _EOF_
   );
   $bossScript->execute();
 } # doDownload
 
@@ -301,30 +303,31 @@
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "process NCBI download files into track files.";
   # Use dbHost since genePredCheck -db=$db needs database
   my $bossScript = newBash HgRemoteScript("$runDir/doProcess.bash", $dbHost,
 				      $runDir, $whatItDoes);
 
   my $genePredCheckDb = "genePredCheck -db=\$db";
   if (! $dbExists) {
     $genePredCheckDb = "genePredCheck";
   }
   my $warnOnly = "";
   $warnOnly = "-warnAndContinue" if ($toGpWarnOnly);
 
   my $localLiftFile = "\$downloadDir/\${asmId}To\${db}.lift";
+  $localLiftFile = "" if (! -s "$localLiftFile");
   $localLiftFile = $liftFile if (-s $liftFile);
   my $pslTargetSizes = "-db=\$db";
   my $fakePslSizes = "";
   if (-s "$target2bit") {
     $pslTargetSizes = "-targetSizes=\$db.chrom.sizes";
     $fakePslSizes = "-chromSize=\$db.chrom.sizes";
   }
 
   my $dbTwoBit = "$HgAutomate::clusterData/$db/$db.2bit";
   $dbTwoBit = $target2bit if (-s "$target2bit");
 
   $bossScript->add(<<_EOF_
 # establish all variables to use here
 
 export asmId=$asmId
@@ -358,33 +361,47 @@
        > \$asmId.refseqSelectTranscripts.txt
 fi
 rm -f before.cut9.txt
 
 # extract labels from semi-structured text in gbff COMMENT/description sections:
 zcat \$downloadDir/\${asmId}_rna.gbff.gz \\
   | (grep ' :: ' || true) \\
     | perl -wpe 's/\\s+::.*//; s/^\\s+//;' \\
       | sort -u \\
         > pragmaLabels.txt
 
 # extract cross reference text for refLink
 \$gff3ToRefLink \$downloadDir/\$asmId.raFile.txt \$ncbiGffGz pragmaLabels.txt 2> \$db.refLink.stderr.txt \\
   | sort > \$asmId.refLink.tab
 
+_EOF_
+  );
+  if ( -s "$localLiftFile") {
+  $bossScript->add(<<_EOF_
 # converting the NCBI coordinates to UCSC coordinates
 liftUp -extGenePred -type=.gp stdout $localLiftFile warn \$asmId.gp \\
   | gzip -c > \$asmId.\$db.gp.gz
+_EOF_
+  );
+  } else {
+  $bossScript->add(<<_EOF_
+# no lifting necessary
+cat \$asmId.gp | gzip -c > \$asmId.\$db.gp.gz
+_EOF_
+  );
+  }
+  $bossScript->add(<<_EOF_
 $genePredCheckDb \$asmId.\$db.gp.gz
 zcat \$asmId.\$db.gp.gz > ncbiRefSeq.\$dateStamp
 genePredToGtf -utr file ncbiRefSeq.\$dateStamp stdout | gzip -c \\
   > ../\$db.\$dateStamp.ncbiRefSeq.gtf.gz
 rm -f ncbiRefSeq.\$dateStamp
 
 # curated subset of all genes
 (zegrep "^[NY][MRP]_" \$asmId.\$db.gp.gz || true) > \$db.curated.gp
 # may not be any curated genes
 if [ ! -s \$db.curated.gp ]; then
   rm -f \$db.curated.gp
 elif [ -s \$asmId.refseqSelectTranscripts.txt ]; then
   cat \$db.curated.gp | fgrep -f \$asmId.refseqSelectTranscripts.txt - \\
     > \$db.refseqSelect.curated.gp
   # may not be any refseqSelect.curated genes
@@ -435,58 +452,90 @@
 
 # PSL data will be loaded into a psl type track to show the alignments
 (zgrep "^#" \$ncbiGffGz | head || true) > gffForPsl.gff
 if [ -s ../../../download/\${asmId}.remove.dups.list ]; then
   (zegrep -v "NG_" \$ncbiGffGz || true) \\
     | grep -v -f ../../../download/\${asmId}.remove.dups.list \\
     | awk -F\$'\\t' '\$3 == "cDNA_match" || \$3 == "match"' >> gffForPsl.gff
   gff3ToPsl -dropT \$downloadDir/\$asmId.ncbi.chrom.sizes \$downloadDir/rna.sizes \\
     gffForPsl.gff stdout | pslPosTarget stdin \$asmId.psl
 else
   (zegrep -v "NG_" \$ncbiGffGz || true) \\
     | awk -F\$'\\t' '\$3 == "cDNA_match" || \$3 == "match"' >> gffForPsl.gff
   gff3ToPsl -dropT \$downloadDir/\$asmId.ncbi.chrom.sizes \$downloadDir/rna.sizes \\
     gffForPsl.gff stdout | pslPosTarget stdin \$asmId.psl
 fi
+# might be empty result
+if [ ! -s \$asmId.psl ]; then
+  rm -f \$asmId.psl
+else
+_EOF_
+  );
+  # note else clause of above if statement is concluded below in these two
+  # cases
+  if ( -s "$localLiftFile") {
+  $bossScript->add(<<_EOF_
  simpleChain -outPsl -maxGap=300000 \$asmId.psl stdout | pslSwap stdin stdout \\
    | liftUp -type=.psl stdout $localLiftFile warn stdin \\
      | gzip -c > \$db.psl.gz
+fi
+_EOF_
+  );
+  } else {
+  $bossScript->add(<<_EOF_
+ simpleChain -outPsl -maxGap=300000 \$asmId.psl stdout | pslSwap stdin stdout \\
+    | gzip -c > \$db.psl.gz
+fi
+_EOF_
+  );
+  }
+
+  $bossScript->add(<<_EOF_
+if [ -s \$db.psl.gz ]; then
   pslCheck $pslTargetSizes \\
     -querySizes=\$downloadDir/rna.sizes \$db.psl.gz
+fi
 
 # extract RNA CDS information from genbank record
 # Note: $asmId.raFile.txt could be used instead of _rna.gbff.gz
 \$gbffToCds \$downloadDir/\${asmId}_rna.gbff.gz | sort > \$asmId.rna.cds
 
 # the NCBI _genomic.gff.gz file only contains cDNA_match records for transcripts
 # that do not *exactly* match the reference genome.  For all other transcripts
 # construct 'fake' PSL records representing the alignments of all cDNAs
 # that would be perfect matches to the reference genome.  The pslFixCdsJoinGap
 # will fixup those records with unusual alignments due to frameshifts of
 # various sorts as found in the rna.cds file:
 genePredToFakePsl -qSizes=\$downloadDir/rna.sizes $fakePslSizes \$db \$db.ncbiRefSeq.gp \\
   stdout \$db.fake.cds \\
      | pslFixCdsJoinGap stdin \$asmId.rna.cds \$db.fake.psl
+if [ -s \$db.psl.gz ]; then
   pslCat -nohead \$db.psl.gz | cut -f10,14 > \$db.psl.names
+fi
 if [ -s \$db.psl.names ]; then
   pslSomeRecords -tToo -not \$db.fake.psl \$db.psl.names \$db.someRecords.psl
 else
   cp -p \$db.fake.psl \$db.someRecords.psl
 fi
+if [ -s \$db.psl.gz ]; then
   pslSort dirs stdout \\
- ./tmpdir \$db.psl.gz \$db.someRecords.psl \\
-   | (pslCheck -quiet $pslTargetSizes -pass=stdout -fail=\$asmId.\$db.fail.psl stdin || true) \\
+   ./tmpdir \$db.psl.gz \$db.someRecords.psl | gzip -c > \$db.sorted.psl.gz
+else
+  pslSort dirs stdout \\
+   ./tmpdir \$db.someRecords.psl | gzip -c > \$db.sorted.psl.gz
+fi
+(pslCheck -quiet $pslTargetSizes -pass=stdout -fail=\$asmId.\$db.fail.psl \$db.sorted.psl.gz || true) \\
     | pslPosTarget stdin stdout \\
     | pslRecalcMatch -ignoreQMissing stdin $dbTwoBit \$downloadDir/\$asmId.rna.fa.gz stdout \\
      | sort -k14,14 -k16,16n | gzip -c > \$asmId.\$db.psl.gz
 rm -fr ./tmpdir
 pslCheck $pslTargetSizes \$asmId.\$db.psl.gz
 _EOF_
   );
   $bossScript->execute();
 } # doProcess
 
 #########################################################################
 # * step: load [dbHost]
 sub doLoad {
   my $runDir = "$buildDir";
   &HgAutomate::mustMkdir($runDir);
@@ -617,60 +666,96 @@
         > \$db.ncbiRefSeqPepTable.tab
 
 # and load the fasta peptides, again, only those for items that exist
 pslCat -nohead process/\$asmId.\$db.psl.gz | cut -f10 \\
    | sort -u > \$db.psl.used.rna.list
 cut -f5 process/\$asmId.\$db.ncbiRefSeqLink.tab \\
    | grep . | sort -u > \$db.mrnaAcc.name.list
 sort -u \$db.psl.used.rna.list \$db.mrnaAcc.name.list > \$db.rna.select.list
 cut -f1 process/\$db.ncbiRefSeq.gp | sort -u > \$db.gp.name.list
 comm -12 \$db.rna.select.list \$db.gp.name.list > \$db.toLoad.rna.list
 comm -23 \$db.rna.select.list \$db.gp.name.list > \$db.not.used.rna.list
 faSomeRecords download/\$asmId.rna.fa.gz \$db.toLoad.rna.list \$db.rna.fa
 grep '^>' \$db.rna.fa | sed -e 's/^>//' | sort > \$db.rna.found.list
 comm -13 \$db.rna.found.list \$db.gp.name.list > \$db.noRna.available.list
 
+_EOF_
+    );
+
+  my $nonNucNames="chrM";
+
+  my $haveLiftFile = 0;
+  if ( -s "$liftFile" ) {
+       $haveLiftFile = 1;
+  }
+  # if this is an assembly hub, find out what the non-nuclear names are
+  if ( $opt_assemblyHub) {
+     if ( -s "../../sequence/$asmId.nonNucChr.names") {
+     # with lift file means to use UCSC name in the nonNucChr.names
+     if ($haveLiftFile ) {
+        $nonNucNames=`cut -f1 ../../sequence/$asmId.nonNucChr.names | xargs echo | tr ' ' '|'`;
+     } else {
+        $nonNucNames=`cut -f2 ../../sequence/$asmId.nonNucChr.names | xargs echo | tr ' ' '|'`;
+     }
+     chomp $nonNucNames;
+     }
+  }
+     # if this is an assembly hub and it did NOT supply a lift file,
+     # then we don't need one, do not attempt to build
+    $bossScript->add(<<_EOF_
+
 # If \$db.noRna.available.list is not empty but items are on chrM,
 # make fake cDNA sequence for them using chrM sequence
 # since NCBI puts proteins, not coding RNAs, in the GFF.
 if [ -s \$db.noRna.available.list ]; then
   pslCat -nohead process/\$asmId.\$db.psl.gz \\
     | grep -Fwf \$db.noRna.available.list \\
-      | grep chrM > missingChrMFa.psl
+      | egrep "$nonNucNames" > missingChrMFa.psl
   if [ -s missingChrMFa.psl ]; then
     pslToBed missingChrMFa.psl stdout \\
       | twoBitToFa -bed=stdin \$target2bit stdout >> \$db.rna.fa
   fi
 fi
 
+if [ -s process/\$asmId.rna.cds.gz ]; then
+  zcat process/\$asmId.rna.cds.gz egrep '[0-9]+\\.\\.[0-9]\\+' \\
+    pslMismatchGapToBed -cdsFile=stdin -db=\$db -ignoreQNamePrefix=X \\
+      process/\$asmId.\$db.psl.gz \$target2bit \\
+        \$db.rna.fa ncbiRefSeqGenomicDiff || true
+
+  wget -O txAliDiff.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/txAliDiff.as'
+  bedToBigBed -type=bed9+ -tab -as=txAliDiff.as \\
+    ncbiRefSeqGenomicDiff.bed \$db.chrom.sizes ncbiRefSeqGenomicDiff.bb
+fi
+
 export totalBases=`ave -col=2 \$db.chrom.sizes | grep "^total" | awk '{printf "%d", \$2}'`
 export basesCovered=`bedSingleCover.pl \$db.ncbiRefSeq.bigGp | ave -col=4 stdin | grep "^total" | awk '{printf "%d", \$2}'`
 export percentCovered=`echo \$basesCovered \$totalBases | awk '{printf "%.3f", 100.0*\$1/\$2}'`
 printf "%d bases of %d (%s%%) in intersection\\n" "\$basesCovered" \\
    "\$totalBases" "\$percentCovered" > fb.ncbiRefSeq.\$db.txt
 printf "%d bases of %d (%s%%) in intersection\\n" "\$baseCount" "\$asmSizeNoGaps" "\$perCent" > fb.\$asmId.ncbiRefSeq.txt
 
 rm -f \$db.ncbiRefSeq.bigGp \$asmId.exons.bed
 
 pslToBigPsl -fa=download/\$asmId.rna.fa.gz -cds=process/\$asmId.rna.cds \\
   process/\$asmId.\$db.psl.gz stdout | sort -k1,1 -k2,2n > \$asmId.bigPsl
 bedToBigBed -type=bed12+13 -tab -as=bigPsl.as -extraIndex=name \\
   \$asmId.bigPsl \$db.chrom.sizes \$asmId.bigPsl.bb
 rm -f \$asmId.bigPsl
 _EOF_
     );
-  } else {
+  } else {	# processing for a database genome
 
     $bossScript->add(<<_EOF_
 # loading the genePred tracks, all genes in one, and subsets
 hgLoadGenePred -genePredExt \$db ncbiRefSeq process/\$db.ncbiRefSeq.gp
 $genePredCheckDb ncbiRefSeq
 
 if [ -s process/\$db.curated.gp ]; then
   hgLoadGenePred -genePredExt \$db ncbiRefSeqCurated process/\$db.curated.gp
   $genePredCheckDb ncbiRefSeqCurated
   if [ -s process/\$db.refseqSelect.curated.gp ]; then
     hgLoadGenePred -genePredExt \$db ncbiRefSeqSelect process/\$db.refseqSelect.curated.gp
     $genePredCheckDb ncbiRefSeqSelect
   fi
 fi
 
@@ -727,30 +812,42 @@
   pslCat -nohead process/\$asmId.\$db.psl.gz \\
     | grep -Fwf \$db.noRna.available.list \\
       | grep chrM > missingChrMFa.psl
   if [ -s missingChrMFa.psl ]; then
     pslToBed missingChrMFa.psl stdout \\
       | twoBitToFa -bed=stdin $dbTwoBit stdout >> \$db.rna.fa
   fi
 fi
 
 mkdir -p $gbdbDir
 ln -f -s `pwd`/\$db.rna.fa $gbdbDir/seqNcbiRefSeq.rna.fa
 hgLoadSeq -drop -seqTbl=seqNcbiRefSeq -extFileTbl=extNcbiRefSeq \$db $gbdbDir/seqNcbiRefSeq.rna.fa
 
 hgLoadPsl \$db -table=ncbiRefSeqPsl process/\$asmId.\$db.psl.gz
 
+if [ -s process/\$asmId.rna.cds.gz ]; then
+  zcat process/\$asmId.rna.cds.gz egrep '[0-9]+\\.\\.[0-9]\\+' \\
+    pslMismatchGapToBed -cdsFile=stdin -db=\$db -ignoreQNamePrefix=X \\
+      process/\$asmId.\$db.psl.gz $dbTwoBit \\
+        \$db.rna.fa ncbiRefSeqGenomicDiff || true
+
+  bedToBigBed -type=bed9+ -tab -as=~/kent/src/hg/lib/txAliDiff.as \\
+    ncbiRefSeqGenomicDiff.bed process/\$db.chrom.sizes ncbiRefSeqGenomicDiff.bb
+  rm -f $gbdbDir/ncbiRefSeqGenomicDiff.bb
+  ln -s `pwd`/ncbiRefSeqGenomicDiff.bb $gbdbDir/ncbiRefSeqGenomicDiff.bb
+fi
+
 if [ -d "/usr/local/apache/htdocs-hgdownload/goldenPath/archive" ]; then
  gtfFile=`ls \$db.*.ncbiRefSeq.gtf.gz`
  mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/archive/\$db/ncbiRefSeq
  rm -f /usr/local/apache/htdocs-hgdownload/goldenPath/archive/\$db/ncbiRefSeq/\$gtfFile
  ln -s `pwd`/\$db.*.ncbiRefSeq.gtf.gz \\
    /usr/local/apache/htdocs-hgdownload/goldenPath/archive/\$db/ncbiRefSeq/
 fi
 
 featureBits \$db ncbiRefSeq > fb.ncbiRefSeq.\$db.txt 2>&1
 cat fb.ncbiRefSeq.\$db.txt 2>&1
 _EOF_
     );
   }	#	if ($dbExists)
   $bossScript->execute();
 } # doLoad