deeb37cb48db2bb239792b0ed59889eeb271b0d6 hiram Wed Jan 8 17:10:16 2020 -0800
    do not actually need the targetSizes option, get chrom.sizes from target2bit refs #23891

diff --git src/hg/utils/automation/doNcbiRefSeq.pl src/hg/utils/automation/doNcbiRefSeq.pl
index a139ab7..0d9fdc5 100755
--- src/hg/utils/automation/doNcbiRefSeq.pl
+++ src/hg/utils/automation/doNcbiRefSeq.pl
@@ -22,31 +22,30 @@
 my $gff3ToRefLink = "$Bin/gff3ToRefLink.pl";
 my $gbffToCds = "$Bin/gbffToCds.pl";
 my $ncbiRefSeqOtherIxIxx = "$Bin/ncbiRefSeqOtherIxIxx.pl";
 my $ncbiRefSeqOtherAttrs = "$Bin/ncbiRefSeqOtherAttrs.pl";

 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
    $opt_buildDir
    $opt_genbank
    $opt_subgroup
    $opt_species
    $opt_liftFile
    $opt_target2bit
-    $opt_targetSizes
    $opt_toGpWarnOnly
    /;

 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
    [ { name => 'download', func => \&doDownload },
      { name => 'process', func => \&doProcess },
      { name => 'load', func => \&doLoad },
      { name => 'cleanup', func => \&doCleanup },
    ]
 );

 # Option defaults:
 my $dbHost = 'hgwdev';
 my $bigClusterHub = 'ku';
@@ -76,80 +75,77 @@
 db - database to load with track tables

 options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
    -buildDir dir         Use dir instead of default
                          $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/ncbiRefSeq.\$date
                          (necessary when continuing at a later date).
    -toGpWarnOnly         add -warnAndContinue to the gff3ToGenePred operation
                          to avoid gene definitions that will not convert
    -liftFile pathName    a lift file to translate NCBI names to local genome names
    -target2bit pathName  when not a local UCSC genome, use this 2bit file for
                          the target genome sequence
-    -targetSizes pathName when not a local UCSC genome, use this file for
-                          target genome chrom.sizes file
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
                                 'workhorse' => $defaultWorkhorse,
                                 'fileServer' => $defaultFileServer,
                                 'bigClusterHub' => $bigClusterHub,
                                 'smallClusterHub' => $smallClusterHub);
   print STDERR "
 Automates UCSC's ncbiRefSeq track build.  Steps:
    download: symlink local files or rsync required files from NCBI FTP site
    process: generate the track data files from the download data
    load: load the processed data into database tables
    cleanup: Removes or compresses intermediate files.
 All operations are performed in the build directory which is
 $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/ncbiRefSeq.\$date unless -buildDir is given.
 Expects to find already done idKeys procedure and result file in:
    /hive/data/genomes/<db>/bed/idKeys/<db>.idKeys.txt
 See also: doIdKeys.pl command.
 ";
   # Detailed help (-help):
   print STDERR "
 Assumptions:
 1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for
    database/assembly \$db.
 2. $HgAutomate::clusterData/\$db/chrom.sizes contains all sequence names and sizes
    from \$db.2bit.
-NOTE: Override these assumptions with -target2Bit and -targetSizes options
+NOTE: Override these assumptions with the -target2Bit option
 " if ($detailed);
   print "\n";
   exit $status;
 }

 # Globals:
 # Command line args: genbankRefseq subGroup species asmId db
 my ($genbankRefseq, $subGroup, $species, $asmId, $db, $ftpDir);
 # Other:
-my ($buildDir, $toGpWarnOnly, $dbExists, $liftFile, $target2bit, $targetSizes);
+my ($buildDir, $toGpWarnOnly, $dbExists, $liftFile, $target2bit);
 my ($secondsStart, $secondsEnd);

 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
                       'buildDir=s',
                       'liftFile=s',
                       'target2bit=s',
-                      'targetSizes=s',
                       'toGpWarnOnly',
                       @HgAutomate::commonOptionSpec,
                       );
   &usage(1) if (!$ok);
   &usage(0, 1) if ($opt_help);
   &HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $dbHost = $opt_dbHost if ($opt_dbHost);
   $workhorse = $opt_workhorse if ($opt_workhorse);
   $bigClusterHub = $opt_bigClusterHub if ($opt_bigClusterHub);
   $smallClusterHub = $opt_smallClusterHub if ($opt_smallClusterHub);
   $fileServer = $opt_fileServer if ($opt_fileServer);
 }
@@ -194,31 +190,30 @@
 _EOF_
   );
   } else {
   $bossScript->add(<<_EOF_
 # local file copies do not exist, download from NCBI:
 for F in _rna.gbff _rna.fna _protein.faa _genomic.gff
 do
   rsync -a -P \\
     rsync://ftp.ncbi.nlm.nih.gov/\$ftpDir/\$asmId\${F}.gz ./
 done
 _EOF_
   );
 }
-printf STDERR "# checking $local2Bit\n";
 if ( -s $local2Bit ) {
   $bossScript->add(<<_EOF_
 ln -f -s $local2Bit .
 _EOF_
   );
 } elsif ( -s "$outsideCopy/${asmId}_genomic.fna.gz") {
   $bossScript->add(<<_EOF_
 # build \$asmId.ncbi.2bit from local copy of genomic fasta
 faToTwoBit \$outsideCopy/\${asmId}_genomic.fna.gz \${asmId}.ncbi.2bit
 _EOF_
   );
 } else {
   $bossScript->add(<<_EOF_
 # download genomic fasta and build \${asmId}.ncbi.2bit
@@ -236,57 +231,58 @@
 # check if db.ucscToRefSeq exists?
     my $sql = "show tables like 'ucscToRefSeq';";
     my $result = `hgsql $db -BN -e "$sql"`;
     chomp $result;
     if ( $result =~ m/ucscToRefSeq/ ) {
       $haveLiftFile = 1;
       $bossScript->add(<<_EOF_
 cd \$runDir
 hgsql -N -e 'select 0,name,chromEnd,chrom,chromEnd from ucscToRefSeq;' \${db} \\
   > \${asmId}To\${db}.lift
 _EOF_
       );
     }
   }
-  my $hostSizes = "$HgAutomate::clusterData/\$db/chrom.sizes";
-  $hostSizes = $targetSizes if (-s "$targetSizes");
+  my $dbTwoBit = "$HgAutomate::clusterData/$db/$db.2bit";
+  $dbTwoBit = $target2bit if (-s "$target2bit");
   if (! $haveLiftFile ) {
     $bossScript->add(<<_EOF_
 # generate idKeys for the NCBI sequence to translate names to UCSC equivalents
 rm -rf \$runDir/idKeys
 mkdir -p \$runDir/idKeys
 cd \$runDir/idKeys
 time ($doIdKeys -buildDir=\$runDir/idKeys -twoBit=\$runDir/\$asmId.ncbi.2bit \$db) > idKeys.log 2>&1
 cd \$runDir
+twoBitInfo $dbTwoBit stdout | sort -k2,2nr > \$db.chrom.sizes
 ln -f -s idKeys/\$db.idKeys.txt ./ncbi.\$asmId.idKeys.txt
 ln -f -s /hive/data/genomes/\$db/bed/idKeys/\$db.idKeys.txt ./ucsc.\$db.idKeys.txt
 # joining the idKeys establishes a lift file to translate chrom names
 join -t\$'\\t' ucsc.\$db.idKeys.txt ncbi.\$asmId.idKeys.txt \\
   | cut -f2-3 | sort \\
-    | join -t\$'\\t' - <(sort $hostSizes) \\
+    | join -t\$'\\t' - <(sort \$db.chrom.sizes) \\
       | awk -F\$'\\t' '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$2, \$3, \$1, \$3}' \\
         | sort -k3nr > \${asmId}To\${db}.lift
 _EOF_
     );
   }
   $bossScript->add(<<_EOF_
 cd \$runDir
-twoBitInfo \$asmId.ncbi.2bit stdout | sort -k2nr > $hostSizes
+twoBitInfo \$asmId.ncbi.2bit stdout | sort -k2nr > \$asmId.ncbi.chrom.sizes
 zcat \${asmId}_rna.fna.gz | sed -e 's/ .*//;' | gzip -c > \$asmId.rna.fa.gz
 faSize -detailed \${asmId}.rna.fa.gz | sort -k2nr > rna.sizes
 # genbank processor extracts infomation about the RNAs
 /hive/data/outside/genbank/bin/x86_64/gbProcess \\
     -inclXMs /dev/null \$asmId.raFile.txt \${asmId}_rna.gbff.gz
 _EOF_
   );
   $bossScript->execute();
 } # doDownload

 #########################################################################
 # * step: process [dbHost]
 sub doProcess {
   my $runDir = "$buildDir/process";
@@ -295,42 +291,39 @@
   my $whatItDoes = "process NCBI download files into track files.";
   # Use dbHost since genePredCheck -db=$db needs database
   my $bossScript = newBash HgRemoteScript("$runDir/doProcess.bash", $dbHost,
                                           $runDir, $whatItDoes);

   my $genePredCheckDb = "genePredCheck -db=\$db";
   if (! $dbExists) {
     $genePredCheckDb = "genePredCheck";
   }

   my $warnOnly = "";
   $warnOnly = "-warnAndContinue" if ($toGpWarnOnly);

   my $localLiftFile = "\$downloadDir/\${asmId}To\${db}.lift";
   $localLiftFile = $liftFile if (-s $liftFile);
-  my $hostSizes = "$HgAutomate::clusterData/\$db/chrom.sizes";
-  my $pslTargetSizes = "-targetSizes=$hostSizes -db=\$db";
-  if (-s "$targetSizes") {
-    $hostSizes = $targetSizes;
-    $pslTargetSizes = "-targetSizes=$hostSizes";
+  my $pslTargetSizes = "-db=\$db";
+  my $fakePslSizes = "";
+  if (-s "$target2bit") {
+    $pslTargetSizes = "-targetSizes=\$db.chrom.sizes";
+    $fakePslSizes = "-chromSize=\$db.chrom.sizes";
   }
-  my $fakePslSizes = $targetSizes ? "-chromSize=$targetSizes" : "";
   my $dbTwoBit = "$HgAutomate::clusterData/$db/$db.2bit";
   $dbTwoBit = $target2bit if (-s "$target2bit");
-printf STDERR "# DBG dbTwoBit: '%s'\n", $dbTwoBit;
-printf STDERR "# DBG: target2bit: '%s'\n", $target2bit;
   $bossScript->add(<<_EOF_
 # establish all variables to use here
 export asmId=$asmId
 export downloadDir=$downloadDir
 export ncbiGffGz=\$downloadDir/\${asmId}_genomic.gff.gz
 export db=$db
 export gff3ToRefLink=$gff3ToRefLink
 export gbffToCds=$gbffToCds
 export annotationRelease=`zcat \$ncbiGffGz | head -100 | grep ^#.annotation-source | sed -e 's/.*annotation-source //'`
 if [ "\$annotationRelease" == "" ]; then
   export annotationRelease=\$asmId
 fi
@@ -380,47 +373,48 @@
 if [ -s \$db.curated.gp ]; then
   $genePredCheckDb \$db.curated.gp
 fi
 if [ -s \$db.predicted.gp ]; then
   $genePredCheckDb \$db.predicted.gp
 fi
 if [ -s \$db.other.gp ]; then
   $genePredCheckDb \$db.other.gp
 fi

 # join the refLink metadata with curated+predicted names
 cut -f1 \$db.ncbiRefSeq.gp | sort -u > \$asmId.\$db.name.list
 join -t\$'\\t' \$asmId.\$db.name.list \$asmId.refLink.tab > \$asmId.\$db.ncbiRefSeqLink.tab

 # Make bigBed with attributes in extra columns for ncbiRefSeqOther:
+twoBitInfo $dbTwoBit stdout | sort -k2,2n > \$db.chrom.sizes
 genePredToBed \$db.other.gp stdout | sort -k1,1 -k2n,2n > \$db.other.bed
 $ncbiRefSeqOtherAttrs \$db.other.bed \$asmId.attrs.txt > \$db.other.extras.bed
 bedToBigBed -type=bed12+13 -as=ncbiRefSeqOther.as -tab \\
   -extraIndex=name \\
-    \$db.other.extras.bed $hostSizes \$db.other.bb
+    \$db.other.extras.bed \$db.chrom.sizes \$db.other.bb

 # Make trix index for ncbiRefSeqOther
 $ncbiRefSeqOtherIxIxx \\
   ncbiRefSeqOther.as \$db.other.extras.bed > ncbiRefSeqOther.ix.tab
 ixIxx ncbiRefSeqOther.ix.tab ncbiRefSeqOther.ix{,x}

 # PSL data will be loaded into a psl type track to show the alignments
 (zgrep "^#" \$ncbiGffGz | head || true) > gffForPsl.gff
 (zegrep -v "NG_" \$ncbiGffGz || true) \\
   | awk -F\$'\\t' '\$3 == "cDNA_match" || \$3 == "match"' >> gffForPsl.gff
-gff3ToPsl -dropT \$downloadDir/\$asmId.chrom.sizes \$downloadDir/rna.sizes \\
+gff3ToPsl -dropT \$downloadDir/\$asmId.ncbi.chrom.sizes \$downloadDir/rna.sizes \\
     gffForPsl.gff stdout | pslPosTarget stdin \$asmId.psl
 simpleChain -outPsl -maxGap=300000 \$asmId.psl stdout | pslSwap stdin stdout \\
   | liftUp -type=.psl stdout $localLiftFile drop stdin \\
     | gzip -c > \$db.psl.gz
 pslCheck $pslTargetSizes \\
   -querySizes=\$downloadDir/rna.sizes \$db.psl.gz

 # extract RNA CDS information from genbank record
 # Note: $asmId.raFile.txt could be used instead of _rna.gbff.gz
 \$gbffToCds \$downloadDir/\${asmId}_rna.gbff.gz | sort > \$asmId.rna.cds

 # the NCBI _genomic.gff.gz file only contains cDNA_match records for transcripts
 # that do not *exactly* match the reference genome.  For all other transcripts
 # construct 'fake' PSL records representing the alignments of all cDNAs
 # that would be perfect matches to the reference genome.  The pslFixCdsJoinGap
@@ -464,54 +458,52 @@
   my $genePredCheckDb = "genePredCheck -db=\$db";
   if (! $dbExists) {
     $genePredCheckDb = "genePredCheck";
   }

   $bossScript->add(<<_EOF_
 # establish all variables to use here
 export db=$db
 export asmId=$asmId
 _EOF_
   );
   if (! $dbExists) {
-    my $hostSizes = "$HgAutomate::clusterData/\$db/chrom.sizes";
-    $hostSizes = $targetSizes if (-s "$targetSizes");
     $bossScript->add(<<_EOF_
 export target2bit=$dbTwoBit
-export targetSizes=$hostSizes
+twoBitInfo \$target2bit stdout | sort -k2,2nr > \$db.chrom.sizes

 wget -O bigGenePred.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/bigGenePred.as'
 wget -O bigPsl.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/bigPsl.as'

 genePredToBigGenePred process/\$db.ncbiRefSeq.gp stdout | sort -k1,1 -k2,2n > \$db.ncbiRefSeq.bigGp
 bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as \\
-  \$db.ncbiRefSeq.bigGp \$targetSizes \\
+  \$db.ncbiRefSeq.bigGp \$db.chrom.sizes \\
     \$db.ncbiRefSeq.bb

 if [ -s process/\$db.curated.gp ]; then
   genePredToBigGenePred process/\$db.curated.gp stdout | sort -k1,1 -k2,2n > \$db.ncbiRefSeqCurated.bigGp
   bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as \\
-    \$db.ncbiRefSeqCurated.bigGp \$targetSizes \\
+    \$db.ncbiRefSeqCurated.bigGp \$db.chrom.sizes \\
       \$db.ncbiRefSeqCurated.bb
   rm -f \$db.ncbiRefSeqCurated.bigGp
 fi

 if [ -s process/\$db.predicted.gp ]; then
   genePredToBigGenePred process/\$db.predicted.gp stdout | sort -k1,1 -k2,2n > \$db.ncbiRefSeqPredicted.bigGp
   bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as \\
-    \$db.ncbiRefSeqPredicted.bigGp \$targetSizes \\
+    \$db.ncbiRefSeqPredicted.bigGp \$db.chrom.sizes \\
       \$db.ncbiRefSeqPredicted.bb
   rm -f \$db.ncbiRefSeqPredicted.bigGp
 fi

 if [ -s "process/\$db.other.bb" ]; then
   ln -f -s process/\$db.other.bb \$db.ncbiRefSeqOther.bb
 fi
 if [ -s "process/ncbiRefSeqOther.ix" ]; then
   ln -f -s process/ncbiRefSeqOther.ix ./\$db.ncbiRefSeqOther.ix
   ln -f -s process/ncbiRefSeqOther.ixx ./\$db.ncbiRefSeqOther.ixx
 fi
 ln -f -s process/ncbiRefSeqVersion.txt ./\$db.ncbiRefSeqVersion.txt

 # select only coding genes to have CDS records
 awk -F" " '\$6 != \$7 {print \$1;}' process/\$db.ncbiRefSeq.gp \\
   | sort -u > coding.cds.name.list
@@ -544,42 +536,42 @@
 comm -13 \$db.rna.found.list \$db.gp.name.list > \$db.noRna.available.list

 # If \$db.noRna.available.list is not empty but items are on chrM,
 # make fake cDNA sequence for them using chrM sequence
 # since NCBI puts proteins, not coding RNAs, in the GFF.
 if [ -s \$db.noRna.available.list ]; then
   pslCat -nohead process/\$asmId.\$db.psl.gz \\
     | grep -Fwf \$db.noRna.available.list \\
       | grep chrM > missingChrMFa.psl
   if [ -s missingChrMFa.psl ]; then
     pslToBed missingChrMFa.psl stdout \\
       | twoBitToFa -bed=stdin \$target2bit stdout >> \$db.rna.fa
   fi
 fi

-export totalBases=`ave -col=2 \$targetSizes | grep "^total" | awk '{printf "%d", \$2}'`
+export totalBases=`ave -col=2 \$db.chrom.sizes | grep "^total" | awk '{printf "%d", \$2}'`
 export basesCovered=`bedSingleCover.pl \$db.ncbiRefSeq.bigGp | ave -col=4 stdin | grep "^total" | awk '{printf "%d", \$2}'`
 export percentCovered=`echo \$basesCovered \$totalBases | awk '{printf "%.3f", 100.0*\$1/\$2}'`
 printf "%d bases of %d (%s%%) in intersection\\n" "\$basesCovered" \\
   "\$totalBases" "\$percentCovered" > fb.ncbiRefSeq.\$db.txt
 rm -f \$db.ncbiRefSeq.bigGp

 pslToBigPsl -fa=download/\$asmId.rna.fa.gz -cds=process/\$asmId.rna.cds \\
   process/\$asmId.\$db.psl.gz stdout | sort -k1,1 -k2,2n > \$asmId.\$db.bigPsl
 bedToBigBed -type=bed12+13 -tab -as=bigPsl.as \\
-  \$asmId.\$db.bigPsl \$targetSizes \$asmId.\$db.bigPsl.bb
+  \$asmId.\$db.bigPsl \$db.chrom.sizes \$asmId.\$db.bigPsl.bb
 _EOF_
   );
   } else {
   $bossScript->add(<<_EOF_
 # loading the genePred tracks, all genes in one, and subsets
 hgLoadGenePred -genePredExt \$db ncbiRefSeq process/\$db.ncbiRefSeq.gp
 $genePredCheckDb ncbiRefSeq
 if [ -s process/\$db.curated.gp ]; then
   hgLoadGenePred -genePredExt \$db ncbiRefSeqCurated process/\$db.curated.gp
   $genePredCheckDb ncbiRefSeqCurated
 fi
 if [ -s process/\$db.predicted.gp ]; then
@@ -674,31 +666,30 @@
 #########################################################################
 # main

 # Prevent "Suspended (tty input)" hanging:
 &HgAutomate::closeStdin();

 # Make sure we have valid options and exactly 1 argument:
 &checkOptions();
 &usage(1) if (scalar(@ARGV) != 5);
 $toGpWarnOnly = 0;
 $toGpWarnOnly = 1 if ($opt_toGpWarnOnly);
 $liftFile = $opt_liftFile ? $opt_liftFile : "";
-$targetSizes = $opt_targetSizes ? $opt_targetSizes : "";
 $target2bit = $opt_target2bit ? $opt_target2bit : "";

 $secondsStart = `date "+%s"`;
 chomp $secondsStart;

 # expected command line arguments after options are processed
 ($genbankRefseq, $subGroup, $species, $asmId, $db) = @ARGV;
 $ftpDir = "genomes/$genbankRefseq/$subGroup/$species/all_assembly_versions/$asmId";

 if ( -z "$liftFile" && ! -s "/hive/data/genomes/$db/bed/idKeys/$db.idKeys.txt") {
   die "ERROR: can not find /hive/data/genomes/$db/bed/idKeys/$db.idKeys.txt\n\t need to run doIdKeys.pl for $db before this procedure.";
 }

 # Force debug and verbose until this is looking pretty solid:
 # $opt_debug = 1;
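
Net effect of this change: wherever the generated build scripts previously consumed a chrom.sizes file handed in through -targetSizes, they now derive an equivalent $db.chrom.sizes on the fly from the -target2bit sequence with twoBitInfo. A minimal shell sketch of that derivation and of an invocation after this change; the 2bit path, build directory, and the bracketed positional arguments below are hypothetical placeholders, not values taken from this commit:

    # derive a chrom.sizes file from the target 2bit, as the build scripts now do internally;
    # twoBitInfo emits "sequenceName<TAB>size", sorted here by size, largest first
    twoBitInfo /path/to/target.2bit stdout | sort -k2,2nr > target.chrom.sizes

    # example invocation: only -target2bit is supplied; the -targetSizes option no longer exists
    doNcbiRefSeq.pl -buildDir=/path/to/buildDir -target2bit=/path/to/target.2bit \
        <genbankRefseq> <subGroup> <species> <asmId> <db>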