4a157fbcc40beb3cd5279d221dac018d353ec78f
hiram
  Tue Oct 1 13:57:48 2024 -0700
genome-source has now become githubusercontent refs #34449

diff --git src/hg/utils/automation/doSameSpeciesLiftOver.pl src/hg/utils/automation/doSameSpeciesLiftOver.pl
index ad3668e..aa596b1 100755
--- src/hg/utils/automation/doSameSpeciesLiftOver.pl
+++ src/hg/utils/automation/doSameSpeciesLiftOver.pl
@@ -12,82 +12,92 @@
 use lib "$Bin";
 use HgAutomate;
 use HgRemoteScript;
 use HgStepManager;
 
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_buildDir
     $opt_ooc
     $opt_target2Bit
     $opt_targetSizes
     $opt_query2Bit
     $opt_querySizes
+    $opt_chainRam
+    $opt_chainCpu
     $opt_localTmp
-    $opt_ram
     /;
 
 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
     [ { name => 'align',   func => \&doAlign },
       { name => 'chain',   func => \&doChain },
       { name => 'net',     func => \&doNet },
       { name => 'load',    func => \&doLoad },
       { name => 'cleanup', func => \&doCleanup },
     ]
 				);
 
 # Option defaults:
 my $dbHost = 'hgwdev';
+my $ramG = '4g';
+my $cpu = 1;
+my $blatRam = '4g';	# -ram=Ng argument
+my $blatCpu = 1;	# -cpu=N argument
+my $chainRam = '16g';	# -chainRam=Ng argument
+my $chainCpu = 1;	# -chainCpu=N argument
 
 # This could be made into an option:
 # BLAT -fastMap will not work with query chunks greater than 5000
 my $splitSize = '5000';  
 my $splitOverlap = '500';  
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
   print STDERR "
 usage: $base fromDb toDb
 options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
     -buildDir dir         Use dir instead of default
                           $HgAutomate::clusterData/\$fromDb/$HgAutomate::trackBuild/blat.\$toDb.\$date
                           (necessary when continuing at a later date).
     -ooc /path/11.ooc     Use this instead of the default
                           /hive/data/genomes/fromDb/11.ooc
                           Can be "none".
     -target2Bit /path/target.2bit  Full path to target sequence (fromDb)
     -query2Bit /path/query.2bit    Full path to query sequence (toDb)
     -targetSizes /path/target.chrom.sizes  Full path to target chrom.sizes (fromDb)
     -querySizes  /path/query.chrom.sizes   Full path to query chrom.sizes (toDb)
+    -chainRam  Ng  Cluster ram size for chain step, default: -chainRam=$chainRam
+    -chainCpu  N   Cluster CPUs number for chain step, default: -chainCpu=$chainCpu
     -localTmp  /dev/shm  Full path to temporary storage for heavy I/O usage
-    -ram Ng               set -ram=Ng argument to para create command (default 8g)
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
 						'workhorse' => '',
 						'fileServer' => '',
+						'ram' => $ramG,
+						'cpu' => $cpu,
 						'bigClusterHub' => '');
   print STDERR "
 Automates UCSC's same-species liftOver (blat/chain/net) pipeline, based on
 Kate's suite of makeLo-* scripts:
     align: Aligns the assemblies using blat -fastMap on a big cluster.
     chain: Chains the alignments on a big cluster.
     net:   Nets the alignments, uses netChainSubset to extract liftOver chains.
     load:  Installs liftOver chain files, calls hgAddLiftOverChain on $dbHost.
     cleanup: Removes or compresses intermediate files.
 All operations are performed in the build directory which is
 $HgAutomate::clusterData/\$fromDb/$HgAutomate::trackBuild/blat.\$toDb.\$date unless -buildDir is given.
 ";
   # Detailed help (-help):
   print STDERR "
 Assumptions:
@@ -110,32 +120,33 @@
 
 # Other:
 my ($buildDir);
 my ($tSeq, $tSizes, $qSeq, $qSizes, $QDb, $fileServer);
 my ($liftOverChainDir, $liftOverChainFile, $liftOverChainPath, $dbExists);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
 		      'buildDir=s',
 		      'ooc=s',
 		      'target2Bit=s',
 		      'targetSizes=s',
 		      'query2Bit=s',
 		      'querySizes=s',
+		      'chainRam=s',
+		      'chainCpu=i',
 		      'localTmp=s',
-		      'ram=s',
 		      @HgAutomate::commonOptionSpec,
 		      );
   &usage(1) if (!$ok);
   &usage(0, 1) if ($opt_help);
   &HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $dbHost = $opt_dbHost if ($opt_dbHost);
   $localTmp = $opt_localTmp ? $opt_localTmp : $localTmp;
 }
 
 
 sub getClusterSeqs {
   # Choose cluster and look for already-installed 2bit files on appropriate
   # cluster-scratch storage.  Exit with an error message if we can't find them.
@@ -184,34 +195,30 @@
         join("/, ", @okFilesystems) . "/ -- please distribute.\n";
     }
   }
   &HgAutomate::verbose(1, "Using $paraHub, $tSeqScratch and $qSeqScratch\n");
   return ($paraHub, $tSeqScratch, $qSeqScratch);
 } # getClusterSeqs
 
 
 
 #########################################################################
 # * step: align [bigClusterHub]
 sub doAlign {
   my $runDir = "$buildDir/run.blat";
   &HgAutomate::mustMkdir($runDir);
 
-  my $ramG = "-ram=8g";
-  if ($opt_ram) {
-     $ramG = "-ram=$opt_ram";
-  }
   my $ooc = "/hive/data/genomes/$tDb/11.ooc";
   if ($opt_ooc) {
     if ($opt_ooc eq 'none') {
       $ooc = "";
     } else {
       $ooc = "$opt_ooc";
     }
   }
   my $dashOoc = "-ooc=$ooc";
   my $pslDir = "$runDir/psl";
 
   &HgAutomate::checkCleanSlate('align', 'chain', $pslDir, 'run.time');
 
   my ($paraHub, $tSeqScratch, $qSeqScratch) = &getClusterSeqs();
   if (! &HgAutomate::machineHasFile($paraHub, $ooc)) {
@@ -282,30 +289,31 @@
 
 # Move final result into place:
 mv tmpOut.psl \$outPsl
 
 popd
 rm -rf \$tmpDir
 _EOF_
   ;
   close($fh);
   &HgAutomate::run("chmod a+x $runDir/job.csh");
 
   &HgAutomate::makeGsub($runDir,
 			'job.csh $(path1) $(path2) {check out line ' .
 			 $pslDir . '/$(file1)/$(file2).psl}');
 
+  my $paraRun = &HgAutomate::paraRun($blatRam, $blatCpu);
   my $whatItDoes = "It performs a cluster run of blat -fastMap.";
   my $bossScript = new HgRemoteScript("$runDir/doAlign.csh", $paraHub,
 				      $runDir, $whatItDoes);
 
   # Don't allow target sequences to be split because we don't lift them
   # nor do we cat them before chaining.  Use the max target seq size
   # as the chunkSize for partitionSequence.pl on the target.
   my $tpSize = `awk '{print \$2;}' $tSizes | sort -nr | head -1`;
   chomp $tpSize;
   # However, $tDb might be a collection of zillions of tiny scaffolds.
   # So to ensure reasonable cluster batch size, make sure that chunkSize
   # is at least 10,000,000 for the target.
   my $minTpSize = 10000000;
   $tpSize = $minTpSize if ($tpSize < $minTpSize);
 
@@ -318,35 +326,31 @@
 $Bin/partitionSequence.pl $tpSize 0 $tSeqScratch \\
    $tSizes 2000 \\
   -lstDir=tParts > t.lst
 rm -rf qParts
 $Bin/partitionSequence.pl 10000000 0 $qSeqScratch \\
   $qSizes 1000 \\
   -lstDir=qParts > q.lst
 
 mkdir $pslDir
 foreach f (`cat t.lst`)
   mkdir $pslDir/\$f:t
 end
 
 $gensub2 t.lst q.lst gsub jobList
 
-/parasol/bin/para $ramG make jobList
-/parasol/bin/para check
-/parasol/bin/para time > run.time
-cat run.time
-
+$paraRun
 _EOF_
   );
   $bossScript->execute();
 } # doAlign
 
 
 #########################################################################
 # * step: chain [smallClusterHub]
 
 sub makePslPartsLst {
   # $pslDir/$tPart/ files look like either
   #   part006.lst.psl --> make this into a single job.
   #   $qDb.2bit:$seq:$start-$end.psl --> cat $qDb.2bit:$seq:*.psl into a job.
   # ==> Make a list of any part*.lst.psl plus collapsed roots
   #     (one $qDb.2bit:$seq: per $seq).
@@ -425,31 +429,31 @@
 | chainBridge -linearGap=medium stdin $tSeqScratch $qSeqScratch \\
     \$tmpOut
 mv \$tmpOut \$outChain
 chmod 664 \$outChain
 _EOF_
   ;
   close($fh);
   &HgAutomate::run("chmod a+x $runDir/job.csh");
 
   &HgAutomate::makeGsub($runDir,
 			'job.csh $(path1) ' .
 			'{check out line+ chainRaw/$(path1).chain}');
   my $whatItDoes = "It does a cluster run to chain the blat alignments.";
   my $bossScript = new HgRemoteScript("$runDir/doChain.csh", $paraHub,
 				      $runDir, $whatItDoes);
-  my $paraRun = &HgAutomate::paraRun();
+  my $paraRun = &HgAutomate::paraRun($chainRam, $chainCpu);
   my $gensub2 = &HgAutomate::gensub2();
   $bossScript->add(<<_EOF_
 mkdir chainRaw
 foreach d ($pslDir/*)
   mkdir chainRaw/\$d:t
 end
 
 $gensub2 pslParts.lst single gsub jobList
 $paraRun
 _EOF_
   );
   $bossScript->execute();
 } # doChain
 
 
@@ -541,32 +545,32 @@
 ln -s $liftOverChainPath $HgAutomate::goldenPath/$tDb/liftOver/
 
 # Link from genome browser fileserver:
 mkdir -p $HgAutomate::gbdb/$tDb/liftOver
 rm -f $HgAutomate::gbdb/$tDb/liftOver/$liftOverChainFile
 ln -s $liftOverChainPath $HgAutomate::gbdb/$tDb/liftOver/
 
 # Add an entry to liftOverChain table in central database (specified in
 # ~/.hg.conf) so that hgLiftOver will know that this is available:
 hgAddLiftOverChain $tDb $qDb
 _EOF_
     );
   } else {
     $bossScript->add(<<_EOF_
 hgLoadChain -test -noBin -tIndex $tDb chain$QDb $buildDir/$liftOverChainFile
-wget -O bigChain.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/bigChain.as'
-wget -O bigLink.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/bigLink.as'
+wget --no-check-certificate -O bigChain.as 'https://raw.githubusercontent.com/ucscGenomeBrowser/kent/refs/heads/master/src/hg/lib/bigChain.as'
+wget --no-check-certificate -O bigLink.as 'https://raw.githubusercontent.com/ucscGenomeBrowser/kent/refs/heads/master/src/hg/lib/bigLink.as'
 sed 's/.000000//' chain.tab | awk 'BEGIN {OFS="\\t"} {print \$2, \$4, \$5, \$11, 1000, \$8, \$3, \$6, \$7, \$9, \$10, \$1}' > chain${QDb}.tab
 bedToBigBed -type=bed6+6 -as=bigChain.as -tab chain${QDb}.tab $tSizes chain${QDb}.bb
 awk 'BEGIN {OFS="\\t"} {print \$1, \$2, \$3, \$5, \$4}' link.tab | sort -k1,1 -k2,2n > chain${QDb}Link.tab
 bedToBigBed -type=bed4+1 -as=bigLink.as -tab chain${QDb}Link.tab $tSizes chain${QDb}Link.bb
 set totalBases = `ave -col=2 $tSizes | grep "^total" | awk '{printf "%d", \$2}'`
 set basesCovered = `bedSingleCover.pl chain${QDb}Link.tab | ave -col=4 stdin | grep "^total" | awk '{printf "%d", \$2}'`
 set percentCovered = `echo \$basesCovered \$totalBases | awk '{printf "%.3f", 100.0*\$1/\$2}'`
 printf "%d bases of %d (%s%%) in intersection\\n" "\$basesCovered" "\$totalBases" "\$percentCovered" > fb.$tDb.chain.${QDb}Link.txt
 rm -f link.tab chain.tab bigChain.as bigLink.as chain${QDb}.tab chain${QDb}Link.tab
 _EOF_
     );
   }
   $bossScript->execute();
 } # doLoad
 
@@ -656,30 +660,34 @@
 
 &checkOptions();
 
 &usage(1) if (scalar(@ARGV) != 2);
 ($tDb, $qDb) = @ARGV;
 
 # may be working on a 2bit file that does not have a database browser
 $dbExists = 0;
 $dbExists = 1 if (&HgAutomate::databaseExists($dbHost, $tDb));
 
 &getSeqAndSizes();
 $QDb = ucfirst($qDb);
 $liftOverChainDir = "$HgAutomate::clusterData/$tDb/$HgAutomate::trackBuild/liftOver";
 $liftOverChainFile = "${tDb}To${QDb}.over.chain.gz";
 $liftOverChainPath = "$liftOverChainDir/$liftOverChainFile";
+$chainRam = $opt_chainRam ? $opt_chainRam : $chainRam;
+$chainCpu = $opt_chainCpu ? $opt_chainCpu : $chainCpu;
+$blatRam = $opt_ram ? $opt_ram : $blatRam;
+$blatCpu = $opt_cpu ? $opt_cpu : $blatCpu;
 
 my $date = `date +%Y-%m-%d`;
 chomp $date;
 $buildDir = $opt_buildDir ? $opt_buildDir :
   "$HgAutomate::clusterData/$tDb/$HgAutomate::trackBuild/blat.$qDb.$date";
 
 if (! -d $buildDir) {
   if ($stepper->stepPrecedes('align', $stepper->getStartStep())) {
     die "$buildDir does not exist; try running again with -buildDir.\n";
   }
   &HgAutomate::mustMkdir($buildDir);
 }
 
 $stepper->execute();