4a157fbcc40beb3cd5279d221dac018d353ec78f hiram Tue Oct 1 13:57:48 2024 -0700
genome-source has now become githubusercontent refs #34449

diff --git src/hg/utils/automation/doSameSpeciesLiftOver.pl src/hg/utils/automation/doSameSpeciesLiftOver.pl
index ad3668e..aa596b1 100755
--- src/hg/utils/automation/doSameSpeciesLiftOver.pl
+++ src/hg/utils/automation/doSameSpeciesLiftOver.pl
@@ -12,82 +12,92 @@
 use lib "$Bin";
 use HgAutomate;
 use HgRemoteScript;
 use HgStepManager;
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_buildDir
     $opt_ooc
     $opt_target2Bit
     $opt_targetSizes
     $opt_query2Bit
     $opt_querySizes
+    $opt_chainRam
+    $opt_chainCpu
     $opt_localTmp
-    $opt_ram
     /;
 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
     [ { name => 'align',   func => \&doAlign },
       { name => 'chain',   func => \&doChain },
       { name => 'net',     func => \&doNet },
       { name => 'load',    func => \&doLoad },
       { name => 'cleanup', func => \&doCleanup },
     ]
     );
 # Option defaults:
 my $dbHost = 'hgwdev';
+my $ramG = '4g';
+my $cpu = 1;
+my $blatRam = '4g';     # -ram=Ng argument
+my $blatCpu = 1;        # -cpu=N argument
+my $chainRam = '16g';   # -chainRam=Ng argument
+my $chainCpu = 1;       # -chainCpu=N argument
 # This could be made into an option:
 # BLAT -fastMap will not work with query chunks greater than 5000
 my $splitSize = '5000';
 my $splitOverlap = '500';
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
   print STDERR "
 usage: $base fromDb toDb
 options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
     -buildDir dir         Use dir instead of default
                           $HgAutomate::clusterData/\$fromDb/$HgAutomate::trackBuild/blat.\$toDb.\$date
                           (necessary when continuing at a later date).
     -ooc /path/11.ooc     Use this instead of the default
                           /hive/data/genomes/fromDb/11.ooc
                           Can be "none".
     -target2Bit /path/target.2bit  Full path to target sequence (fromDb)
     -query2Bit /path/query.2bit    Full path to query sequence (toDb)
     -targetSizes /path/target.chrom.sizes  Full path to target chrom.sizes (fromDb)
     -querySizes /path/query.chrom.sizes    Full path to query chrom.sizes (toDb)
+    -chainRam Ng          Cluster ram size for chain step, default: -chainRam=$chainRam
+    -chainCpu N           Cluster CPUs number for chain step, default: -chainCpu=$chainCpu
     -localTmp /dev/shm    Full path to temporary storage for heavy I/O usage
-    -ram Ng               set -ram=Ng argument to para create command (default 8g)
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
                                                 'workhorse' => '',
                                                 'fileServer' => '',
+                                                'ram' => $ramG,
+                                                'cpu' => $cpu,
                                                 'bigClusterHub' => '');
   print STDERR "
 Automates UCSC's same-species liftOver (blat/chain/net) pipeline, based on
 Kate's suite of makeLo-* scripts:
     align: Aligns the assemblies using blat -fastMap on a big cluster.
     chain: Chains the alignments on a big cluster.
     net: Nets the alignments, uses netChainSubset to extract liftOver chains.
     load: Installs liftOver chain files, calls hgAddLiftOverChain on $dbHost.
     cleanup: Removes or compresses intermediate files.
 All operations are performed in the build directory which is
 $HgAutomate::clusterData/\$fromDb/$HgAutomate::trackBuild/blat.\$toDb.\$date unless -buildDir is given.
"; # Detailed help (-help): print STDERR " Assumptions: @@ -110,32 +120,33 @@ # Other: my ($buildDir); my ($tSeq, $tSizes, $qSeq, $qSizes, $QDb, $fileServer); my ($liftOverChainDir, $liftOverChainFile, $liftOverChainPath, $dbExists); sub checkOptions { # Make sure command line options are valid/supported. my $ok = GetOptions(@HgStepManager::optionSpec, 'buildDir=s', 'ooc=s', 'target2Bit=s', 'targetSizes=s', 'query2Bit=s', 'querySizes=s', + 'chainRam=s', + 'chainCpu=i', 'localTmp=s', - 'ram=s', @HgAutomate::commonOptionSpec, ); &usage(1) if (!$ok); &usage(0, 1) if ($opt_help); &HgAutomate::processCommonOptions(); my $err = $stepper->processOptions(); usage(1) if ($err); $dbHost = $opt_dbHost if ($opt_dbHost); $localTmp = $opt_localTmp ? $opt_localTmp : $localTmp; } sub getClusterSeqs { # Choose cluster and look for already-installed 2bit files on appropriate # cluster-scratch storage. Exit with an error message if we can't find them. @@ -184,34 +195,30 @@ join("/, ", @okFilesystems) . "/ -- please distribute.\n"; } } &HgAutomate::verbose(1, "Using $paraHub, $tSeqScratch and $qSeqScratch\n"); return ($paraHub, $tSeqScratch, $qSeqScratch); } # getClusterSeqs ######################################################################### # * step: align [bigClusterHub] sub doAlign { my $runDir = "$buildDir/run.blat"; &HgAutomate::mustMkdir($runDir); - my $ramG = "-ram=8g"; - if ($opt_ram) { - $ramG = "-ram=$opt_ram"; - } my $ooc = "/hive/data/genomes/$tDb/11.ooc"; if ($opt_ooc) { if ($opt_ooc eq 'none') { $ooc = ""; } else { $ooc = "$opt_ooc"; } } my $dashOoc = "-ooc=$ooc"; my $pslDir = "$runDir/psl"; &HgAutomate::checkCleanSlate('align', 'chain', $pslDir, 'run.time'); my ($paraHub, $tSeqScratch, $qSeqScratch) = &getClusterSeqs(); if (! &HgAutomate::machineHasFile($paraHub, $ooc)) { @@ -282,30 +289,31 @@ # Move final result into place: mv tmpOut.psl \$outPsl popd rm -rf \$tmpDir _EOF_ ; close($fh); &HgAutomate::run("chmod a+x $runDir/job.csh"); &HgAutomate::makeGsub($runDir, 'job.csh $(path1) $(path2) {check out line ' . $pslDir . '/$(file1)/$(file2).psl}'); + my $paraRun = &HgAutomate::paraRun($blatRam, $blatCpu); my $whatItDoes = "It performs a cluster run of blat -fastMap."; my $bossScript = new HgRemoteScript("$runDir/doAlign.csh", $paraHub, $runDir, $whatItDoes); # Don't allow target sequences to be split because we don't lift them # nor do we cat them before chaining. Use the max target seq size # as the chunkSize for partitionSequence.pl on the target. my $tpSize = `awk '{print \$2;}' $tSizes | sort -nr | head -1`; chomp $tpSize; # However, $tDb might be a collection of zillions of tiny scaffolds. # So to ensure reasonable cluster batch size, make sure that chunkSize # is at least 10,000,000 for the target. my $minTpSize = 10000000; $tpSize = $minTpSize if ($tpSize < $minTpSize); @@ -318,35 +326,31 @@ $Bin/partitionSequence.pl $tpSize 0 $tSeqScratch \\ $tSizes 2000 \\ -lstDir=tParts > t.lst rm -rf qParts $Bin/partitionSequence.pl 10000000 0 $qSeqScratch \\ $qSizes 1000 \\ -lstDir=qParts > q.lst mkdir $pslDir foreach f (`cat t.lst`) mkdir $pslDir/\$f:t end $gensub2 t.lst q.lst gsub jobList -/parasol/bin/para $ramG make jobList -/parasol/bin/para check -/parasol/bin/para time > run.time -cat run.time - +$paraRun _EOF_ ); $bossScript->execute(); } # doAlign ######################################################################### # * step: chain [smallClusterHub] sub makePslPartsLst { # $pslDir/$tPart/ files look like either # part006.lst.psl --> make this into a single job. 
   #   $qDb.2bit:$seq:$start-$end.psl --> cat $qDb.2bit:$seq:*.psl into a job.
   # ==> Make a list of any part*.lst.psl plus collapsed roots
   #     (one $qDb.2bit:$seq: per $seq).
@@ -425,31 +429,31 @@
   | chainBridge -linearGap=medium stdin $tSeqScratch $qSeqScratch \\
     \$tmpOut
 mv \$tmpOut \$outChain
 chmod 664 \$outChain
 _EOF_
   ;
   close($fh);
   &HgAutomate::run("chmod a+x $runDir/job.csh");
   &HgAutomate::makeGsub($runDir, 'job.csh $(path1) ' . '{check out line+ chainRaw/$(path1).chain}');
   my $whatItDoes = "It does a cluster run to chain the blat alignments.";
   my $bossScript = new HgRemoteScript("$runDir/doChain.csh", $paraHub, $runDir, $whatItDoes);
-  my $paraRun = &HgAutomate::paraRun();
+  my $paraRun = &HgAutomate::paraRun($chainRam, $chainCpu);
   my $gensub2 = &HgAutomate::gensub2();
   $bossScript->add(<<_EOF_
 mkdir chainRaw
 foreach d ($pslDir/*)
   mkdir chainRaw/\$d:t
 end
 $gensub2 pslParts.lst single gsub jobList
 $paraRun
 _EOF_
   );
   $bossScript->execute();
 } # doChain
@@ -541,32 +545,32 @@
 ln -s $liftOverChainPath $HgAutomate::goldenPath/$tDb/liftOver/
 # Link from genome browser fileserver:
 mkdir -p $HgAutomate::gbdb/$tDb/liftOver
 rm -f $HgAutomate::gbdb/$tDb/liftOver/$liftOverChainFile
 ln -s $liftOverChainPath $HgAutomate::gbdb/$tDb/liftOver/
 # Add an entry to liftOverChain table in central database (specified in
 # ~/.hg.conf) so that hgLiftOver will know that this is available:
 hgAddLiftOverChain $tDb $qDb
 _EOF_
     );
   } else {
     $bossScript->add(<<_EOF_
 hgLoadChain -test -noBin -tIndex $tDb chain$QDb $buildDir/$liftOverChainFile
-wget -O bigChain.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/bigChain.as'
-wget -O bigLink.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/bigLink.as'
+wget --no-check-certificate -O bigChain.as 'https://raw.githubusercontent.com/ucscGenomeBrowser/kent/refs/heads/master/src/hg/lib/bigChain.as'
+wget --no-check-certificate -O bigLink.as 'https://raw.githubusercontent.com/ucscGenomeBrowser/kent/refs/heads/master/src/hg/lib/bigLink.as'
 sed 's/.000000//' chain.tab | awk 'BEGIN {OFS="\\t"} {print \$2, \$4, \$5, \$11, 1000, \$8, \$3, \$6, \$7, \$9, \$10, \$1}' > chain${QDb}.tab
 bedToBigBed -type=bed6+6 -as=bigChain.as -tab chain${QDb}.tab $tSizes chain${QDb}.bb
 awk 'BEGIN {OFS="\\t"} {print \$1, \$2, \$3, \$5, \$4}' link.tab | sort -k1,1 -k2,2n > chain${QDb}Link.tab
 bedToBigBed -type=bed4+1 -as=bigLink.as -tab chain${QDb}Link.tab $tSizes chain${QDb}Link.bb
 set totalBases = `ave -col=2 $tSizes | grep "^total" | awk '{printf "%d", \$2}'`
 set basesCovered = `bedSingleCover.pl chain${QDb}Link.tab | ave -col=4 stdin | grep "^total" | awk '{printf "%d", \$2}'`
 set percentCovered = `echo \$basesCovered \$totalBases | awk '{printf "%.3f", 100.0*\$1/\$2}'`
 printf "%d bases of %d (%s%%) in intersection\\n" "\$basesCovered" "\$totalBases" "\$percentCovered" > fb.$tDb.chain.${QDb}Link.txt
 rm -f link.tab chain.tab bigChain.as bigLink.as chain${QDb}.tab chain${QDb}Link.tab
 _EOF_
     );
   }
   $bossScript->execute();
 } # doLoad
@@ -656,30 +660,34 @@
 &checkOptions();
 &usage(1) if (scalar(@ARGV) != 2);
 ($tDb, $qDb) = @ARGV;
 # may be working on a 2bit file that does not have a database browser
 $dbExists = 0;
 $dbExists = 1 if (&HgAutomate::databaseExists($dbHost, $tDb));
 &getSeqAndSizes();
 $QDb = ucfirst($qDb);
 $liftOverChainDir = "$HgAutomate::clusterData/$tDb/$HgAutomate::trackBuild/liftOver";
 $liftOverChainFile = "${tDb}To${QDb}.over.chain.gz";
 $liftOverChainPath = "$liftOverChainDir/$liftOverChainFile";
+$chainRam = $opt_chainRam ? $opt_chainRam : $chainRam;
+$chainCpu = $opt_chainCpu ? $opt_chainCpu : $chainCpu;
+$blatRam = $opt_ram ? $opt_ram : $blatRam;
+$blatCpu = $opt_cpu ? $opt_cpu : $blatCpu;
 my $date = `date +%Y-%m-%d`;
 chomp $date;
 $buildDir = $opt_buildDir ? $opt_buildDir : "$HgAutomate::clusterData/$tDb/$HgAutomate::trackBuild/blat.$qDb.$date";
 if (! -d $buildDir) {
   if ($stepper->stepPrecedes('align', $stepper->getStartStep())) {
     die "$buildDir does not exist; try running again with -buildDir.\n";
   }
   &HgAutomate::mustMkdir($buildDir);
 }
 $stepper->execute();
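
Net effect of the new knobs: the blat (align) step now sizes its parasol batch from the common -ram/-cpu options (defaults 4g and 1 CPU), while the chain step gets its own -chainRam/-chainCpu options (defaults 16g and 1 CPU); both are passed to &HgAutomate::paraRun() instead of the literal /parasol/bin/para commands this commit removes from doAlign. A minimal sketch of what the generated $paraRun text presumably contains, assuming paraRun() simply substitutes the ram/cpu values into the same para make/check/time sequence that used to be written out literally (the helper below is hypothetical, not the actual HgAutomate code):

#!/usr/bin/env perl
# Hypothetical illustration only -- not the real HgAutomate::paraRun().
# It mirrors the literal para commands removed from doAlign above, with the
# ram/cpu sizes substituted, to show what the $paraRun csh fragment likely holds.
use strict;
use warnings;

sub paraRunSketch {
  my ($ram, $cpu) = @_;
  return "/parasol/bin/para -ram=$ram -cpu=$cpu make jobList\n"
       . "/parasol/bin/para check\n"
       . "/parasol/bin/para time > run.time\n"
       . "cat run.time\n";
}

print paraRunSketch('4g', 1);    # blat-step defaults ($blatRam, $blatCpu)
print paraRunSketch('16g', 1);   # chain-step defaults ($chainRam, $chainCpu)

An illustrative invocation with the new options, assuming -ram/-cpu are accepted through the common option spec as the new $opt_ram/$opt_cpu handling implies (fromDb/toDb are placeholders): doSameSpeciesLiftOver.pl -ram=8g -cpu=4 -chainRam=32g -chainCpu=8 fromDb toDb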