24c443b10b97e5b1e81df9ee5cb393131e7b873a hiram Wed May 5 11:40:43 2021 -0700 better handling of assembly hub chain net tracks refs #26988 diff --git src/hg/utils/automation/doBlastzChainNet.pl src/hg/utils/automation/doBlastzChainNet.pl index 4e58c1d..b48cadd 100755 --- src/hg/utils/automation/doBlastzChainNet.pl +++ src/hg/utils/automation/doBlastzChainNet.pl @@ -48,30 +48,31 @@ use vars qw/ $opt_blastzOutRoot $opt_swap $opt_chainMinScore $opt_chainLinearGap $opt_tRepeats $opt_qRepeats $opt_readmeOnly $opt_ignoreSelf $opt_syntenicNet $opt_noDbNameCheck $opt_inclHap $opt_noLoadChainSplit $opt_loadChainSplit $opt_swapDir + $opt_asmId $opt_skipDownload $opt_trackHub /; # Specify the steps supported with -continue / -stop: my $stepper = new HgStepManager( [ { name => 'partition', func => \&doPartition }, { name => 'blastz', func => \&doBlastzClusterRun }, { name => 'cat', func => \&doCatRun }, { name => 'chainRun', func => \&doChainRun }, { name => 'chainMerge', func => \&doChainMerge }, { name => 'net', func => \&netChains }, { name => 'load', func => \&loadUp }, { name => 'download', func => \&doDownloads }, { name => 'cleanup', func => \&cleanup }, @@ -117,30 +118,32 @@ a new directory: $HgAutomate::clusterData/\$qDb/$HgAutomate::trackBuild/blastz.\$tDb.swap/ -chainMinScore n Add -minScore=n (default: $defaultChainMinScore) to the axtChain command. -chainLinearGap type Add -linearGap=<loose|medium|filename> to the axtChain command. (default: loose) -tRepeats table Add -tRepeats=table to netClass (default: rmsk) -qRepeats table Add -qRepeats=table to netClass (default: rmsk) -ignoreSelf Do not assume self alignments even if tDb == qDb -syntenicNet Perform optional syntenicNet step -noDbNameCheck ignore Db name format -inclHap include haplotypes *_hap* in chain/net, default not -loadChainSplit load split chain tables, default is not split tables -swapDir path directory to work in for swap, default: /hive/data/genomes/qDb/bed/blastz.tDb.swap/ + -asmId assemblyHubId full name for assembly hub, + e.g.: GCF_007474595.1_mLynCan4_v1.p -skipDownload do not construct the downloads directory -trackHub construct big* files for track hub _EOF_ ; print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost, 'workhorse' => $workhorse, 'fileServer' => '', 'bigClusterHub' => $bigClusterHub, 'smallClusterHub' => $smallClusterHub); print STDERR " Automates UCSC's blastz/chain/net pipeline: 1. Big cluster run of blastz. 2. Small cluster consolidation of blastz result files. 3. Small cluster chaining run. 4. Sorting and netting of chains on the fileserver @@ -256,31 +259,31 @@ 15. SEQ2_SELF=1 specifies the SEQ2 is already specially split for self alignments and to use SEQ2 sequence for self alignment, not just a copy of SEQ1 16. TMPDIR - specifies directory on cluster node to keep temporary files Typically TMPDIR=/scratch/tmp 17. All other variables in DEF will be ignored! " if ($detailed); exit $status; } # Globals: my %defVars = (); my ($DEF, $tDb, $qDb, $QDb, $isSelf, $selfSplit, $buildDir, $fileServer); -my ($swapDir, $splitRef, $inclHap, $secondsStart, $secondsEnd, $dbExists, $qDbExists); +my ($swapDir, $asmId, $splitRef, $inclHap, $secondsStart, $secondsEnd, $dbExists, $qDbExists); sub isInDirList { # Return TRUE if $dir is under (begins with) something in dirList. my ($dir, @dirList) = @_; my $pat = '^(' . join('|', @dirList) . ')(/.*)?$'; return ($dir =~ m@$pat@); } sub enforceClusterNoNo { # Die right away if user is trying to put cluster output somewhere # off-limits. my ($dir, $desc) = @_; if (&isInDirList($dir, @clusterNoNo)) { die "\ncluster outputs are forbidden to go to " . join (' or ', @clusterNoNo) . " so please choose a different " . @@ -303,30 +306,31 @@ @HgAutomate::commonOptionSpec, "blastzOutRoot=s", "swap", "chainMinScore=i", "chainLinearGap=s", "tRepeats=s", "qRepeats=s", "readmeOnly", "ignoreSelf", "syntenicNet", "noDbNameCheck", "inclHap", "noLoadChainSplit", "loadChainSplit", "swapDir=s", + "asmId=s", "skipDownload", "trackHub" ); &usage(1) if (!$ok); &usage(0, 1) if ($opt_help); &HgAutomate::processCommonOptions(); my $err = $stepper->processOptions(); usage(1) if ($err); $dbHost = $opt_dbHost if ($opt_dbHost); if ($opt_swap) { if ($opt_continue) { if ($stepper->stepPrecedes($opt_continue, 'net')) { warn "\nIf -swap is specified, then -continue must specify a step ". "of \"net\" or later.\n"; &usage(1); @@ -1206,49 +1210,52 @@ return if ($opt_skipDownload); # Make an md5sum.txt file. my $net = $isSelf ? "" : "$tDb.$qDb.net.gz"; if (! -s "$runDir/$net") { $net = ""; } my $whatItDoes = "It makes an md5sum.txt file for downloadable files, with relative paths matching what the user will see on the download server, and installs the over.chain file in the liftOver dir."; my $bossScript = new HgRemoteScript("$runDir/makeMd5sum.csh", $workhorse, $runDir, $whatItDoes, $DEF); my $over = $tDb . "To$QDb.over.chain.gz"; my $altOver = "$tDb.$qDb.over.chain.gz"; my $liftOverDir = "$HgAutomate::clusterData/$tDb/$HgAutomate::trackBuild/liftOver"; + if ($tDb =~ m/^GC/) { + $liftOverDir = &HgAutomate::asmHubBuildDir($asmId) . "/liftOver"; + } $bossScript->add(<<_EOF_ mkdir -p $liftOverDir md5sum $tDb.$qDb.all.chain.gz $net > md5sum.txt _EOF_ ); if (! $isSelf) { my $axt = ($splitRef ? "md5sum axtNet/*.gz >> axtChain/md5sum.txt" : "cd axtNet\nmd5sum *.gz >> ../axtChain/md5sum.txt"); $bossScript->add(<<_EOF_ rm -f $liftOverDir/$over cp -p $altOver $liftOverDir/$over cd .. $axt _EOF_ ); } $bossScript->execute(); -} +} # sub makeDownloads sub getBlastzParams { my %vars; # Return parameters in BLASTZ_Q file, or defaults, for README.txt. my $matrix = " A C G T A 91 -114 -31 -123 C -114 100 -125 -31 G -31 -125 100 -114 T -123 -31 -114 91"; if ($defVars{'BLASTZ_Q'}) { my $readLineLimit = 100; # safety valve to get out if reading nonsense my $linesRead = 0; my $fh = &HgAutomate::mustOpen($defVars{'BLASTZ_Q'}); my $line; @@ -1338,31 +1345,38 @@ "A similar process was followed for $qDb, with chunks of " . &commafy($chunkPlusLap2) . " overlapping by " . &commafy($defVars{SEQ2_LAP}) . "."; } $lap .= " Following alignment, the coordinates of the chunk alignments were corrected by the blastz-normalizeLav script written by Scott Schwartz of Penn State."; return $lap; } sub dumpDownloadReadme { # Write a file (README.txt) describing the download files. my ($file) = @_; my $fh = &HgAutomate::mustOpen(">$file"); - my ($tGenome, $tDate, $tSource) = &HgAutomate::getAssemblyInfo($dbHost, $tDb); + my ($tGenome, $tDate, $tSource, $tAsmName); + if ($tDb =~ m/^GC/) { + ($tGenome, $tDate, $tSource) = &HgAutomate::getAssemblyInfo($dbHost, $asmId); + $tAsmName = $asmId; + } else { + ($tGenome, $tDate, $tSource) = &HgAutomate::getAssemblyInfo($dbHost, $tDb); + $tAsmName = $tDb; + } my ($qGenome, $qDate, $qSource) = &HgAutomate::getAssemblyInfo($dbHost, $qDb); my $dir = $splitRef ? 'axtNet/*.' : ''; my $synNet = $splitRef ? "mafSynNet/*.maf.gz - filtered net files for syntenic alignments only, in MAF format, see also, description of MAF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format5" : "$tDb.$qDb.synNet.maf.gz - filtered net file for syntenic alignments only, in MAF format, see also, description of MAF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format5 - $tDb.$qDb.syn.net.gz - filtered net file for syntenic alignments only"; my ($matrix, $o, $e, $k, $l, $h, $blastzOther) = &getBlastzParams(); my $defaultMatrix = $defVars{'BLASTZ_Q'} ? '' : ' the default matrix'; my $lap = &describeOverlapping(); @@ -1380,31 +1394,31 @@ Transposons that have been inserted since the $qGenome/$tGenome split were removed from the assemblies before alignment using the fasta-subseq and strip_rpts programs from Penn State. The abbreviated genomes were aligned with lastz, and the transposons were then added back in (i.e. the alignment coordinates were adjusted) using the restore_rpts program from Penn State."; } } my $desc = $isSelf ? "This directory contains alignments of $tGenome ($tDb, $tDate, $tSource) to itself." : "This directory contains alignments of the following assemblies: - target/reference: $tGenome - ($tDb, $tDate, + ($tAsmName, $tDate, $tSource) - query: $qGenome ($qDb, $qDate, $qSource)"; print $fh "$desc Files included in this directory: - md5sum.txt: md5sum checksums for the files in this directory - $tDb.$qDb.all.chain.gz: chained lastz alignments. The chain format is described in http://genome.ucsc.edu/goldenPath/help/chain.html . @@ -1426,32 +1440,32 @@ - reciprocalBest/ directory, contains reciprocal-best netted chains for $tDb-$qDb "; } if ($opt_swap) { my $TDb = ucfirst($tDb); print $fh "The chainSwap program was used to translate $qDb-referenced chained lastz alignments to $tDb into $tDb-referenced chains aligned to $qDb. See the download directory goldenPath/$qDb/vs$TDb/README.txt for more information about the $qDb-referenced lastz and chaining process. "; } else { print $fh ($isSelf ? -"The $tDb assembly was aligned to itself" : -"The $tDb and $qDb assemblies were aligned"); +"The $tAsmName assembly was aligned to itself" : +"The $tAsmName and $qDb assemblies were aligned"); my $chainMinScore = $opt_chainMinScore ? "$opt_chainMinScore" : $defaultChainMinScore; my $chainLinearGap = $opt_chainLinearGap ? "$opt_chainLinearGap" : $defaultChainLinearGap; print $fh " by the lastz alignment program, which is available from Webb Miller's lab at Penn State University (http://www.bx.psu.edu/miller_lab/). $lap $abridging The lastz scoring matrix (Q parameter) used was$defaultMatrix: $matrix with a gap open penalty of O=$o and a gap extension penalty of E=$e. The minimum score for an alignment to be kept was K=$k for the first pass and L=$l for the second pass, which restricted the search space to the @@ -1529,75 +1543,87 @@ sub installDownloads { # construct symlinks for released files to download directory # load liftOver chains into hgcentral my $runDir = "$buildDir/axtChain"; # Make sure previous stage was successful. my $successFile = "$runDir/$tDb.$qDb.all.chain.gz"; if (! $isSelf && -s "$runDir/$tDb.$qDb.net.gz") { $successFile = "$runDir/$tDb.$qDb.net.gz"; } if (! -e $successFile && ! $opt_debug) { die "installDownloads: looks like previous stage was not successful " . "(can't find $successFile).\n"; } + my $goldenPath = $HgAutomate::goldenPath; + if ($tDb =~ m/^GC/) { + $goldenPath = &HgAutomate::asmHubDownloadDir($tDb); + } &dumpDownloadReadme("$runDir/README.txt"); my $over = $tDb . "To$QDb.over.chain.gz"; my $liftOverDir = "$HgAutomate::clusterData/$tDb/$HgAutomate::trackBuild/liftOver"; - my $gpLiftOverDir = "$HgAutomate::goldenPath/$tDb/liftOver"; + my $gpLiftOverDir = "$goldenPath/$tDb/liftOver"; my $gbdbLiftOverDir = "$HgAutomate::gbdb/$tDb/liftOver"; my $andNets = $isSelf ? "." : ", nets and axtNet,\n" . "# and copies the liftOver chains to the liftOver download dir."; my $whatItDoes = "It creates the download directory for chains$andNets"; my $bossScript = new HgRemoteScript("$runDir/installDownloads.csh", $dbHost, $runDir, $whatItDoes, $DEF); $bossScript->add(<<_EOF_ -mkdir -p $HgAutomate::goldenPath/$tDb -rm -rf $HgAutomate::goldenPath/$tDb/vs$QDb -mkdir -p $HgAutomate::goldenPath/$tDb/vs$QDb -cd $HgAutomate::goldenPath/$tDb/vs$QDb +mkdir -p $goldenPath/$tDb +rm -rf $goldenPath/$tDb/vs$QDb +mkdir -p $goldenPath/$tDb/vs$QDb +cd $goldenPath/$tDb/vs$QDb ln -s $runDir/$tDb.$qDb.all.chain.gz . ln -s $runDir/README.txt . ln -s $runDir/md5sum.txt . _EOF_ ); if (! $isSelf) { my $axt = ($splitRef ? "mkdir -p axtNet\n" . "ln -s $buildDir/axtNet/*.axt.gz axtNet/" : "ln -s $buildDir/axtNet/$tDb.$qDb.net.axt.gz ."); if ( -s "$runDir/$tDb.$qDb.net.gz") { $bossScript->add(<<_EOF_ ln -s $runDir/$tDb.$qDb.net.gz . _EOF_ ); } $bossScript->add(<<_EOF_ $axt mkdir -p $gpLiftOverDir rm -f $gpLiftOverDir/$over ln -s $liftOverDir/$over $gpLiftOverDir/$over +_EOF_ + ); + if ($tDb !~ m/^GC/) { + $bossScript->add(<<_EOF_ mkdir -p $gbdbLiftOverDir rm -f $gbdbLiftOverDir/$over ln -s $liftOverDir/$over $gbdbLiftOverDir/$over hgAddLiftOverChain -minMatch=0.1 -multiple -path=$gbdbLiftOverDir/$over \\ $tDb $qDb +_EOF_ + ); + } + $bossScript->add(<<_EOF_ # Update (or create) liftOver/md5sum.txt with the new .over.chain.gz. if (-e $gpLiftOverDir/md5sum.txt) then set tmpFile = `mktemp -t tmpMd5.XXXXXX` csh -c "grep -v $over $gpLiftOverDir/md5sum.txt || true" > \$tmpFile md5sum $gpLiftOverDir/$over \\ | sed -e 's\@$gpLiftOverDir/\@\@' >> \$tmpFile sort \$tmpFile > $gpLiftOverDir/md5sum.txt rm \$tmpFile else md5sum $gpLiftOverDir/$over | sed -e 's\@$gpLiftOverDir/\@\@' \\ > $gpLiftOverDir/md5sum.txt endif _EOF_ ); } @@ -1854,30 +1880,33 @@ &usage(1) if (scalar(@ARGV) != 1); $secondsStart = `date "+%s"`; chomp $secondsStart; ($DEF) = @ARGV; $inclHap = ""; $inclHap = "-inclHap" if ($opt_inclHap); &loadDef($DEF); &checkDef(); my $seq1IsSplit = (`wc -l < $defVars{SEQ1_LEN}` <= $HgAutomate::splitThreshold); my $seq2IsSplit = (`wc -l < $defVars{SEQ2_LEN}` <= $HgAutomate::splitThreshold); +# might be an assembly hub build +$asmId = $opt_asmId ? $opt_asmId : ""; + # Undocumented option for quickly generating a README from DEF: if ($opt_readmeOnly) { $splitRef = $opt_swap ? $seq2IsSplit : $seq1IsSplit; &swapGlobals() if $opt_swap; &dumpDownloadReadme("/tmp/README.txt"); exit 0; } my $date = `date +%Y-%m-%d`; chomp $date; $buildDir = $defVars{'BASE'} || "$HgAutomate::clusterData/$tDb/$HgAutomate::trackBuild/blastz.$qDb.$date"; if ($opt_swap) { my $inChain = &getAllChain("$buildDir/axtChain");