2ce16411b0f909e7e1dfd27a01af34316c260f2f hiram Mon Jul 5 11:58:27 2021 -0700 catching up the edge cases for pair wise alignment to assembly hub genome no redmine diff --git src/hg/utils/automation/doBlastzChainNet.pl src/hg/utils/automation/doBlastzChainNet.pl index 536ed63..0417402 100755 --- src/hg/utils/automation/doBlastzChainNet.pl +++ src/hg/utils/automation/doBlastzChainNet.pl @@ -48,31 +48,32 @@ use vars qw/ $opt_blastzOutRoot $opt_swap $opt_chainMinScore $opt_chainLinearGap $opt_tRepeats $opt_qRepeats $opt_readmeOnly $opt_ignoreSelf $opt_syntenicNet $opt_noDbNameCheck $opt_inclHap $opt_noLoadChainSplit $opt_loadChainSplit $opt_swapDir - $opt_asmId + $opt_tAsmId + $opt_qAsmId $opt_skipDownload $opt_trackHub /; # Specify the steps supported with -continue / -stop: my $stepper = new HgStepManager( [ { name => 'partition', func => \&doPartition }, { name => 'blastz', func => \&doBlastzClusterRun }, { name => 'cat', func => \&doCatRun }, { name => 'chainRun', func => \&doChainRun }, { name => 'chainMerge', func => \&doChainMerge }, { name => 'net', func => \&netChains }, { name => 'load', func => \&loadUp }, { name => 'download', func => \&doDownloads }, { name => 'cleanup', func => \&cleanup }, @@ -118,31 +119,33 @@ a new directory: $HgAutomate::clusterData/\$qDb/$HgAutomate::trackBuild/blastz.\$tDb.swap/ -chainMinScore n Add -minScore=n (default: $defaultChainMinScore) to the axtChain command. -chainLinearGap type Add -linearGap= to the axtChain command. (default: loose) -tRepeats table Add -tRepeats=table to netClass (default: rmsk) -qRepeats table Add -qRepeats=table to netClass (default: rmsk) -ignoreSelf Do not assume self alignments even if tDb == qDb -syntenicNet Perform optional syntenicNet step -noDbNameCheck ignore Db name format -inclHap include haplotypes *_hap* in chain/net, default not -loadChainSplit load split chain tables, default is not split tables -swapDir path directory to work in for swap, default: /hive/data/genomes/qDb/bed/blastz.tDb.swap/ - -asmId assemblyHubId full name for assembly hub, + -tAsmId assemblyHubId full name for assembly hub as target: + e.g.: GCF_007474595.1_mLynCan4_v1.p + -qAsmId assemblyHubId full name for assembly hub as target: e.g.: GCF_007474595.1_mLynCan4_v1.p -skipDownload do not construct the downloads directory -trackHub construct big* files for track hub _EOF_ ; print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost, 'workhorse' => $workhorse, 'fileServer' => '', 'bigClusterHub' => $bigClusterHub, 'smallClusterHub' => $smallClusterHub); print STDERR " Automates UCSC's blastz/chain/net pipeline: 1. Big cluster run of blastz. 2. Small cluster consolidation of blastz result files. 3. Small cluster chaining run. @@ -259,31 +262,31 @@ 15. SEQ2_SELF=1 specifies the SEQ2 is already specially split for self alignments and to use SEQ2 sequence for self alignment, not just a copy of SEQ1 16. TMPDIR - specifies directory on cluster node to keep temporary files Typically TMPDIR=/scratch/tmp 17. All other variables in DEF will be ignored! " if ($detailed); exit $status; } # Globals: my %defVars = (); my ($DEF, $tDb, $qDb, $QDb, $isSelf, $selfSplit, $buildDir, $fileServer); -my ($swapDir, $asmId, $splitRef, $inclHap, $secondsStart, $secondsEnd, $dbExists, $qDbExists); +my ($swapDir, $tAsmId, $qAsmId, $splitRef, $inclHap, $secondsStart, $secondsEnd, $dbExists, $qDbExists); sub isInDirList { # Return TRUE if $dir is under (begins with) something in dirList. my ($dir, @dirList) = @_; my $pat = '^(' . join('|', @dirList) . ')(/.*)?$'; return ($dir =~ m@$pat@); } sub enforceClusterNoNo { # Die right away if user is trying to put cluster output somewhere # off-limits. my ($dir, $desc) = @_; if (&isInDirList($dir, @clusterNoNo)) { die "\ncluster outputs are forbidden to go to " . join (' or ', @clusterNoNo) . " so please choose a different " . @@ -306,31 +309,32 @@ @HgAutomate::commonOptionSpec, "blastzOutRoot=s", "swap", "chainMinScore=i", "chainLinearGap=s", "tRepeats=s", "qRepeats=s", "readmeOnly", "ignoreSelf", "syntenicNet", "noDbNameCheck", "inclHap", "noLoadChainSplit", "loadChainSplit", "swapDir=s", - "asmId=s", + "tAsmId=s", + "qAsmId=s", "skipDownload", "trackHub" ); &usage(1) if (!$ok); &usage(0, 1) if ($opt_help); &HgAutomate::processCommonOptions(); my $err = $stepper->processOptions(); usage(1) if ($err); $dbHost = $opt_dbHost if ($opt_dbHost); if ($opt_swap) { if ($opt_continue) { if ($stepper->stepPrecedes($opt_continue, 'net')) { warn "\nIf -swap is specified, then -continue must specify a step ". "of \"net\" or later.\n"; &usage(1); @@ -921,33 +925,34 @@ sub swapGlobals { # Swap our global variables ($buildDir, $tDb, $qDb and %defVars SEQ1/SEQ2) # so that the remaining steps need no tweaks for -swap. $buildDir = $swapDir; my $tmp = $qDb; $qDb = $tDb; $tDb = $tmp; $QDb = $isSelf ? 'Self' : ucfirst($qDb); foreach my $var ('DIR', 'LEN', 'CHUNK', 'LAP', 'SMSK') { $tmp = $defVars{"SEQ1_$var"}; $defVars{"SEQ1_$var"} = $defVars{"SEQ2_$var"}; $defVars{"SEQ2_$var"} = $tmp; } $defVars{'BASE'} = $swapDir; + $tAsmId = $opt_qAsmId ? $opt_qAsmId : ""; + $qAsmId = $opt_tAsmId ? $opt_tAsmId : ""; } - sub doChainMerge { # If -swap, swap chains from other org; otherwise, merge the results # from the chainRun step. if ($opt_swap) { &swapChains(); &swapGlobals(); } else { &postProcessChains(); } } sub netChains { # Turn chains into nets (,axt,maf,.over.chain). # Don't do this for self alignments. @@ -1211,31 +1216,31 @@ # Make an md5sum.txt file. my $net = $isSelf ? "" : "$tDb.$qDb.net.gz"; if (! -s "$runDir/$net") { $net = ""; } my $whatItDoes = "It makes an md5sum.txt file for downloadable files, with relative paths matching what the user will see on the download server, and installs the over.chain file in the liftOver dir."; my $bossScript = new HgRemoteScript("$runDir/makeMd5sum.csh", $workhorse, $runDir, $whatItDoes, $DEF); my $over = $tDb . "To$QDb.over.chain.gz"; my $altOver = "$tDb.$qDb.over.chain.gz"; my $liftOverDir = "$HgAutomate::clusterData/$tDb/$HgAutomate::trackBuild/liftOver"; if ($tDb =~ m/^GC/) { - $liftOverDir = &HgAutomate::asmHubBuildDir($asmId) . "/liftOver"; + $liftOverDir = &HgAutomate::asmHubBuildDir($tAsmId) . "/liftOver"; } $bossScript->add(<<_EOF_ mkdir -p $liftOverDir md5sum $tDb.$qDb.all.chain.gz $net > md5sum.txt _EOF_ ); if (! $isSelf) { my $axt = ($splitRef ? "md5sum axtNet/*.gz >> axtChain/md5sum.txt" : "cd axtNet\nmd5sum *.gz >> ../axtChain/md5sum.txt"); $bossScript->add(<<_EOF_ rm -f $liftOverDir/$over cp -p $altOver $liftOverDir/$over cd .. $axt @@ -1347,37 +1352,44 @@ &commafy($defVars{SEQ2_LAP}) . "."; } $lap .= " Following alignment, the coordinates of the chunk alignments were corrected by the blastz-normalizeLav script written by Scott Schwartz of Penn State."; return $lap; } sub dumpDownloadReadme { # Write a file (README.txt) describing the download files. my ($file) = @_; my $fh = &HgAutomate::mustOpen(">$file"); my ($tGenome, $tDate, $tSource, $tAsmName); if ($tDb =~ m/^GC/) { - ($tGenome, $tDate, $tSource) = &HgAutomate::getAssemblyInfo($dbHost, $asmId); - $tAsmName = $asmId; + ($tGenome, $tDate, $tSource) = &HgAutomate::getAssemblyInfo($dbHost, $tAsmId); + $tAsmName = $tAsmId; } else { ($tGenome, $tDate, $tSource) = &HgAutomate::getAssemblyInfo($dbHost, $tDb); $tAsmName = $tDb; } - my ($qGenome, $qDate, $qSource) = &HgAutomate::getAssemblyInfo($dbHost, $qDb); + my ($qGenome, $qDate, $qSource, $qAsmName); + if ($qDb =~ m/^GC/) { + ($qGenome, $qDate, $qSource) = &HgAutomate::getAssemblyInfo($dbHost, $qAsmId); + $qAsmName = $qAsmId; + } else { + ($qGenome, $qDate, $qSource) = &HgAutomate::getAssemblyInfo($dbHost, $qDb); + $qAsmName = $qDb; + } my $dir = $splitRef ? 'axtNet/*.' : ''; my $synNet = $splitRef ? "mafSynNet/*.maf.gz - filtered net files for syntenic alignments only, in MAF format, see also, description of MAF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format5" : "$tDb.$qDb.synNet.maf.gz - filtered net file for syntenic alignments only, in MAF format, see also, description of MAF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format5 - $tDb.$qDb.syn.net.gz - filtered net file for syntenic alignments only"; my ($matrix, $o, $e, $k, $l, $h, $blastzOther) = &getBlastzParams(); my $defaultMatrix = $defVars{'BLASTZ_Q'} ? '' : ' the default matrix'; my $lap = &describeOverlapping(); my $abridging = ""; @@ -1398,31 +1410,31 @@ alignment coordinates were adjusted) using the restore_rpts program from Penn State."; } } my $desc = $isSelf ? "This directory contains alignments of $tGenome ($tDb, $tDate, $tSource) to itself." : "This directory contains alignments of the following assemblies: - target/reference: $tGenome ($tAsmName, $tDate, $tSource) - query: $qGenome - ($qDb, $qDate, + ($qAsmName, $qDate, $qSource)"; print $fh "$desc Files included in this directory: - md5sum.txt: md5sum checksums for the files in this directory - $tDb.$qDb.all.chain.gz: chained lastz alignments. The chain format is described in http://genome.ucsc.edu/goldenPath/help/chain.html . "; if (! $isSelf) { print $fh " - $tDb.$qDb.net.gz: \"net\" file that describes rearrangements between @@ -1441,31 +1453,31 @@ for $tDb-$qDb "; } if ($opt_swap) { my $TDb = ucfirst($tDb); print $fh "The chainSwap program was used to translate $qDb-referenced chained lastz alignments to $tDb into $tDb-referenced chains aligned to $qDb. See the download directory goldenPath/$qDb/vs$TDb/README.txt for more information about the $qDb-referenced lastz and chaining process. "; } else { print $fh ($isSelf ? "The $tAsmName assembly was aligned to itself" : -"The $tAsmName and $qDb assemblies were aligned"); +"The $tAsmName and $qAsmName assemblies were aligned"); my $chainMinScore = $opt_chainMinScore ? "$opt_chainMinScore" : $defaultChainMinScore; my $chainLinearGap = $opt_chainLinearGap ? "$opt_chainLinearGap" : $defaultChainLinearGap; print $fh " by the lastz alignment program, which is available from Webb Miller's lab at Penn State University (http://www.bx.psu.edu/miller_lab/). $lap $abridging The lastz scoring matrix (Q parameter) used was$defaultMatrix: $matrix with a gap open penalty of O=$o and a gap extension penalty of E=$e. The minimum score for an alignment to be kept was K=$k for the first pass and L=$l for the second pass, which restricted the search space to the @@ -1550,30 +1562,34 @@ my $successFile = "$runDir/$tDb.$qDb.all.chain.gz"; if (! $isSelf && -s "$runDir/$tDb.$qDb.net.gz") { $successFile = "$runDir/$tDb.$qDb.net.gz"; } if (! -e $successFile && ! $opt_debug) { die "installDownloads: looks like previous stage was not successful " . "(can't find $successFile).\n"; } my $goldenPath = $HgAutomate::goldenPath; if ($tDb =~ m/^GC/) { $goldenPath = &HgAutomate::asmHubDownloadDir($tDb); } &dumpDownloadReadme("$runDir/README.txt"); my $over = $tDb . "To$QDb.over.chain.gz"; my $liftOverDir = "$HgAutomate::clusterData/$tDb/$HgAutomate::trackBuild/liftOver"; + if ($tDb =~ m/^GC/) { + $liftOverDir = &HgAutomate::asmHubBuildDir($tAsmId) . "/liftOver"; + } +printf STDERR "# DBG: tDb '%s' have liftOverDir: '%s'\n", $tDb, $liftOverDir; my $gpLiftOverDir = "$goldenPath/$tDb/liftOver"; my $gbdbLiftOverDir = "$HgAutomate::gbdb/$tDb/liftOver"; my $andNets = $isSelf ? "." : ", nets and axtNet,\n" . "# and copies the liftOver chains to the liftOver download dir."; my $whatItDoes = "It creates the download directory for chains$andNets"; my $bossScript = new HgRemoteScript("$runDir/installDownloads.csh", $dbHost, $runDir, $whatItDoes, $DEF); $bossScript->add(<<_EOF_ mkdir -p $goldenPath/$tDb rm -rf $goldenPath/$tDb/vs$QDb mkdir -p $goldenPath/$tDb/vs$QDb cd $goldenPath/$tDb/vs$QDb ln -s $runDir/$tDb.$qDb.all.chain.gz . ln -s $runDir/README.txt . @@ -1888,31 +1904,32 @@ $secondsStart = `date "+%s"`; chomp $secondsStart; ($DEF) = @ARGV; $inclHap = ""; $inclHap = "-inclHap" if ($opt_inclHap); &loadDef($DEF); &checkDef(); my $seq1IsSplit = (`wc -l < $defVars{SEQ1_LEN}` <= $HgAutomate::splitThreshold); my $seq2IsSplit = (`wc -l < $defVars{SEQ2_LEN}` <= $HgAutomate::splitThreshold); # might be an assembly hub build -$asmId = $opt_asmId ? $opt_asmId : ""; +$tAsmId = $opt_tAsmId ? $opt_tAsmId : ""; +$qAsmId = $opt_qAsmId ? $opt_qAsmId : ""; # Undocumented option for quickly generating a README from DEF: if ($opt_readmeOnly) { $splitRef = $opt_swap ? $seq2IsSplit : $seq1IsSplit; &swapGlobals() if $opt_swap; &dumpDownloadReadme("/tmp/README.txt"); exit 0; } my $date = `date +%Y-%m-%d`; chomp $date; $buildDir = $defVars{'BASE'} || "$HgAutomate::clusterData/$tDb/$HgAutomate::trackBuild/blastz.$qDb.$date"; if ($opt_swap) {