24c443b10b97e5b1e81df9ee5cb393131e7b873a hiram Wed May 5 11:40:43 2021 -0700 better handling of assembly hub chain net tracks refs #26988 diff --git src/hg/utils/automation/doBlastzChainNet.pl src/hg/utils/automation/doBlastzChainNet.pl index 4e58c1d..b48cadd 100755 --- src/hg/utils/automation/doBlastzChainNet.pl +++ src/hg/utils/automation/doBlastzChainNet.pl @@ -1,1949 +1,1978 @@ #!/usr/bin/env perl # DO NOT EDIT the /cluster/bin/scripts copy of this file -- # edit ~/kent/src/hg/utils/automation/doBlastzChainNet.pl instead. # $Id: doBlastzChainNet.pl,v 1.33 2010/04/12 16:33:12 hiram Exp $ # to-do items: # - lots of testing # - better logging: right now it just passes stdout and stderr, # leaving redirection to a logfile up to the user # - -swapBlastz, -loadBlastz # - -tDb, -qDb # - -tUnmasked, -qUnmasked # - -axtBlastz # - another Gill wish list item: save a lav header (involves run-blastz-ucsc) # - 2bit / multi-sequence support when abridging? # - reciprocal best? # - hgLoadSeq of query instead of assuming there's a $qDb database? use Getopt::Long; use warnings; use strict; use FindBin qw($Bin); use lib "$Bin"; use HgAutomate; use HgRemoteScript; use HgStepManager; use File::Basename; # Hardcoded paths/command sequences: my $getFileServer = '/cluster/bin/scripts/fileServer'; my $blastzRunUcsc = "$Bin/blastz-run-ucsc"; my $partition = "$Bin/partitionSequence.pl"; my $clusterLocal = '/scratch/hg'; my $clusterSortaLocal = '/iscratch/i'; my @clusterNAS = ('/cluster/bluearc', '/san/sanvol1'); my $clusterNAS = join('/... or ', @clusterNAS) . 
'/...'; my @clusterNoNo = ('/cluster/home', '/projects'); my @fileServerNoNo = ('kkhome', 'kks00'); my @fileServerNoLogin = ('kkusr01', '10.1.1.3', '10.1.10.11', 'sanhead1', 'sanhead2', 'sanhead3', 'sanhead4', 'sanhead5', 'sanhead6', 'sanhead7', 'sanhead8'); # Option variable names, both common and peculiar to doBlastz: use vars @HgAutomate::commonOptionVars; use vars @HgStepManager::optionVars; use vars qw/ $opt_blastzOutRoot $opt_swap $opt_chainMinScore $opt_chainLinearGap $opt_tRepeats $opt_qRepeats $opt_readmeOnly $opt_ignoreSelf $opt_syntenicNet $opt_noDbNameCheck $opt_inclHap $opt_noLoadChainSplit $opt_loadChainSplit $opt_swapDir + $opt_asmId $opt_skipDownload $opt_trackHub /; # Specify the steps supported with -continue / -stop: my $stepper = new HgStepManager( [ { name => 'partition', func => \&doPartition }, { name => 'blastz', func => \&doBlastzClusterRun }, { name => 'cat', func => \&doCatRun }, { name => 'chainRun', func => \&doChainRun }, { name => 'chainMerge', func => \&doChainMerge }, { name => 'net', func => \&netChains }, { name => 'load', func => \&loadUp }, { name => 'download', func => \&doDownloads }, { name => 'cleanup', func => \&cleanup }, { name => 'syntenicNet',func => \&doSyntenicNet } ] ); # Option defaults: # my $bigClusterHub = 'swarm'; my $bigClusterHub = 'ku'; # my $smallClusterHub = 'encodek'; my $smallClusterHub = 'ku'; my $dbHost = 'hgwdev'; my $workhorse = 'hgwdev'; my $defaultChainLinearGap = "loose"; my $defaultChainMinScore = "1000"; # from axtChain itself my $defaultTRepeats = ""; # for netClass option tRepeats my $defaultQRepeats = ""; # for netClass option qRepeats my $defaultSeq1Limit = 30; my $defaultSeq2Limit = 100; sub usage { # Usage / help / self-documentation: my ($status, $detailed) = @_; my $base = $0; $base =~ s/^(.*\/)?//; # Basic help (for incorrect usage): print STDERR " usage: $base DEF options: "; print STDERR $stepper->getOptionHelp(); print STDERR <<_EOF_ -blastzOutRoot dir Directory path where outputs of 
the blastz cluster run will be stored. By default, they will be stored in the $HgAutomate::clusterData build directory , but this option can specify something more cluster- friendly: $clusterNAS . If dir does not already exist it will be created. Blastz outputs are removed in the cleanup step. -swap DEF has already been used to create chains; swap those chains (target for query), then net etc. in a new directory: $HgAutomate::clusterData/\$qDb/$HgAutomate::trackBuild/blastz.\$tDb.swap/ -chainMinScore n Add -minScore=n (default: $defaultChainMinScore) to the axtChain command. -chainLinearGap type Add -linearGap=<loose|medium|filename> to the axtChain command. (default: loose) -tRepeats table Add -tRepeats=table to netClass (default: rmsk) -qRepeats table Add -qRepeats=table to netClass (default: rmsk) -ignoreSelf Do not assume self alignments even if tDb == qDb -syntenicNet Perform optional syntenicNet step -noDbNameCheck ignore Db name format -inclHap include haplotypes *_hap* in chain/net, default not -loadChainSplit load split chain tables, default is not split tables -swapDir path directory to work in for swap, default: /hive/data/genomes/qDb/bed/blastz.tDb.swap/ + -asmId assemblyHubId full name for assembly hub, + e.g.: GCF_007474595.1_mLynCan4_v1.p -skipDownload do not construct the downloads directory -trackHub construct big* files for track hub _EOF_ ; print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost, 'workhorse' => $workhorse, 'fileServer' => '', 'bigClusterHub' => $bigClusterHub, 'smallClusterHub' => $smallClusterHub); print STDERR " Automates UCSC's blastz/chain/net pipeline: 1. Big cluster run of blastz. 2. Small cluster consolidation of blastz result files. 3. Small cluster chaining run. 4. Sorting and netting of chains on the fileserver (no nets for self-alignments). 5. Generation of liftOver-suitable chains from nets+chains on fileserver (not done for self-alignments). 6. 
Generation of axtNet and mafNet files on the fileserver (not for self). 7. Addition of gap/repeat info to nets on hgwdev (not for self). 8. Loading of chain and net tables on hgwdev (no nets for self). 9. Setup of download directory on hgwdev. 10.Optional (-syntenicNet flag): Generation of syntenic mafNet files. DEF is a Scott Schwartz-style bash script containing blastz parameters. This script makes a lot of assumptions about conventional placements of certain files, and what will be in the DEF vars. Stick to the conventions described in the -help output, pray to the cluster gods, and all will go well. :) To use this script outside the UCSC infrastructure, use options: -dbHost=localhost (when there is no local genome database to load results) -smallClusterHub=localhost -bigClusterHub=localhost -fileServer=localhost This assumes the process is performed on your parasol hub machine, and thus all the references to 'localhost' are this parasol hub machine. Verify your .ssh keys are correct: 'ssh localhost' should function OK. -swapDir=/some/path/blastz.targetDb.swap/ work directory for -swap -skipDownload - leaves all constructed files in the working directory -trackHub - constructs bigChain and bigMaf files to use in a track hub "; # Detailed help (-help): print STDERR " Assumptions: 1. $HgAutomate::clusterData/\$db/ is the main directory for database/assembly \$db. $HgAutomate::clusterData/\$tDb/$HgAutomate::trackBuild/blastz.\$qDb.\$date/ will be the directory created for this run, where \$tDb is the target/reference db and \$qDb is the query. (Can be overridden, see #10 below.) $dbHost:$HgAutomate::goldenPath/\$tDb/vs\$QDb/ (or vsSelf) is the directory where downloadable files need to go. 
LiftOver chains (not applicable for self-alignments) go in this file: $HgAutomate::clusterData/\$tDb/$HgAutomate::trackBuild/liftOver/\$tDbTo\$QDb.over.chain.gz a copy is kept here (in case the liftOver/ copy is overwritten): $HgAutomate::clusterData/\$tDb/$HgAutomate::trackBuild/blastz.\$qDb.\$date/\$tDb.\$qDb.over.chain.gz and symbolic links to the liftOver/ file are put here: $dbHost:$HgAutomate::goldenPath/\$tDb/liftOver/\$tDbTo\$QDb.over.chain.gz $dbHost:$HgAutomate::gbdb/\$tDb/liftOver/\$tDbTo\$QDb.over.chain.gz 2. DEF's SEQ1* variables describe the target/reference assembly. DEF's SEQ2* variables describe the query assembly. If those are the same assembly, then we're doing self-alignments and will drop aligned blocks that cross the diagonal. 3. DEF's SEQ1_DIR is either a directory containing one nib file per target sequence (usually chromosome), OR a complete path to a single .2bit file containing all target sequences. This directory should be in $clusterLocal or $clusterSortaLocal . SEQ2_DIR: ditto for query. 4. DEF's SEQ1_LEN is a tab-separated dump of the target database table chromInfo -- or at least a file that contains all sequence names in the first column, and corresponding sizes in the second column. Normally this will be $HgAutomate::clusterData/\$tDb/chrom.sizes, but for a scaffold-based assembly, it is a good idea to put it in $clusterSortaLocal or $clusterNAS because it will be a large file and it is read by blastz-run-ucsc (big cluster script). SEQ2_LEN: ditto for query. 5. DEF's SEQ1_CHUNK and SEQ1_LAP determine the step size and overlap size of chunks into which large target sequences are to be split before alignment. SEQ2_CHUNK and SEQ2_LAP: ditto for query. 6. DEF's SEQ1_LIMIT and SEQ2_LIMIT decide what the maximum number of sequences should be for any partitioned file (the files created in the tParts and qParts directories). This limit only effects SEQ1 or SEQ2 when they are 2bit files. Some 2bit files have too many contigs. 
This reduces the number of blastz hippos (jobs taking forever compared to the other jobs). SEQ1_LIMIT defaults to $defaultSeq1Limit and SEQ2_LIMIT defaults to $defaultSeq2Limit. 7. DEF's BLASTZ_ABRIDGE_REPEATS should be set to something nonzero if abridging of lineage-specific repeats is to be performed. If so, the following additional constraints apply: a. Both target and query assemblies must be structured as one nib file per sequence in SEQ*_DIR (sorry, this rules out scaffold-based assemblies). b. SEQ1_SMSK must be set to a directory containing one file per target sequence, with the name pattern \$seq.out.spec. This file must be a RepeatMasker .out file (usually filtered by DateRepeats). The directory should be under $clusterLocal or $clusterSortaLocal . SEQ2_SMSK: ditto for query. 8. DEF's BLASTZ_[A-Z] variables will be translated into blastz command line options (e.g. BLASTZ_H=foo --> H=foo, BLASTZ_Q=foo --> Q=foo). For human-mouse evolutionary distance/sensitivity, none of these are necessary (blastz-run-ucsc defaults will be used). Here's what we have used for human-fugu and other very-distant pairs: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=$HgAutomate::clusterData/blastz/HoxD55.q Blastz parameter tuning is somewhat of an art and is beyond the scope here. Webb Miller and Jim can provide guidance on how to set these for a new pair of organisms. 9. DEF's PATH variable, if set, must specify a path that contains programs necessary for blastz to run: blastz, and if BLASTZ_ABRIDGE_REPEATS is set, then also fasta-subseq, strip_rpts, restore_rpts, and revcomp. If DEF does not contain a PATH, blastz-run-ucsc will use its own default. 10. DEF's BLASTZ variable can specify an alternate path for blastz. 11. DEF's BASE variable can specify the blastz/chain/net build directory (defaults to $HgAutomate::clusterData/\$tDb/$HgAutomate::trackBuild/blastz.\$qDb.\$date/). 12. 
SEQ?_CTGDIR specifies sequence source with the contents of full chrom sequences and the contig randoms and chrUn. This keeps the contigs separate during the blastz and chaining so that chains won't go through across multiple contigs on the randoms. 13. SEQ?_CTGLEN specifies a length file to be used in conjunction with the special SEQ?_CTGDIR file specified above which contains the random contigs. 14. SEQ?_LIFT specifies a lift file to lift sequences in the SEQ?_CTGDIR to their random and chrUn positions. This is useful for a 2bit file that has both full chrom sequences and the contigs for the randoms. 15. SEQ2_SELF=1 specifies the SEQ2 is already specially split for self alignments and to use SEQ2 sequence for self alignment, not just a copy of SEQ1 16. TMPDIR - specifies directory on cluster node to keep temporary files Typically TMPDIR=/scratch/tmp 17. All other variables in DEF will be ignored! " if ($detailed);
  exit $status;
}

# Globals:
my %defVars = ();
my ($DEF, $tDb, $qDb, $QDb, $isSelf, $selfSplit, $buildDir, $fileServer);
-my ($swapDir, $splitRef, $inclHap, $secondsStart, $secondsEnd, $dbExists, $qDbExists);
+my ($swapDir, $asmId, $splitRef, $inclHap, $secondsStart, $secondsEnd, $dbExists, $qDbExists);

sub isInDirList {
  # Return TRUE if $dir is under (begins with) something in dirList.
  #   $dir     - path to test
  #   @dirList - candidate parent directories
  my ($dir, @dirList) = @_;
  # With no candidates the pattern would collapse to '^()(/.*)?$', which
  # accepts every absolute path; an empty list can contain nothing.
  return if (! @dirList);
  # quotemeta() stops regex metacharacters in a directory name (such as '.')
  # from being treated as pattern syntax.
  my $pat = '^(' . join('|', map { quotemeta($_) } @dirList) . ')(/.*)?$';
  return ($dir =~ m@$pat@);
}

sub enforceClusterNoNo {
  # Die right away if user is trying to put cluster output somewhere
  # off-limits.
  #   $dir  - directory the user wants cluster output written to
  #   $desc - name of the option/variable that supplied $dir (used in messages)
  my ($dir, $desc) = @_;
  if (&isInDirList($dir, @clusterNoNo)) {
    die "\ncluster outputs are forbidden to go to " .
      join (' or ', @clusterNoNo) . " so please choose a different " .
      "$desc instead of $dir .\n\n";
  }
  # use this only if it exists, this is UCSC infrastructure:
  if ( -e $getFileServer ) {
    my $testFileServer = `$getFileServer $dir/`;
    # Backticks return the command output including its trailing newline;
    # without chomp the name can never equal an entry of @fileServerNoNo
    # (and the newline would leak into the message below).
    $testFileServer = '' if (! defined $testFileServer);
    chomp $testFileServer;
    # Compare as plain strings so the host name is not treated as a pattern.
    if (scalar(grep { $_ eq $testFileServer } @fileServerNoNo)) {
      die "\ncluster outputs are forbidden to go to fileservers " .
join (' or ', @fileServerNoNo) . " so please choose a different " . "$desc instead of $dir (which is hosted on $testFileServer).\n\n"; } } } sub checkOptions { # Make sure command line options are valid/supported. my $ok = GetOptions(@HgStepManager::optionSpec, @HgAutomate::commonOptionSpec, "blastzOutRoot=s", "swap", "chainMinScore=i", "chainLinearGap=s", "tRepeats=s", "qRepeats=s", "readmeOnly", "ignoreSelf", "syntenicNet", "noDbNameCheck", "inclHap", "noLoadChainSplit", "loadChainSplit", "swapDir=s", + "asmId=s", "skipDownload", "trackHub" ); &usage(1) if (!$ok); &usage(0, 1) if ($opt_help); &HgAutomate::processCommonOptions(); my $err = $stepper->processOptions(); usage(1) if ($err); $dbHost = $opt_dbHost if ($opt_dbHost); if ($opt_swap) { if ($opt_continue) { if ($stepper->stepPrecedes($opt_continue, 'net')) { warn "\nIf -swap is specified, then -continue must specify a step ". "of \"net\" or later.\n"; &usage(1); } } else { # If -swap is given but -continue is not, force -continue and tell # $stepper to reevaluate options: $opt_continue = 'chainMerge'; $err = $stepper->processOptions(); usage(1) if ($err); } if ($opt_stop) { if ($stepper->stepPrecedes($opt_stop, 'chainMerge')) { warn "\nIf -swap is specified, then -stop must specify a step ". "of \"chainMerge\" or later.\n"; &usage(1); } } } if ($opt_blastzOutRoot) { if ($opt_blastzOutRoot !~ m@^/\S+/\S+@) { warn "\n-blastzOutRoot must specify a full path.\n"; &usage(1); } &enforceClusterNoNo($opt_blastzOutRoot, '-blastzOutRoot'); if (! &isInDirList($opt_blastzOutRoot, @clusterNAS)) { warn "\n-blastzOutRoot is intended to specify something on " . "$clusterNAS, but I'll trust your judgment " . 
"and use $opt_blastzOutRoot\n\n"; } } $workhorse = $opt_workhorse if ($opt_workhorse); $bigClusterHub = $opt_bigClusterHub if ($opt_bigClusterHub); $smallClusterHub = $opt_smallClusterHub if ($opt_smallClusterHub); } ######################################################################### # The following routines were taken almost verbatim from blastz-run-ucsc, # so may be good candidates for libification! unless that would slow down # blastz-run-ucsc... # nfsNoodge() was removed from loadDef() and loadSeqSizes() -- since this # script will not be run on the cluster, we should fully expect files to # be immediately visible. sub loadDef { # Read parameters from a bash script with Scott's param variable names: my ($def) = @_; my $fh = &HgAutomate::mustOpen("$def"); while (<$fh>) { s/^\s*export\s+//; next if (/^\s*#/ || /^\s*$/); if (/(\w+)\s*=\s*(.*)/) { my ($var, $val) = ($1, $2); while ($val =~ /\$(\w+)/) { my $subst = $defVars{$1}; if (defined $subst) { $val =~ s/\$$1/$subst/; } else { die "Can't find value to substitute for \$$1 in $DEF var $var.\n"; } } $defVars{$var} = $val; } } close($fh); } sub loadSeqSizes { # Load up sequence -> size mapping from $sizeFile into $hashRef. my ($sizeFile, $hashRef) = @_; my $fh = &HgAutomate::mustOpen("$sizeFile"); while (<$fh>) { chomp; my ($seq, $size) = split; $hashRef->{$seq} = $size; } close($fh); } # end shared stuff from blastz-run-ucsc ######################################################################### sub requireVar { my ($var) = @_; die "Error: $DEF is missing variable $var\n" if (! defined $defVars{$var}); } sub requirePath { my ($var) = @_; my $val = $defVars{$var}; die "Error: $DEF $var=$val must specify a complete path\n" if ($val !~ m@^/\S+/\S+@); if ( -d $val ) { my $fileCount = `find $val -maxdepth 1 -type f | wc -l`; chomp $fileCount; if ($fileCount < 1) { die "Error: $DEF variable: $var=$val specifies an empty directory.\n"; } } elsif ( ! 
-s $val ) {
    die "Error: $DEF variable: $var=$val is not a file or directory.\n";
  }
}

sub requireNum {
  # Die unless the DEF variable named $var has a value made only of digits.
  my ($var) = @_;
  my $val = $defVars{$var};
  die "Error: $DEF variable $var=$val must specify a number.\n"
    if ($val !~ /^\d+$/);
}

# Patterns for database names that getDbFromPath accepts as a path element.
my $oldDbFormat = '[a-z][a-z](\d+)?';
my $newDbFormat = '[a-z][a-z][a-z][A-Z][a-z][a-z0-9](\d+)?';
my $patchDbFormat = 'grc[A-Z][0-9]+P[0-9]+';

sub getDbFromPath {
  # Require that $val is a full path that contains a recognizable db as
  # one of its elements (possibly the last one).
  #   $var - name of the DEF variable whose value (a path) is examined
  # Returns the database name.  With -noDbNameCheck the name is simply the
  # basename of the path with a trailing ".2bit" removed; otherwise the path
  # must match one of the db name formats above or the script dies.
  my ($var) = @_;
  my $val = $defVars{$var};
  my $db;
  my $dbFromName = basename($val);
  # Strip only a literal ".2bit" suffix.  The previous s/.2bit// had an
  # unescaped dot and no anchor, so it removed the first "<any char>2bit"
  # found anywhere in the name.
  $dbFromName =~ s/\.2bit$//;
  if (! $opt_noDbNameCheck) {
    if ( $val =~ m@^/\S+/($oldDbFormat|$newDbFormat|$patchDbFormat)((\.2bit)|(/(\S+)?))?$@) {
      $db = $1;
    } else {
      die "Error: $DEF variable $var=$val must be a full path with " .
        "a recognizable database as one of its elements.\n"
    }
  }
  if ($opt_noDbNameCheck) {
    $db = $dbFromName;
  } else {
    # NOTE(review): with name checking on, the test above either sets $db or
    # dies, so this fallback looks unreachable -- confirm whether the die was
    # meant to happen only after these path-based guesses fail.
    if (! defined($db)) {
      if ($val =~ m#^/hive/data/genomes/#) {
        $val =~ s#^/hive/data/genomes/##;
        $val =~ s#/.*##;
        $db = $val;
        warn "Warning: assuming database $db from /hive/data/genomes/<db>/ path\n";
      } elsif ($val =~ m#^/scratch/data/#) {
        $val =~ s#^/scratch/data/##;
        $val =~ s#/.*##;
        $db = $val;
        warn "Warning: assuming database $db from /scratch/data/<db>/ path\n";
      }
    }
  }
  return $db;
}

sub checkDef {
  # Make sure %defVars contains what we need and looks consistent with
  # our assumptions.
  foreach my $s ('SEQ1_', 'SEQ2_') {
    foreach my $req ('DIR', 'LEN', 'CHUNK', 'LAP') {
      &requireVar("$s$req");
    }
    &requirePath($s . 'DIR');
    &requirePath($s . 'LEN');
    &requireNum($s . 'CHUNK');
    &requireNum($s . 'LAP');
  }
  $tDb = &getDbFromPath('SEQ1_DIR');
  $qDb = &getDbFromPath('SEQ2_DIR');
  # Same db on both sides means self-alignment unless -ignoreSelf was given.
  $isSelf = $opt_ignoreSelf ? 0 : ($tDb eq $qDb);
  # special split on SEQ2 for Self alignments
  $selfSplit = $defVars{'SEQ2_SELF'} || 0;
  $QDb = $isSelf ? 'Self' : ucfirst($qDb);
  if ($isSelf && $opt_swap) {
    die "-swap is not supported for self-alignments\n" .
"($DEF has $tDb as both target and query).\n"; } HgAutomate::verbose(1, "$DEF looks OK!\n" . "\ttDb=$tDb\n\tqDb=$qDb\n\ts1d=$defVars{SEQ1_DIR}\n" . "\tisSelf=$isSelf\n"); if ($defVars{'SEQ1_SMSK'} || $defVars{'SEQ2_SMSK'} || $defVars{'BLASTZ_ABRIDGE_REPEATS'}) { &requireVar('BLASTZ_ABRIDGE_REPEATS'); foreach my $s ('SEQ1_', 'SEQ2_') { my $var = $s. 'SMSK'; &requireVar($var); &requirePath($var); } HgAutomate::verbose(1, "Abridging repeats!\n"); } } sub doPartition { # Partition the sequence up before blastz. my $paraHub = $opt_blastzOutRoot ? $bigClusterHub : $workhorse; my $runDir = "$buildDir/run.blastz"; my $targetList = "$tDb.lst"; my $queryList = $isSelf ? $targetList : ($opt_ignoreSelf ? "$qDb.ignoreSelf.lst" : "$qDb.lst"); if ($selfSplit) { $queryList = "$qDb.selfSplit.lst" } my $tPartDir = '-lstDir tParts'; my $qPartDir = '-lstDir qParts'; my $outRoot = $opt_blastzOutRoot ? "$opt_blastzOutRoot/psl" : '../psl'; my $seq1Dir = $defVars{'SEQ1_CTGDIR'} || $defVars{'SEQ1_DIR'}; my $seq2Dir = $defVars{'SEQ2_CTGDIR'} || $defVars{'SEQ2_DIR'}; my $seq1Len = $defVars{'SEQ1_CTGLEN'} || $defVars{'SEQ1_LEN'}; my $seq2Len = $defVars{'SEQ2_CTGLEN'} || $defVars{'SEQ2_LEN'}; my $seq1Limit = (defined $defVars{'SEQ1_LIMIT'}) ? $defVars{'SEQ1_LIMIT'} : $defaultSeq1Limit; my $seq2Limit = (defined $defVars{'SEQ2_LIMIT'}) ? $defVars{'SEQ2_LIMIT'} : $defaultSeq2Limit; my $seq2MaxLength = `awk '{print \$2}' $seq2Len | sort -rn | head -1`; chomp $seq2MaxLength; my $bundleParts = 0; # OK to bundle parts list bits into 2bit files when not abridging $bundleParts = 1 if ( ! $defVars{'BLASTZ_ABRIDGE_REPEATS'} ); my $partitionTargetCmd = ("$partition $defVars{SEQ1_CHUNK} $defVars{SEQ1_LAP} " . "$seq1Dir $seq1Len -xdir xdir.sh -rawDir $outRoot $seq1Limit " . "$tPartDir > $targetList"); my $partitionQueryCmd = (($isSelf && (! $selfSplit)) ? '# Self-alignment ==> use target partition for both.' : "$partition $defVars{SEQ2_CHUNK} $defVars{SEQ2_LAP} " . "$seq2Dir $seq2Len $seq2Limit " . 
"$qPartDir > $queryList"); &HgAutomate::mustMkdir($runDir); my $whatItDoes = "It computes partitions of target and query sequences into chunks of the specified size for the blastz cluster run. The actual splitting of sequence is not performed here, but later on by blastz cluster jobs."; my $bossScript = newBash HgRemoteScript("$runDir/doPartition.bash", $paraHub, $runDir, $whatItDoes, $DEF); $bossScript->add(<<_EOF_ $partitionTargetCmd export L1=`wc -l < $targetList` $partitionQueryCmd export L2=`wc -l < $queryList` export L=`echo \$L1 \$L2 | awk '{print \$1*\$2}'` echo "cluster batch jobList size: \$L = \$L1 * \$L2" _EOF_ ); if ($bundleParts) { $bossScript->add(<<_EOF_ if [ -d tParts ]; then echo 'constructing tParts/*.2bit files' ls tParts/*.lst | sed -e 's#tParts/##; s#.lst##;' | while read tPart do sed -e 's#.*.2bit:##;' tParts/\$tPart.lst \\ | twoBitToFa -seqList=stdin $seq1Dir stdout \\ | faToTwoBit stdin tParts/\$tPart.2bit done fi if [ -d qParts ]; then echo 'constructing qParts/*.2bit files' ls qParts/*.lst | sed -e 's#qParts/##; s#.lst##;' | while read qPart do sed -e 's#.*.2bit:##;' qParts/\$qPart.lst \\ | twoBitToFa -seqList=stdin $seq2Dir stdout \\ | faToTwoBit stdin qParts/\$qPart.2bit done fi _EOF_ ); } $bossScript->execute(); my $mkOutRootHost = $opt_blastzOutRoot ? $paraHub : $fileServer; my $mkOutRoot = $opt_blastzOutRoot ? "mkdir -p $opt_blastzOutRoot;" : ""; &HgAutomate::run("$HgAutomate::runSSH $mkOutRootHost " . "'(cd $runDir; $mkOutRoot csh -ef xdir.sh)'"); } sub doBlastzClusterRun { # Set up and perform the big-cluster blastz run. my $paraHub = $bigClusterHub; my $runDir = "$buildDir/run.blastz"; my $targetList = "$tDb.lst"; my $outRoot = $opt_blastzOutRoot ? "$opt_blastzOutRoot/psl" : '../psl'; my $queryList = $isSelf ? $targetList : ($opt_ignoreSelf ? "$qDb.ignoreSelf.lst" : "$qDb.lst"); if ($selfSplit) { $queryList = "$qDb.selfSplit.lst" } # First, make sure we're starting clean. 
if (-e "$runDir/run.time") {
    die "doBlastzClusterRun: looks like this was run successfully already " .
      "(run.time exists). Either run with -continue cat or some later " .
      "stage, or move aside/remove $runDir/ and run again.\n";
  } elsif ((-e "$runDir/gsub" || -e "$runDir/jobList") && ! $opt_debug) {
    die "doBlastzClusterRun: looks like we are not starting with a clean " .
      "slate. Please move aside or remove $runDir/ and run again.\n";
  }
  # Second, make sure we got through the partitioning already
  if (! -e "$runDir/$targetList" && ! $opt_debug) {
    die "doBlastzClusterRun: there's no target list file " .
        "so start over without the -continue align.\n";
  }
  if (! -e "$runDir/$queryList" && ! $opt_debug) {
    # Trailing space added: the two pieces used to join as "fileso start".
    die "doBlastzClusterRun: there's no query list file " .
        "so start over without the -continue align.\n";
  }
  # Job template handed to makeGsub: one blastz-run-ucsc job per
  # (target part, query part) pair, each expected to leave a .psl file.
  my $templateCmd = ("$blastzRunUcsc -outFormat psl " .
		     ($isSelf ? '-dropSelf ' : '') .
		     '$(path1) $(path2) ../DEF ' .
		     '{check out exists ' .
		     $outRoot . '/$(file1)/$(file1)_$(file2).psl }');
  &HgAutomate::makeGsub($runDir, $templateCmd);
  # Leave a marker file naming the hub this batch was set up for.
  `touch "$runDir/para_hub_$paraHub"`;
  my $whatItDoes = "It sets up and performs the big cluster blastz run.";
  my $bossScript = new HgRemoteScript("$runDir/doClusterRun.csh", $paraHub,
				      $runDir, $whatItDoes, $DEF);
  my $paraRun = &HgAutomate::paraRun();
  my $gensub2 = &HgAutomate::gensub2();
  $bossScript->add(<<_EOF_
$gensub2 $targetList $queryList gsub jobList
$paraRun
_EOF_
  );
  $bossScript->execute();
} # sub doBlastzClusterRun {}

sub doCatRun {
  # Do a small cluster run to concatenate the lowest level of chunk result
  # files from the big cluster blastz run. This brings results up to the
  # next level: per-target-chunk results, which may still need to be
  # concatenated into per-target-sequence in the next step after this one --
  # chaining.
  my $paraHub = $smallClusterHub;
  my $runDir = "$buildDir/run.cat";
  # First, make sure we're starting clean.
  if (-e "$runDir/run.time") {
    die "doCatRun: looks like this was run successfully already " .
"(run.time exists). Either run with -continue chainRun or some later " . "stage, or move aside/remove $runDir/ and run again.\n"; } elsif ((-e "$runDir/gsub" || -e "$runDir/jobList") && ! $opt_debug) { die "doCatRun: looks like we are not starting with a clean " . "slate. Please move aside or remove $runDir/ and run again.\n"; } # Make sure previous stage was successful. my $successFile = "$buildDir/run.blastz/run.time"; if (! -e $successFile && ! $opt_debug) { die "doCatRun: looks like previous stage was not successful (can't find " . "$successFile).\n"; } &HgAutomate::mustMkdir($runDir); &HgAutomate::makeGsub($runDir, "./cat.csh \$(path1) {check out exists ../pslParts/\$(file1).psl.gz}"); `touch "$runDir/para_hub_$paraHub"`; my $outRoot = $opt_blastzOutRoot ? "$opt_blastzOutRoot/psl" : '../psl'; my $fh = &HgAutomate::mustOpen(">$runDir/cat.csh"); print $fh <<_EOF_ #!/bin/csh -ef find $outRoot/\$1/ -name "*.psl" | xargs cat | gzip -c > \$2 _EOF_ ; close($fh); my $whatItDoes = "It sets up and performs a small cluster run to concatenate all files in each subdirectory of $outRoot into a per-target-chunk file."; my $bossScript = new HgRemoteScript("$runDir/doCatRun.csh", $paraHub, $runDir, $whatItDoes, $DEF); my $paraRun = &HgAutomate::paraRun(); my $gensub2 = &HgAutomate::gensub2(); $bossScript->add(<<_EOF_ (cd $outRoot; find . -maxdepth 1 -type d | grep '^./') \\ | sed -e 's#/\$##; s#^./##' > tParts.lst chmod a+x cat.csh $gensub2 tParts.lst single gsub jobList mkdir -p ../pslParts $paraRun _EOF_ ); $bossScript->execute(); } # sub doCatRun {} sub makePslPartsLst { # Create a pslParts.lst file the subdirectories of pslParts; if some # are for subsequences of the same sequence, make a single .lst line # for the sequence (single chaining job with subseqs' alignments # catted together). Otherwise (i.e. subdirs that contain small # target seqs glommed together by partitionSequences) make one .lst # line per partition. 
return if ($opt_debug); opendir(P, "$buildDir/pslParts") || die "Couldn't open directory $buildDir/pslParts for reading: $!\n"; my @parts = readdir(P); closedir(P); my $partsLst = "$buildDir/axtChain/run/pslParts.lst"; my $fh = &HgAutomate::mustOpen(">$partsLst"); my %seqs = (); my $count = 0; foreach my $p (@parts) { $p =~ s@^/.*/@@; $p =~ s@/$@@; $p =~ s/\.psl\.gz//; next if ($p eq '.' || $p eq '..'); if ($p =~ m@^(\S+:\S+):\d+-\d+$@) { # Collapse subsequences (subranges of a sequence) down to one entry # per sequence: $seqs{$1} = 1; } else { print $fh "$p\n"; $count++; } } foreach my $p (keys %seqs) { print $fh "$p:\n"; $count++; } close($fh); if ($count < 1) { die "makePslPartsLst: didn't find any pslParts/ items."; } } sub doChainRun { # Do a small cluster run to chain alignments to each target sequence. my $paraHub = $smallClusterHub; my $runDir = "$buildDir/axtChain/run"; # First, make sure we're starting clean. if (-e "$runDir/run.time") { die "doChainRun: looks like this was run successfully already " . "(run.time exists). Either run with -continue chainMerge or some " . "later stage, or move aside/remove $runDir/ and run again.\n"; } elsif ((-e "$runDir/gsub" || -e "$runDir/jobList") && ! $opt_debug) { die "doChainRun: looks like we are not starting with a clean " . "slate. Please move aside or remove $runDir/ and run again.\n"; } # Make sure previous stage was successful. my $successFile = "$buildDir/run.cat/run.time"; if (! -e $successFile && ! $opt_debug) { die "doChainRun: looks like previous stage was not successful (can't " . "find $successFile).\n"; } &HgAutomate::mustMkdir($runDir); &HgAutomate::makeGsub($runDir, "chain.csh \$(file1) {check out line+ chain/\$(file1).chain}"); `touch "$runDir/para_hub_$paraHub"`; my $seq1Dir = $defVars{'SEQ1_CTGDIR'} || $defVars{'SEQ1_DIR'}; my $seq2Dir = $defVars{'SEQ2_CTGDIR'} || $defVars{'SEQ2_DIR'}; my $matrix = $defVars{'BLASTZ_Q'} ? "-scoreScheme=$defVars{BLASTZ_Q} " : ""; my $minScore = $opt_chainMinScore ? 
"-minScore=$opt_chainMinScore" : ""; my $linearGap = $opt_chainLinearGap ? "-linearGap=$opt_chainLinearGap" : "-linearGap=$defaultChainLinearGap"; my $fh = &HgAutomate::mustOpen(">$runDir/chain.csh"); print $fh <<_EOF_ #!/bin/csh -ef zcat ../../pslParts/\$1*.psl.gz \\ | axtChain -psl -verbose=0 $matrix $minScore $linearGap stdin \\ $seq1Dir \\ $seq2Dir \\ stdout \\ | chainAntiRepeat $seq1Dir \\ $seq2Dir \\ stdin \$2 _EOF_ ; if (exists($defVars{'SEQ1_LIFT'})) { print $fh <<_EOF_ set c=\$2:t:r echo "lifting \$2 to \${c}.lifted.chain" liftUp liftedChain/\${c}.lifted.chain \\ $defVars{'SEQ1_LIFT'} carry \$2 rm \$2 mv liftedChain/\${c}.lifted.chain \$2 _EOF_ ; } if (exists($defVars{'SEQ2_LIFT'})) { print $fh <<_EOF_ set c=\$2:t:r echo "lifting \$2 to \${c}.lifted.chain" liftUp -chainQ liftedChain/\${c}.lifted.chain \\ $defVars{'SEQ2_LIFT'} carry \$2 rm \$2 mv liftedChain/\${c}.lifted.chain \$2 _EOF_ ; } close($fh); &makePslPartsLst(); my $whatItDoes = "It sets up and performs a small cluster run to chain all alignments to each target sequence."; my $bossScript = new HgRemoteScript("$runDir/doChainRun.csh", $paraHub, $runDir, $whatItDoes, $DEF); my $paraRun = &HgAutomate::paraRun(); my $gensub2 = &HgAutomate::gensub2(); $bossScript->add(<<_EOF_ chmod a+x chain.csh $gensub2 pslParts.lst single gsub jobList mkdir -p chain liftedChain $paraRun rmdir liftedChain _EOF_ ); $bossScript->execute(); } # sub doChainRun {} sub postProcessChains { # chainMergeSort etc. my $runDir = "$buildDir/axtChain"; my $chain = "$tDb.$qDb.all.chain.gz"; # First, make sure we're starting clean. if (-e "$runDir/$chain") { die "postProcessChains: looks like this was run successfully already " . "($chain exists). Either run with -continue net or some later " . "stage, or move aside/remove $runDir/$chain and run again.\n"; } elsif (-e "$runDir/all.chain" || -e "$runDir/all.chain.gz") { die "postProcessChains: looks like this was run successfully already " . "(all.chain[.gz] exists). 
Either run with -continue net or some later " .
      "stage, or move aside/remove $runDir/all.chain[.gz] and run again.\n";
  } elsif (-e "$runDir/chain" && ! $opt_debug) {
    die "postProcessChains: looks like we are not starting with a clean " .
      "slate. Please move aside or remove $runDir/chain and run again.\n";
  }
  # Make sure previous stage was successful.
  my $successFile = "$buildDir/axtChain/run/run.time";
  if (! -e $successFile && ! $opt_debug) {
    die "postProcessChains: looks like previous stage was not successful " .
      "(can't find $successFile).\n";
  }
  # Merge-sort every per-job chain file into one gzipped all.chain,
  # run remotely on the workhorse.
  my $cmd = join('',
		 "$HgAutomate::runSSH $workhorse nice ",
		 "'find $runDir/run/chain -name \"*.chain\" ",
		 "| chainMergeSort -inputList=stdin ",
		 "| nice gzip -c > $runDir/$chain'");
  &HgAutomate::run($cmd);
  if ($splitRef) {
    &HgAutomate::run("$HgAutomate::runSSH $fileServer nice " .
		     "chainSplit $runDir/chain $runDir/$chain");
  }
  &HgAutomate::nfsNoodge("$runDir/$chain");
} # sub postProcessChains {}

sub getAllChain {
  # Find the most likely candidate for all.chain from a previous run/step.
  #   $runDir - directory to look in
  # Returns the file name (relative to $runDir) of the first candidate that
  # exists, the default name when -debug is on, or undef otherwise.
  my ($runDir) = @_;
  my $chain;
  # Candidates in order of preference.
  my @candidates = ("$tDb.$qDb.all.chain.gz",
		    "$tDb.$qDb.all.chain",
		    "all.chain.gz",
		    "all.chain");
  foreach my $name (@candidates) {
    if (-e "$runDir/$name") {
      $chain = $name;
      last;
    }
  }
  if (! defined $chain && $opt_debug) {
    $chain = "$tDb.$qDb.all.chain.gz";
  }
  return $chain;
}

sub swapChains {
  # chainMerge step for -swap: chainSwap | chainSort.
  my $runDir = "$swapDir/axtChain";
  my $inChain = &getAllChain("$buildDir/axtChain");
  my $swappedChain = "$qDb.$tDb.all.chain.gz";
  # First, make sure we're starting clean.
  if (-e "$runDir/$swappedChain") {
    die "swapChains: looks like this was run successfully already " .
      "($runDir/$swappedChain exists). Either run with -continue net or some " .
"later stage, or move aside/remove $runDir/$swappedChain and run again.\n"; } elsif (-e "$runDir/all.chain" || -e "$runDir/all.chain.gz") { die "swapChains: looks like this was run successfully already " . "($runDir/all.chain[.gz] exists). Either run with -continue net or some " . "later stage, or move aside/remove $runDir/all.chain[.gz] and run again.\n"; } # Main routine already made sure that $buildDir/axtChain/all.chain is there. &HgAutomate::run("$HgAutomate::runSSH $workhorse nice " . "'chainSwap $buildDir/axtChain/$inChain stdout " . "| nice chainSort stdin stdout " . "| nice gzip -c > $runDir/$swappedChain'"); &HgAutomate::nfsNoodge("$runDir/$swappedChain"); if ($splitRef) { &HgAutomate::run("$HgAutomate::runSSH $fileServer nice " . "chainSplit $runDir/chain $runDir/$swappedChain"); } } # sub swapChains {} sub swapGlobals { # Swap our global variables ($buildDir, $tDb, $qDb and %defVars SEQ1/SEQ2) # so that the remaining steps need no tweaks for -swap. $buildDir = $swapDir; my $tmp = $qDb; $qDb = $tDb; $tDb = $tmp; $QDb = $isSelf ? 'Self' : ucfirst($qDb); foreach my $var ('DIR', 'LEN', 'CHUNK', 'LAP', 'SMSK') { $tmp = $defVars{"SEQ1_$var"}; $defVars{"SEQ1_$var"} = $defVars{"SEQ2_$var"}; $defVars{"SEQ2_$var"} = $tmp; } $defVars{'BASE'} = $swapDir; } sub doChainMerge { # If -swap, swap chains from other org; otherwise, merge the results # from the chainRun step. if ($opt_swap) { &swapChains(); &swapGlobals(); } else { &postProcessChains(); } } sub netChains { # Turn chains into nets (,axt,maf,.over.chain). # Don't do this for self alignments. return if ($isSelf); my $runDir = "$buildDir/axtChain"; # First, make sure we're starting clean. if (-d "$buildDir/mafNet") { die "netChains: looks like this was run successfully already " . "(mafNet exists). Either run with -continue load or some later " . "stage, or move aside/remove $buildDir/mafNet " . 
"and $runDir/noClass.net and run again.\n"; } elsif (-e "$runDir/noClass.net") { die "netChains: looks like we are not starting with a " . "clean slate. Please move aside or remove $runDir/noClass.net " . "and run again.\n"; } # Make sure previous stage was successful. my $chain = &getAllChain($runDir); if (! defined $chain && ! $opt_debug) { die "netChains: looks like previous stage was not successful " . "(can't find [$tDb.$qDb.]all.chain[.gz]).\n"; } my $whatItDoes = "It generates nets (without repeat/gap stats -- those are added later on $dbHost) from chains, and generates axt, maf and .over.chain from the nets."; my $bossScript = new HgRemoteScript("$runDir/netChains.csh", $workhorse, $runDir, $whatItDoes, $DEF); $bossScript->add(<<_EOF_ # Make nets ("noClass", i.e. without rmsk/class stats which are added later): chainPreNet $inclHap $chain $defVars{SEQ1_LEN} $defVars{SEQ2_LEN} stdout \\ | chainNet $inclHap stdin -minSpace=1 $defVars{SEQ1_LEN} $defVars{SEQ2_LEN} stdout /dev/null \\ | netSyntenic stdin noClass.net # Make liftOver chains: netChainSubset -verbose=0 noClass.net $chain stdout \\ | chainStitchId stdin stdout | gzip -c > $tDb.$qDb.over.chain.gz _EOF_ ); my $seq1Dir = $defVars{'SEQ1_DIR'}; my $seq2Dir = $defVars{'SEQ2_DIR'}; if ($splitRef) { $bossScript->add(<<_EOF_ # Make axtNet for download: one .axt per $tDb seq. netSplit noClass.net net cd .. mkdir -p axtNet foreach f (axtChain/net/*.net) netToAxt \$f axtChain/chain/\$f:t:r.chain \\ $seq1Dir $seq2Dir stdout \\ | axtSort stdin stdout \\ | gzip -c > axtNet/\$f:t:r.$tDb.$qDb.net.axt.gz end # Make mafNet for multiz: one .maf per $tDb seq. mkdir -p mafNet foreach f (axtNet/*.$tDb.$qDb.net.axt.gz) axtToMaf -tPrefix=$tDb. -qPrefix=$qDb. 
\$f \\ $defVars{SEQ1_LEN} $defVars{SEQ2_LEN} \\ stdout \\ | gzip -c > mafNet/\$f:t:r:r:r:r:r.maf.gz end _EOF_ ); if ($opt_trackHub) { $bossScript->add(<<_EOF_ mkdir -p bigMaf echo "##maf version=1 scoring=blastz" > bigMaf/$tDb.$qDb.net.maf zegrep -h -v "^#" mafNet/*.maf.gz >> bigMaf/$tDb.$qDb.net.maf echo "##eof maf" >> bigMaf/$tDb.$qDb.net.maf gzip bigMaf/$tDb.$qDb.net.maf _EOF_ ); } } else { $bossScript->add(<<_EOF_ # Make axtNet for download: one .axt for all of $tDb. mkdir -p ../axtNet netToAxt -verbose=0 noClass.net $chain \\ $seq1Dir $seq2Dir stdout \\ | axtSort stdin stdout \\ | gzip -c > ../axtNet/$tDb.$qDb.net.axt.gz # Make mafNet for multiz: one .maf for all of $tDb. mkdir -p ../mafNet axtToMaf -tPrefix=$tDb. -qPrefix=$qDb. ../axtNet/$tDb.$qDb.net.axt.gz \\ $defVars{SEQ1_LEN} $defVars{SEQ2_LEN} \\ stdout \\ | gzip -c > ../mafNet/$tDb.$qDb.net.maf.gz _EOF_ ); if ($opt_trackHub) { $bossScript->add(<<_EOF_ mkdir -p ../bigMaf ln -s ../mafNet/$tDb.$qDb.net.maf.gz ../bigMaf _EOF_ ); } } if ($opt_trackHub) { $bossScript->add(<<_EOF_ cd $buildDir/bigMaf wget --no-check-certificate -O bigMaf.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/bigMaf.as' wget --no-check-certificate -O mafSummary.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/mafSummary.as' mafToBigMaf $tDb $tDb.$qDb.net.maf.gz stdout \\ | sort -k1,1 -k2,2n > $tDb.$qDb.net.txt bedToBigBed -type=bed3+1 -as=bigMaf.as -tab \\ $tDb.$qDb.net.txt $defVars{SEQ1_LEN} $tDb.$qDb.net.bb hgLoadMafSummary -minSeqSize=1 -test $tDb $tDb.$qDb.net.summary \\ $tDb.$qDb.net.maf.gz cut -f2- $tDb.$qDb.net.summary.tab | sort -k1,1 -k2,2n \\ > $tDb.$qDb.net.summary.bed bedToBigBed -type=bed3+4 -as=mafSummary.as -tab \\ $tDb.$qDb.net.summary.bed $defVars{SEQ1_LEN} \\ $tDb.$qDb.net.summary.bb rm -f $tDb.$qDb.net.txt $tDb.$qDb.net.summary.tab \\ $tDb.$qDb.net.summary.bed _EOF_ ); } $bossScript->execute(); } # sub netChains {} sub loadUp { # Load chains; add repeat/gap 
stats to net; load nets. my $runDir = "$buildDir/axtChain"; my $QDbLink = "chain$QDb" . "Link"; # First, make sure we're starting clean. if (-e "$runDir/$tDb.$qDb.net" || -e "$runDir/$tDb.$qDb.net.gz") { die "loadUp: looks like this was run successfully already " . "($tDb.$qDb.net[.gz] exists). Either run with -continue download, " . "or move aside/remove $runDir/$tDb.$qDb.net[.gz] and run again.\n"; } # Make sure previous stage was successful. Depends upon what was done: my $otherCheck = "$buildDir/mafNet"; if ($opt_trackHub) { $otherCheck = "$buildDir/bigMaf"; &HgAutomate::nfsNoodge("$otherCheck/$tDb.$qDb.net.maf"); } my $successDir = $isSelf ? "$runDir/$tDb.$qDb.all.chain.gz" : "$otherCheck"; if (! -e $successDir && ! $opt_debug) { die "loadUp: looks like previous stage was not successful " . "(can't find $successDir).\n"; } my $whatItDoes = "It loads the chain tables into $tDb, adds gap/repeat stats to the .net file, and loads the net table."; my $bossScript = new HgRemoteScript("$runDir/loadUp.csh", $dbHost, $runDir, $whatItDoes, $DEF); $bossScript->add(<<_EOF_ # Load chains: _EOF_ ); if ($opt_loadChainSplit && $splitRef) { ### XXX to be done: fixup this loop to make the bigBed files for the chain ####### instead of this database loading table (split table code could be ####### eliminated . . . ) $bossScript->add(<<_EOF_ cd $runDir/chain foreach c (`awk '{print \$1;}' $defVars{SEQ1_LEN}`) set f = \$c.chain if (! -e \$f) then echo no chains for \$c set f = /dev/null endif hgLoadChain $tDb \${c}_chain$QDb \$f end _EOF_ ); } else { if (! 
$opt_trackHub && $dbExists) { $bossScript->add(<<_EOF_ cd $runDir hgLoadChain -tIndex $tDb chain$QDb $tDb.$qDb.all.chain.gz _EOF_ ); } else { $bossScript->add(<<_EOF_ cd $runDir hgLoadChain -test -noBin -tIndex $tDb chain$QDb $tDb.$qDb.all.chain.gz wget --no-check-certificate -O bigChain.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/bigChain.as' wget --no-check-certificate -O bigLink.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/bigLink.as' sed 's/.000000//' chain.tab | awk 'BEGIN {OFS="\\t"} {print \$2, \$4, \$5, \$11, 1000, \$8, \$3, \$6, \$7, \$9, \$10, \$1}' > chain${QDb}.tab bedToBigBed -type=bed6+6 -as=bigChain.as -tab chain${QDb}.tab $defVars{SEQ1_LEN} chain${QDb}.bb awk 'BEGIN {OFS="\\t"} {print \$1, \$2, \$3, \$5, \$4}' link.tab | sort -k1,1 -k2,2n > chain${QDb}Link.tab bedToBigBed -type=bed4+1 -as=bigLink.as -tab chain${QDb}Link.tab $defVars{SEQ1_LEN} chain${QDb}Link.bb set totalBases = `ave -col=2 $defVars{SEQ1_LEN} | grep "^total" | awk '{printf "%d", \$2}'` set basesCovered = `bedSingleCover.pl chain${QDb}Link.tab | ave -col=4 stdin | grep "^total" | awk '{printf "%d", \$2}'` set percentCovered = `echo \$basesCovered \$totalBases | awk '{printf "%.3f", 100.0*\$1/\$2}'` printf "%d bases of %d (%s%%) in intersection\\n" "\$basesCovered" "\$totalBases" "\$percentCovered" > ../fb.$tDb.chain${QDb}Link.txt rm -f link.tab rm -f chain.tab _EOF_ ); } } if (! $isSelf) { my $tRepeats = $opt_tRepeats ? "-tRepeats=$opt_tRepeats" : $defaultTRepeats; my $qRepeats = $opt_qRepeats ? "-qRepeats=$opt_qRepeats" : $defaultQRepeats; if ($opt_swap) { $tRepeats = $opt_qRepeats ? "-tRepeats=$opt_qRepeats" : $defaultQRepeats; $qRepeats = $opt_tRepeats ? "-qRepeats=$opt_tRepeats" : $defaultTRepeats; } if (! 
$opt_trackHub && $dbExists) { if ($qDbExists) { $bossScript->add(<<_EOF_ # Add gap/repeat stats to the net file using database tables: cd $runDir netClass -verbose=0 $tRepeats $qRepeats -noAr noClass.net $tDb $qDb $tDb.$qDb.net # Load nets: netFilter -minGap=10 $tDb.$qDb.net \\ | hgLoadNet -verbose=0 $tDb net$QDb stdin _EOF_ ); } else { $bossScript->add(<<_EOF_ cp -p noClass.net $tDb.$qDb.net netFilter -minGap=10 noClass.net \\ | hgLoadNet -test -noBin -warn -verbose=0 $tDb net$QDb stdin mv align.tab net$QDb.tab _EOF_ ); } $bossScript->add(<<_EOF_ cd $buildDir featureBits $tDb $QDbLink >&fb.$tDb.$QDbLink.txt cat fb.$tDb.$QDbLink.txt _EOF_ ); } else { $bossScript->add(<<_EOF_ cp -p noClass.net $tDb.$qDb.net netFilter -minGap=10 noClass.net \\ | hgLoadNet -test -noBin -warn -verbose=0 $tDb net$QDb stdin mv align.tab net$QDb.tab _EOF_ ); } } $bossScript->execute(); # maybe also peek in trackDb and see if entries need to be added for chain/net } # sub loadUp {} sub makeDownloads { # Compress the netClassed .net for download (other files should have been # compressed already). my $runDir = "$buildDir/axtChain"; if (-s "$runDir/$tDb.$qDb.net") { &HgAutomate::run("$HgAutomate::runSSH $fileServer nice " . "gzip $runDir/$tDb.$qDb.net"); } return if ($opt_skipDownload); # Make an md5sum.txt file. my $net = $isSelf ? "" : "$tDb.$qDb.net.gz"; if (! -s "$runDir/$net") { $net = ""; } my $whatItDoes = "It makes an md5sum.txt file for downloadable files, with relative paths matching what the user will see on the download server, and installs the over.chain file in the liftOver dir."; my $bossScript = new HgRemoteScript("$runDir/makeMd5sum.csh", $workhorse, $runDir, $whatItDoes, $DEF); my $over = $tDb . "To$QDb.over.chain.gz"; my $altOver = "$tDb.$qDb.over.chain.gz"; my $liftOverDir = "$HgAutomate::clusterData/$tDb/$HgAutomate::trackBuild/liftOver"; + if ($tDb =~ m/^GC/) { + $liftOverDir = &HgAutomate::asmHubBuildDir($asmId) . 
"/liftOver"; + } $bossScript->add(<<_EOF_ mkdir -p $liftOverDir md5sum $tDb.$qDb.all.chain.gz $net > md5sum.txt _EOF_ ); if (! $isSelf) { my $axt = ($splitRef ? "md5sum axtNet/*.gz >> axtChain/md5sum.txt" : "cd axtNet\nmd5sum *.gz >> ../axtChain/md5sum.txt"); $bossScript->add(<<_EOF_ rm -f $liftOverDir/$over cp -p $altOver $liftOverDir/$over cd .. $axt _EOF_ ); } $bossScript->execute(); -} +} # sub makeDownloads sub getBlastzParams { my %vars; # Return parameters in BLASTZ_Q file, or defaults, for README.txt. my $matrix = " A C G T A 91 -114 -31 -123 C -114 100 -125 -31 G -31 -125 100 -114 T -123 -31 -114 91"; if ($defVars{'BLASTZ_Q'}) { my $readLineLimit = 100; # safety valve to get out if reading nonsense my $linesRead = 0; my $fh = &HgAutomate::mustOpen($defVars{'BLASTZ_Q'}); my $line; my $matrixFound = 0; while (!$matrixFound && ($linesRead < $readLineLimit) && ($line = <$fh>)) { ++$linesRead; next if (($line =~ m/^#/) || ($line =~ m/^$/)); if ($line =~ m/^\s*A\s+C\s+G\s+T\s*$/) { $matrixFound = 1; } else { chomp $line; $line =~ s/\s+//g; $line =~ s/#.*//; die "can not find tag=value in $defVars{BLASTZ_Q}" if ($line !~ /=/); my ($tag, $value) = split('=',$line); # ignore O E gap_open_penalty gap_extend_penalty next if ($tag eq "O" || $tag eq "E" || $tag eq "gap_open_penalty" || $tag eq "gap_extend_penalty"); $vars{$tag} = $value; } } die "can not find score matrix in $defVars{BLASTZ_Q}" if (!$matrixFound); $line =~ s/^ // if (length($line) > 22); $matrix = ' ' . $line; foreach my $base ('A', 'C', 'G', 'T') { $line = <$fh>; die "Too few lines of $defVars{BLASTZ_Q}" if (! $line); if ($line !~ /^[ACGT]?\s*-?\d+\s+-?\d+\s+-?\d+\s+-?\d+\s*$/) { die "Can't parse this line of $defVars{BLASTZ_Q}:\n$line"; } $line =~ s/^[ACGT] //; $matrix .= " $base " . $line; } chomp $matrix; $line = <$fh>; if ($line && $line =~ /\S/) { warn "\nWarning: BLASTZ_Q matrix file $defVars{BLASTZ_Q} has " . "additional contents after the matrix -- those are ignored " . 
"by blastz.\n\n"; } close($fh); } my $o = $defVars{'BLASTZ_O'} || 400; my $e = $defVars{'BLASTZ_E'} || 30; my $k = $defVars{'BLASTZ_K'} || 3000; my $l = $defVars{'BLASTZ_L'} || 3000; my $h = $defVars{'BLASTZ_H'} || 2000; my $blastzOther = ''; foreach my $var (sort keys %defVars) { if ($var =~ /^BLASTZ_(\w)$/) { my $p = $1; if ($p ne 'K' && $p ne 'L' && $p ne 'H' && $p ne 'Q') { if ($blastzOther eq '') { $blastzOther = 'Other lastz parameters specifically set for this species pair:'; } $blastzOther .= "\n $p=$defVars{$var}"; } } } return ($matrix, $o, $e, $k, $l, $h, $blastzOther); } sub commafy { # Assuming $num is a number, add commas where appropriate. my ($num) = @_; $num =~ s/(\d)(\d\d\d)$/$1,$2/; $num =~ s/(\d)(\d\d\d),/$1,$2,/g; return($num); } sub describeOverlapping { # Return some text describing how large sequences were split. my $lap; my $chunkPlusLap1 = $defVars{'SEQ1_CHUNK'} + $defVars{'SEQ1_LAP'}; my $chunkPlusLap2 = $defVars{'SEQ2_CHUNK'} + $defVars{'SEQ2_LAP'}; if ($chunkPlusLap1 == $chunkPlusLap2) { $lap .= "Any sequences larger\n" . "than " . &commafy($chunkPlusLap1) . " bases were split into chunks of " . &commafy($chunkPlusLap1) . " bases overlapping by " . &commafy($defVars{SEQ1_LAP}) . " bases for alignment."; } else { $lap .= "Any $tDb sequences larger\n" . "than " . &commafy($chunkPlusLap1) . " bases were split into chunks of " . &commafy($chunkPlusLap1) . " bases overlapping by " . &commafy($defVars{SEQ1_LAP}) . " bases for alignment. " . "A similar process was followed for $qDb, with chunks of " . &commafy($chunkPlusLap2) . " overlapping by " . &commafy($defVars{SEQ2_LAP}) . "."; } $lap .= " Following alignment, the coordinates of the chunk alignments were corrected by the blastz-normalizeLav script written by Scott Schwartz of Penn State."; return $lap; } sub dumpDownloadReadme { # Write a file (README.txt) describing the download files. 
my ($file) = @_; my $fh = &HgAutomate::mustOpen(">$file"); - my ($tGenome, $tDate, $tSource) = &HgAutomate::getAssemblyInfo($dbHost, $tDb); + my ($tGenome, $tDate, $tSource, $tAsmName); + if ($tDb =~ m/^GC/) { + ($tGenome, $tDate, $tSource) = &HgAutomate::getAssemblyInfo($dbHost, $asmId); + $tAsmName = $asmId; + } else { + ($tGenome, $tDate, $tSource) = &HgAutomate::getAssemblyInfo($dbHost, $tDb); + $tAsmName = $tDb; + } my ($qGenome, $qDate, $qSource) = &HgAutomate::getAssemblyInfo($dbHost, $qDb); my $dir = $splitRef ? 'axtNet/*.' : ''; my $synNet = $splitRef ? "mafSynNet/*.maf.gz - filtered net files for syntenic alignments only, in MAF format, see also, description of MAF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format5" : "$tDb.$qDb.synNet.maf.gz - filtered net file for syntenic alignments only, in MAF format, see also, description of MAF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format5 - $tDb.$qDb.syn.net.gz - filtered net file for syntenic alignments only"; my ($matrix, $o, $e, $k, $l, $h, $blastzOther) = &getBlastzParams(); my $defaultMatrix = $defVars{'BLASTZ_Q'} ? '' : ' the default matrix'; my $lap = &describeOverlapping(); my $abridging = ""; if ($defVars{'BLASTZ_ABRIDGE_REPEATS'}) { if ($isSelf) { $abridging = " All repetitive sequences identified by RepeatMasker were removed from the assembly before alignment using the fasta-subseq and strip_rpts programs from Penn State. The abbreviated genome was aligned with lastz, and the transposons were then added back in (i.e. the alignment coordinates were adjusted) using the restore_rpts program from Penn State."; } else { $abridging = " Transposons that have been inserted since the $qGenome/$tGenome split were removed from the assemblies before alignment using the fasta-subseq and strip_rpts programs from Penn State. The abbreviated genomes were aligned with lastz, and the transposons were then added back in (i.e. 
the alignment coordinates were adjusted) using the restore_rpts program from Penn State."; } } my $desc = $isSelf ? "This directory contains alignments of $tGenome ($tDb, $tDate, $tSource) to itself." : "This directory contains alignments of the following assemblies: - target/reference: $tGenome - ($tDb, $tDate, + ($tAsmName, $tDate, $tSource) - query: $qGenome ($qDb, $qDate, $qSource)"; print $fh "$desc Files included in this directory: - md5sum.txt: md5sum checksums for the files in this directory - $tDb.$qDb.all.chain.gz: chained lastz alignments. The chain format is described in http://genome.ucsc.edu/goldenPath/help/chain.html . "; if (! $isSelf) { print $fh " - $tDb.$qDb.net.gz: \"net\" file that describes rearrangements between the species and the best $qGenome match to any part of the $tGenome genome. The net format is described in http://genome.ucsc.edu/goldenPath/help/net.html . - $dir$tDb.$qDb.net.axt.gz: chained and netted alignments, i.e. the best chains in the $tGenome genome, with gaps in the best chains filled in by next-best chains where possible. The axt format is described in http://genome.ucsc.edu/goldenPath/help/axt.html . - $synNet - reciprocalBest/ directory, contains reciprocal-best netted chains for $tDb-$qDb "; } if ($opt_swap) { my $TDb = ucfirst($tDb); print $fh "The chainSwap program was used to translate $qDb-referenced chained lastz alignments to $tDb into $tDb-referenced chains aligned to $qDb. See the download directory goldenPath/$qDb/vs$TDb/README.txt for more information about the $qDb-referenced lastz and chaining process. "; } else { print $fh ($isSelf ? -"The $tDb assembly was aligned to itself" : -"The $tDb and $qDb assemblies were aligned"); +"The $tAsmName assembly was aligned to itself" : +"The $tAsmName and $qDb assemblies were aligned"); my $chainMinScore = $opt_chainMinScore ? "$opt_chainMinScore" : $defaultChainMinScore; my $chainLinearGap = $opt_chainLinearGap ? 
"$opt_chainLinearGap" : $defaultChainLinearGap; print $fh " by the lastz alignment program, which is available from Webb Miller's lab at Penn State University (http://www.bx.psu.edu/miller_lab/). $lap $abridging The lastz scoring matrix (Q parameter) used was$defaultMatrix: $matrix with a gap open penalty of O=$o and a gap extension penalty of E=$e. The minimum score for an alignment to be kept was K=$k for the first pass and L=$l for the second pass, which restricted the search space to the regions between two alignments found in the first pass. The minimum score for alignments to be interpolated between was H=$h. $blastzOther The .lav format lastz output was translated to the .psl format with lavToPsl, then chained by the axtChain program.\n Chain minimum score: $chainMinScore, and linearGap matrix of "; if ($chainLinearGap =~ m/loose/) { print $fh "(loose): tablesize 11 smallSize 111 position 1 2 3 11 111 2111 12111 32111 72111 152111 252111 qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600 tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600 bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000 "; } elsif ($chainLinearGap =~ m/medium/) { print $fh "(medium): tableSize 11 smallSize 111 position 1 2 3 11 111 2111 12111 32111 72111 152111 252111 qGap 350 425 450 600 900 2900 22900 57900 117900 217900 317900 tGap 350 425 450 600 900 2900 22900 57900 117900 217900 317900 bothGap 750 825 850 1000 1300 3300 23300 58300 118300 218300 318300 "; } else { print $fh "(specified):\n", `cat $chainLinearGap`, "\n"; } } if (! $isSelf) { print $fh " Chained alignments were processed into nets by the chainNet, netSyntenic, and netClass programs. Best-chain alignments in axt format were extracted by the netToAxt program."; } print $fh " All programs run after lastz were written by Jim Kent at UCSC. 
---------------------------------------------------------------- If you plan to download a large file or multiple files from this directory, we recommend you use ftp rather than downloading the files via our website. To do so, ftp to hgdownload.soe.ucsc.edu, then go to the directory goldenPath/$tDb/vs$QDb/. To download multiple files, use the \"mget\" command: mget <filename1> <filename2> ... - or - mget -a (to download all files in the current directory) All files in this directory are freely available for public use. -------------------------------------------------------------------- References Harris, R.S. (2007) Improved pairwise alignment of genomic DNA Ph.D. Thesis, The Pennsylvania State University Chiaromonte F, Yap VB, Miller W. Scoring pairwise genomic sequence alignments. Pac Symp Biocomput. 2002:115-26. Kent WJ, Baertsch R, Hinrichs A, Miller W, Haussler D. Evolution's cauldron: Duplication, deletion, and rearrangement in the mouse and human genomes. Proc Natl Acad Sci U S A. 2003 Sep 30;100(20):11484-9. Schwartz S, Kent WJ, Smit A, Zhang Z, Baertsch R, Hardison RC, Haussler D, Miller W. Human-Mouse Alignments with BLASTZ. Genome Res. 2003 Jan;13(1):103-7. "; close($fh); } sub installDownloads { # construct symlinks for released files to download directory # load liftOver chains into hgcentral my $runDir = "$buildDir/axtChain"; # Make sure previous stage was successful. my $successFile = "$runDir/$tDb.$qDb.all.chain.gz"; if (! $isSelf && -s "$runDir/$tDb.$qDb.net.gz") { $successFile = "$runDir/$tDb.$qDb.net.gz"; } if (! -e $successFile && ! $opt_debug) { die "installDownloads: looks like previous stage was not successful " . "(can't find $successFile).\n"; } + my $goldenPath = $HgAutomate::goldenPath; + if ($tDb =~ m/^GC/) { + $goldenPath = &HgAutomate::asmHubDownloadDir($tDb); + } &dumpDownloadReadme("$runDir/README.txt"); my $over = $tDb . 
"To$QDb.over.chain.gz"; my $liftOverDir = "$HgAutomate::clusterData/$tDb/$HgAutomate::trackBuild/liftOver"; - my $gpLiftOverDir = "$HgAutomate::goldenPath/$tDb/liftOver"; + my $gpLiftOverDir = "$goldenPath/$tDb/liftOver"; my $gbdbLiftOverDir = "$HgAutomate::gbdb/$tDb/liftOver"; my $andNets = $isSelf ? "." : ", nets and axtNet,\n" . "# and copies the liftOver chains to the liftOver download dir."; my $whatItDoes = "It creates the download directory for chains$andNets"; my $bossScript = new HgRemoteScript("$runDir/installDownloads.csh", $dbHost, $runDir, $whatItDoes, $DEF); $bossScript->add(<<_EOF_ -mkdir -p $HgAutomate::goldenPath/$tDb -rm -rf $HgAutomate::goldenPath/$tDb/vs$QDb -mkdir -p $HgAutomate::goldenPath/$tDb/vs$QDb -cd $HgAutomate::goldenPath/$tDb/vs$QDb +mkdir -p $goldenPath/$tDb +rm -rf $goldenPath/$tDb/vs$QDb +mkdir -p $goldenPath/$tDb/vs$QDb +cd $goldenPath/$tDb/vs$QDb ln -s $runDir/$tDb.$qDb.all.chain.gz . ln -s $runDir/README.txt . ln -s $runDir/md5sum.txt . _EOF_ ); if (! $isSelf) { my $axt = ($splitRef ? "mkdir -p axtNet\n" . "ln -s $buildDir/axtNet/*.axt.gz axtNet/" : "ln -s $buildDir/axtNet/$tDb.$qDb.net.axt.gz ."); if ( -s "$runDir/$tDb.$qDb.net.gz") { $bossScript->add(<<_EOF_ ln -s $runDir/$tDb.$qDb.net.gz . _EOF_ ); } $bossScript->add(<<_EOF_ $axt mkdir -p $gpLiftOverDir rm -f $gpLiftOverDir/$over ln -s $liftOverDir/$over $gpLiftOverDir/$over +_EOF_ + ); + if ($tDb !~ m/^GC/) { + $bossScript->add(<<_EOF_ mkdir -p $gbdbLiftOverDir rm -f $gbdbLiftOverDir/$over ln -s $liftOverDir/$over $gbdbLiftOverDir/$over hgAddLiftOverChain -minMatch=0.1 -multiple -path=$gbdbLiftOverDir/$over \\ $tDb $qDb +_EOF_ + ); + } + $bossScript->add(<<_EOF_ # Update (or create) liftOver/md5sum.txt with the new .over.chain.gz. 
if (-e $gpLiftOverDir/md5sum.txt) then set tmpFile = `mktemp -t tmpMd5.XXXXXX` csh -c "grep -v $over $gpLiftOverDir/md5sum.txt || true" > \$tmpFile md5sum $gpLiftOverDir/$over \\ | sed -e 's\@$gpLiftOverDir/\@\@' >> \$tmpFile sort \$tmpFile > $gpLiftOverDir/md5sum.txt rm \$tmpFile else md5sum $gpLiftOverDir/$over | sed -e 's\@$gpLiftOverDir/\@\@' \\ > $gpLiftOverDir/md5sum.txt endif _EOF_ ); } $bossScript->execute(); # maybe also peek in trackDb and see if entries need to be added for chain/net } sub doDownloads { # Create compressed files for download and make links from test server's # goldenPath/ area. &makeDownloads(); return if ($opt_skipDownload); &installDownloads(); } sub cleanup { # Remove intermediate files. my $runDir = $buildDir; my $outRoot = $opt_blastzOutRoot ? "$opt_blastzOutRoot/psl" : "$buildDir/psl"; my $rootCanal = ($opt_blastzOutRoot ? "rmdir --ignore-fail-on-non-empty $opt_blastzOutRoot" : ''); my $doSymLink = 0; my $baseName = basename($buildDir); my $dirName = dirname($buildDir); $doSymLink = 1 if ($dirName =~ m#.*/$tDb/bed$#); my $whatItDoes = "It cleans up files after a successful blastz/chain/net/install series. It uses rm -f so failures should be ignored (e.g. 
if a partial cleanup has already been performed)."; my $bossScript = new HgRemoteScript("$buildDir/cleanUp.csh", $fileServer, $runDir, $whatItDoes, $DEF); $bossScript->add(<<_EOF_ rm -fr $outRoot/ $rootCanal rm -fr $buildDir/axtChain/run/chain/ rm -fr $buildDir/axtChain/run/err/ rm -fr $buildDir/run.blastz/err/ # avoid no-match error exit when *.2bit does not exist /bin/csh -c "rm -fr $buildDir/run.blastz/tParts/*.2bit || true" /bin/csh -c "rm -fr $buildDir/run.blastz/qParts/*.2bit || true" rm -fr $buildDir/run.cat/err/ rm -f $buildDir/axtChain/noClass.net rm -f $buildDir/run.blastz/batch.bak rm -f $buildDir/run.cat/batch.bak rm -f $buildDir/axtChain/run/batch.bak _EOF_ ); if ($splitRef) { $bossScript->add(<<_EOF_ rm -fr $buildDir/axtChain/net/ rm -fr $buildDir/axtChain/chain/ _EOF_ ); } if ($doSymLink) { $bossScript->add(<<_EOF_ cd $dirName rm -f lastz.$qDb ln -s $baseName lastz.$qDb _EOF_ ); } $bossScript->execute(); } sub doSyntenicNet { # Create syntenic net mafs for multiz my $whatItDoes = "It filters the net for synteny and creates syntenic net MAF files for multiz. Use this option when the query genome is high-coverage and not too distant from the reference. Suppressed unless -syntenicNet is included."; if (not $opt_syntenicNet) { return; } my $runDir = "$buildDir/axtChain"; # First, make sure we're starting clean. my $successDir = "$buildDir/mafSynNet"; if (-e $successDir) { die "doSyntenicNet: looks like this was run successfully already " . "($successDir). To re-run, " . "move aside/remove $successDir and run again.\n"; } # Make sure previous stage was successful. my $successFile = "$runDir/$tDb.$qDb.net.gz"; if (! -e "$successFile" && ! $opt_debug) { die "doSyntenicNet: looks like previous stage was not successful " . 
"(can't find $successFile).\n"; } my $bossScript = new HgRemoteScript("$runDir/netSynteny.csh", $workhorse, $runDir, $whatItDoes, $DEF); if ($opt_loadChainSplit && $splitRef) { $bossScript->add(<<_EOF_ # filter net for synteny and create syntenic net mafs netFilter -syn $tDb.$qDb.net.gz \\ | netSplit stdin synNet chainSplit chain $tDb.$qDb.all.chain.gz cd .. mkdir -p $successDir foreach f (axtChain/synNet/*.net) netToAxt \$f axtChain/chain/\$f:t:r.chain \\ $defVars{'SEQ1_DIR'} $defVars{'SEQ2_DIR'} stdout \\ | axtSort stdin stdout \\ | axtToMaf -tPrefix=$tDb. -qPrefix=$qDb. stdin \\ $defVars{SEQ1_LEN} $defVars{SEQ2_LEN} \\ stdout \\ | gzip -c > mafSynNet/\$f:t:r:r:r:r:r.maf.gz end rm -fr $runDir/synNet rm -fr $runDir/chain cd mafSynNet md5sum *.maf.gz > md5sum.txt _EOF_ ); if (! $opt_skipDownload) { $bossScript->add(<<_EOF_ mkdir -p $HgAutomate::goldenPath/$tDb/vs$QDb/mafSynNet cd $HgAutomate::goldenPath/$tDb/vs$QDb/mafSynNet ln -s $buildDir/mafSynNet/* . _EOF_ ); } } else { # scaffold-based assembly # filter net for synteny and create syntenic net mafs $bossScript->add(<<_EOF_ netFilter -syn $tDb.$qDb.net.gz | gzip -c > $tDb.$qDb.syn.net.gz netChainSubset -verbose=0 $tDb.$qDb.syn.net.gz $tDb.$qDb.all.chain.gz stdout \\ | chainStitchId stdin stdout | gzip -c > $tDb.$qDb.syn.chain.gz _EOF_ ); if (! 
$opt_trackHub && $dbExists) { $bossScript->add(<<_EOF_ set lineCount = `zcat $tDb.$qDb.syn.chain.gz | wc -l` if (\$lineCount > 0) then hgLoadChain -tIndex $tDb chainSyn$QDb $tDb.$qDb.syn.chain.gz endif _EOF_ ); if ($qDbExists) { $bossScript->add(<<_EOF_ netFilter -minGap=10 $tDb.$qDb.syn.net.gz \\ | hgLoadNet -verbose=0 $tDb netSyn$QDb stdin endif _EOF_ ); } } else { $bossScript->add(<<_EOF_ set lineCount = `zcat $tDb.$qDb.syn.chain.gz | wc -l` if (\$lineCount > 0) then hgLoadChain -test -noBin -tIndex $tDb chainSyn$QDb $tDb.$qDb.syn.chain.gz wget --no-check-certificate -O bigChain.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/bigChain.as' wget --no-check-certificate -O bigLink.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/bigLink.as' sed 's/.000000//' chain.tab | awk 'BEGIN {OFS="\\t"} {print \$2, \$4, \$5, \$11, 1000, \$8, \$3, \$6, \$7, \$9, \$10, \$1}' > chainSyn${QDb}.tab bedToBigBed -type=bed6+6 -as=bigChain.as -tab chainSyn${QDb}.tab $defVars{SEQ1_LEN} chainSyn${QDb}.bb awk 'BEGIN {OFS="\\t"} {print \$1, \$2, \$3, \$5, \$4}' link.tab | sort -k1,1 -k2,2n > chainSyn${QDb}Link.tab bedToBigBed -type=bed4+1 -as=bigLink.as -tab chainSyn${QDb}Link.tab $defVars{SEQ1_LEN} chainSyn${QDb}Link.bb set totalBases = `ave -col=2 $defVars{SEQ1_LEN} | grep "^total" | awk '{printf "%d", \$2}'` set basesCovered = `bedSingleCover.pl chainSyn${QDb}Link.tab | ave -col=4 stdin | grep "^total" | awk '{printf "%d", \$2}'` set percentCovered = `echo \$basesCovered \$totalBases | awk '{printf "%.3f", 100.0*\$1/\$2}'` printf "%d bases of %d (%s%%) in intersection\\n" "\$basesCovered" "\$totalBases" "\$percentCovered" > ../fb.$tDb.chainSyn${QDb}Link.txt netFilter -minGap=10 $tDb.$qDb.syn.net.gz \\ | hgLoadNet -test -noBin -warn -verbose=0 $tDb netSyn$QDb stdin mv align.tab netSyn$QDb.tab endif rm -f link.tab rm -f chain.tab _EOF_ ); } $bossScript->add(<<_EOF_ if (\$lineCount > 0) then netToAxt $tDb.$qDb.syn.net.gz 
$tDb.$qDb.all.chain.gz \\ $defVars{'SEQ1_DIR'} $defVars{'SEQ2_DIR'} stdout \\ | axtSort stdin stdout \\ | axtToMaf -tPrefix=$tDb. -qPrefix=$qDb. stdin \\ $defVars{SEQ1_LEN} $defVars{SEQ2_LEN} \\ stdout \\ | gzip -c > $tDb.$qDb.synNet.maf.gz md5sum $tDb.$qDb.syn.net.gz $tDb.$qDb.synNet.maf.gz > synNet.md5sum.txt endif _EOF_ ); if ($opt_trackHub) { $bossScript->add(<<_EOF_ if (\$lineCount > 0) then mkdir -p ../bigMaf cd ../bigMaf wget --no-check-certificate -O bigMaf.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/bigMaf.as' wget --no-check-certificate -O mafSummary.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/mafSummary.as' mafToBigMaf $tDb ../axtChain/$tDb.$qDb.synNet.maf.gz stdout \\ | sort -k1,1 -k2,2n > $tDb.$qDb.synNet.txt bedToBigBed -type=bed3+1 -as=bigMaf.as -tab $tDb.$qDb.synNet.txt \\ $defVars{SEQ1_LEN} $tDb.$qDb.synNet.bb hgLoadMafSummary -minSeqSize=1 -test $tDb $tDb.$qDb.synNet.summary \\ ../axtChain/$tDb.$qDb.synNet.maf.gz cut -f2- $tDb.$qDb.synNet.summary.tab | sort -k1,1 -k2,2n \\ > $tDb.$qDb.synNet.summary.bed bedToBigBed -type=bed3+4 -as=mafSummary.as -tab \\ $tDb.$qDb.synNet.summary.bed \\ $defVars{SEQ1_LEN} $tDb.$qDb.synNet.summary.bb rm -f $tDb.$qDb.synNet.txt $tDb.$qDb.synNet.summary.tab \\ $tDb.$qDb.synNet.summary.bed endif _EOF_ ); } if (! $opt_skipDownload) { $bossScript->add(<<_EOF_ mkdir -p $HgAutomate::goldenPath/$tDb/vs$QDb cd $HgAutomate::goldenPath/$tDb/vs$QDb if (-s $runDir/synNet.md5sum.txt ) then ln -s $runDir/$tDb.$qDb.syn.net.gz . ln -s $runDir/$tDb.$qDb.synNet.maf.gz . cat $runDir/synNet.md5sum.txt >> md5sum.txt sort -u md5sum.txt > tmp.sum cat tmp.sum > md5sum.txt rm -f tmp.sum endif _EOF_ ); } if (! 
$opt_trackHub && $dbExists) {
    $bossScript->add(<<_EOF_
cd "$buildDir"
if (\$lineCount > 0) then
featureBits $tDb chainSyn${QDb}Link >&fb.$tDb.chainSyn${QDb}Link.txt
cat fb.$tDb.chainSyn${QDb}Link.txt
endif
_EOF_
      );
  }
  }
  $bossScript->execute();
}

#########################################################################
#
# -- main --
#
# Top-level driver.  In order, it:
#   1. closes stdin and checks the command-line options;
#   2. requires exactly one positional argument, the DEF file, then loads
#      and checks it;
#   3. sets up the build directory (or, with -swap, the swap directory);
#   4. records whether the target and query databases exist on $dbHost;
#   5. hands control to the step manager ($stepper->execute());
#   6. reports the elapsed wall-clock time and closing reminders.
# The globals assigned here without 'my' ($DEF, $buildDir, $swapDir,
# $splitRef, ...) are not declared in this section.

# Prevent "Suspended (tty input)" hanging:
&HgAutomate::closeStdin();

#$opt_debug = 1;

&checkOptions();

# Exactly one positional argument is accepted; it is taken as $DEF below.
&usage(1) if (scalar(@ARGV) != 1);
# Start time in epoch seconds (from the shell's date command); subtracted
# from $secondsEnd at the bottom to report the elapsed time.
$secondsStart = `date "+%s"`;
chomp $secondsStart;
($DEF) = @ARGV;

# $inclHap is the literal flag "-inclHap" when -inclHap was given, otherwise
# the empty string.
$inclHap = "";
$inclHap = "-inclHap" if ($opt_inclHap);
&loadDef($DEF);
&checkDef();

# A sequence set counts as "split" when its SEQn_LEN file has no more lines
# than $HgAutomate::splitThreshold.
# NOTE(review): presumably SEQn_LEN holds one line per sequence -- confirm.
my $seq1IsSplit = (`wc -l < $defVars{SEQ1_LEN}` <=
                   $HgAutomate::splitThreshold);
my $seq2IsSplit = (`wc -l < $defVars{SEQ2_LEN}` <=
                   $HgAutomate::splitThreshold);

# NOTE(review): the leading '+' characters on the next three lines are
# unified-diff addition markers carried along in this copy of the text;
# they are reproduced here unchanged.
+# might be an assembly hub build
+$asmId = $opt_asmId ? $opt_asmId : "";
+
# Undocumented option for quickly generating a README from DEF:
if ($opt_readmeOnly) {
  # With -swap the query (SEQ2) split flag is used and the globals are
  # swapped before dumpDownloadReadme is called with a fixed /tmp path;
  # the script then exits without running any step.
  $splitRef = $opt_swap ? $seq2IsSplit : $seq1IsSplit;
  &swapGlobals() if $opt_swap;
  &dumpDownloadReadme("/tmp/README.txt");
  exit 0;
}

# Today's date (YYYY-MM-DD) is used only in the default build directory name.
my $date = `date +%Y-%m-%d`;
chomp $date;
# BASE from DEF wins when it is set to a true value; otherwise fall back to
# a dated blastz.$qDb directory under the target's track-build area.
$buildDir = $defVars{'BASE'} ||
  "$HgAutomate::clusterData/$tDb/$HgAutomate::trackBuild/blastz.$qDb.$date";

if ($opt_swap) {
  # -swap starts from an existing all.chain file in the original build
  # directory; die if getAllChain returns undef for it.
  my $inChain = &getAllChain("$buildDir/axtChain");
  if (! defined $inChain) {
    die "-swap: Can't find $buildDir/axtChain/[$tDb.$qDb.]all.chain[.gz]\n" .
        "which is required for -swap.\n";
  }
  # -swapDir overrides the default swap location under the query's
  # track-build area.
  if ($opt_swapDir) {
    $swapDir = $opt_swapDir;
  } else {
    $swapDir = "$HgAutomate::clusterData/$qDb/$HgAutomate::trackBuild/blastz.$tDb.swap";
  }
  &HgAutomate::mustMkdir("$swapDir/axtChain");
  # When swapping, the query's (SEQ2) split flag is the one that applies.
  $splitRef = $seq2IsSplit;
  &HgAutomate::verbose(1, "Swapping from $buildDir/axtChain/$inChain\n" .
                      "to $swapDir/axtChain/$qDb.$tDb.all.chain.gz .\n");
} else {
  if (! -d $buildDir) {
    &HgAutomate::mustMkdir($buildDir);
  }
  # Without -blastzOutRoot, and when the start step comes before chainRun,
  # the build directory itself is passed to enforceClusterNoNo.
  # NOTE(review): assumes stepPrecedes(a, b) is true when step a comes
  # before step b -- confirm in HgStepManager.
  if (! $opt_blastzOutRoot &&
      $stepper->stepPrecedes($stepper->getStartStep(), 'chainRun')) {
    &enforceClusterNoNo($buildDir,
            'blastz/chain/net build directory (or use -blastzOutRoot)');
  }
  $splitRef = $seq1IsSplit;
  &HgAutomate::verbose(1, "Building in $buildDir\n");
}

# Copy DEF into the build directory unless a file named DEF is already there.
if (! -e "$buildDir/DEF") {
  &HgAutomate::run("cp $DEF $buildDir/DEF");
}

# The file server is chosen for whichever directory this run works in.
$fileServer = &HgAutomate::chooseFileServer($opt_swap ? $swapDir : $buildDir);

# NOTE(review): both flags below are computed from $tDb/$qDb as they stand
# before the swapGlobals() call further down; whether swapGlobals() also
# exchanges $dbExists and $qDbExists is not visible here -- confirm.
# may be working on a 2bit file that does not have a database browser
$dbExists = 0;
$dbExists = 1 if (&HgAutomate::databaseExists($dbHost, $tDb));
# may be working with a query that does not have a database
$qDbExists = 0;
$qDbExists = 1 if (&HgAutomate::databaseExists($dbHost, $qDb));

# When running -swap, swapGlobals() happens at the end of the chainMerge step.
# However, if we also use -continue with some step later than chainMerge, we
# need to call swapGlobals before executing the remaining steps.
if ($opt_swap &&
    $stepper->stepPrecedes('chainMerge', $stepper->getStartStep())) {
  &swapGlobals();
}

# Run the requested range of steps.
$stepper->execute();

# Report elapsed wall-clock time as whole minutes plus leftover seconds.
$secondsEnd = `date "+%s"`;
chomp $secondsEnd;
my $elapsedSeconds = $secondsEnd - $secondsStart;
my $elapsedMinutes = int($elapsedSeconds/60);
$elapsedSeconds -= $elapsedMinutes * 60;

HgAutomate::verbose(1,
        "\n *** All done ! Elapsed time: ${elapsedMinutes}m${elapsedSeconds}s\n");
# The README reminder is printed only when 'load' precedes the stop step,
# and the trackDb reminder only when 'net' precedes the stop step.
HgAutomate::verbose(1,
        " *** Make sure that goldenPath/$tDb/vs$QDb/README.txt is accurate.\n")
  if ($stepper->stepPrecedes('load', $stepper->getStopStep()));
HgAutomate::verbose(1,
        " *** Add {chain,net}$QDb tracks to trackDb.ra if necessary.\n")
  if ($stepper->stepPrecedes('net', $stepper->getStopStep()));
HgAutomate::verbose(1,
        "\n\n");