88603b198e1787fb26de0479d1782532fbb71b67 hiram Tue Jul 23 10:14:37 2019 -0700 added cytoBand navigation refs #23734 diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index 939efcf..514dee5 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -24,30 +24,31 @@ use vars @HgAutomate::commonOptionVars; use vars @HgStepManager::optionVars; use vars qw/ $opt_buildDir $opt_sourceDir $opt_augustusSpecies $opt_ucscNames /; # Specify the steps supported with -continue / -stop: my $stepper = new HgStepManager( [ { name => 'download', func => \&doDownload }, { name => 'sequence', func => \&doSequence }, { name => 'assemblyGap', func => \&doAssemblyGap }, { name => 'gatewayPage', func => \&doGatewayPage }, + { name => 'cytoBand', func => \&doCytoBand }, { name => 'gc5Base', func => \&doGc5Base }, { name => 'repeatMasker', func => \&doRepeatMasker }, { name => 'simpleRepeat', func => \&doSimpleRepeat }, { name => 'allGaps', func => \&doAllGaps }, { name => 'idKeys', func => \&doIdKeys }, { name => 'windowMasker', func => \&doWindowMasker }, { name => 'addMask', func => \&doAddMask }, { name => 'gapOverlap', func => \&doGapOverlap }, { name => 'tandemDups', func => \&doTandemDups }, { name => 'cpgIslands', func => \&doCpgIslands }, { name => 'augustus', func => \&doAugustus }, { name => 'trackDb', func => \&doTrackDb }, { name => 'cleanup', func => \&doCleanup }, ] ); @@ -95,30 +96,31 @@ ; print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost, 'workhorse' => $workhorse, 'fileServer' => $fileServer, 'bigClusterHub' => $bigClusterHub, 'smallClusterHub' => $smallClusterHub); print STDERR " Automates build of assembly hub. Steps: download: sets up sym link working hierarchy from already mirrored files from NCBI in: $sourceDir/{genbank|refseq}/ sequence: establish AGP and 2bit file from NCBI directory assemblyGap: create assembly and gap bigBed files and indexes for assembly track names gatewayPage: create html/asmId.description.html contents + cytoBand: create cytoBand track and navigation ideogram gc5Base: create bigWig file for gc5Base track repeatMasker: run repeat masker cluster run and create bigBed files for the composite track categories of repeats simpleRepeat: run trf cluster run and create bigBed file for simple repeats allGaps: calculate all actual real gaps due to N's in sequence, can be more than were specified in the AGP file idKeys: calculate md5sum for each sequence in the assembly to be used to find identical sequences in similar assemblies windowMasker: run windowMasker cluster run, create windowMasker bigBed file and compute intersection with repeatMasker results addMask: combine the higher masking of (windowMasker or repeatMasker) with trf simpleRepeats into one 2bit file cpgIslands: run CpG islands cluster runs for both masked and unmasked sequences and create bigBed files for this composite track trackDb: create trackDb.txt file for assembly hub to include all constructed @@ -705,30 +707,63 @@ > \$asmId.description.html 2> \$asmId.names.tab \$HOME/kent/src/hg/utils/automation/genbank/buildStats.pl \\ ../\$asmId.chrom.sizes 2> \$asmId.build.stats.txt touch -r ../download/\${asmId}_assembly_report.txt \$asmId.description.html $photoLink else printf "# gatewayPage step previously completed\\n" 1>&2 exit 0 fi _EOF_ ); $bossScript->execute(); } # gatewayPage ######################################################################### +# * step: cytoBand [workhorse] +sub doCytoBand { + my $runDir = "$buildDir/trackData/cytoBand"; + &HgAutomate::mustMkdir($runDir); + + my $whatItDoes = "construct cytoBand track and navigation ideogram"; + my $bossScript = newBash HgRemoteScript("$runDir/doCytoBand.bash", + $workhorse, $runDir, $whatItDoes); + + if ( ! -s "$buildDir/$asmId.chrom.sizes" ) { + printf STDERR "ERROR: sequence step not completed\n"; + printf STDERR "can not find: $buildDir/$asmId.chrom.sizes\n"; + exit 255; + } + + $bossScript->add(<<_EOF_ +export asmId=$asmId + +if [ ../../\$asmId.chrom.sizes -nt \$asmId.cytoBand.bb ]; then + awk '{printf "%s\\t0\\t%d\\t\\tgneg\\n", \$1, \$2}' ../../\$asmId.chrom.sizes | sort -k1,1 -k2,2n > \$asmId.cytoBand.bed + bedToBigBed -type=bed4+1 -as=\$HOME/kent/src/hg/lib/cytoBand.as -tab \$asmId.cytoBand.bed ../../\$asmId.chrom.sizes \$asmId.cytoBand.bb + + touch -r ../../\$asmId.chrom.sizes \$asmId.cytoBand.bb +else + printf "# cytoBand step previously completed\\n" 1>&2 + exit 0 +fi +_EOF_ + ); + $bossScript->execute(); +} # cytoBand + +######################################################################### # * step: gc5Base [workhorse] sub doGc5Base { my $runDir = "$buildDir/trackData/gc5Base"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "construct gc5Base bigWig track data"; my $bossScript = newBash HgRemoteScript("$runDir/doGc5Base.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId=$asmId if [ ../../\$asmId.2bit -nt \$asmId.gc5Base.bw ]; then hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 test \\ ../../\$asmId.2bit \\ @@ -981,31 +1016,31 @@ ######################################################################### # * step: windowMasker [workhorse] sub doWindowMasker { my $runDir = "$buildDir/trackData/windowMasker"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "run windowMasker procedure"; my $bossScript = newBash HgRemoteScript("$runDir/doWindowMasker.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId=$asmId ### if [ ../../\$asmId.unmasked.2bit -nt fb.\$asmId.rmsk.windowmaskerSdust.txt ]; then -if [ ../../\$asmId.unmasked.2bit -nt faSize.\$asmId.wmsk.sdust.txt ]; then +if [ ../../\$asmId.unmasked.2bit -nt faSize.\$asmId.cleanWMSdust.txt ]; then \$HOME/kent/src/hg/utils/automation/doWindowMasker.pl -stop=twobit -buildDir=`pwd` -dbHost=$dbHost \\ -workhorse=$workhorse -unmaskedSeq=$buildDir/\$asmId.unmasked.2bit \$asmId bedInvert.pl ../../\$asmId.chrom.sizes ../allGaps/\$asmId.allGaps.bed.gz \\ > not.gap.bed bedIntersect -minCoverage=0.0000000014 windowmasker.sdust.bed \\ not.gap.bed stdout | sort -k1,1 -k2,2n > cleanWMask.bed twoBitMask $buildDir/\$asmId.unmasked.2bit cleanWMask.bed \\ \$asmId.cleanWMSdust.2bit twoBitToFa \$asmId.cleanWMSdust.2bit stdout \\ | faSize stdin > faSize.\$asmId.cleanWMSdust.txt zcat ../repeatMasker/\$asmId.sorted.fa.out.gz | sed -e 's/^ *//; /^\$/d;' \\ | egrep -v "^SW|^score" | awk '{printf "%s\\t%d\\t%d\\n", \$5, \$6-1, \$7}' \\ | bedSingleCover.pl stdin > rmsk.bed intersectRmskWM=`bedIntersect -minCoverage=0.0000000014 cleanWMask.bed \\ rmsk.bed stdout | bedSingleCover.pl stdin | ave -col=4 stdin \\