03886581a00e4d626316a657c170287b1f266e35 hiram Mon Sep 12 15:55:14 2022 -0700 beginning a oneAndDone browser build script and therefore adding a new argument to the assembly build for a concise UCSC style dbName refs #29811 diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index f709fac..637607e 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -23,30 +23,31 @@ # Option variable names, both common and peculiar to this script: use vars @HgAutomate::commonOptionVars; use vars @HgStepManager::optionVars; use vars qw/ $opt_buildDir $opt_sourceDir $opt_species $opt_rmskSpecies $opt_ncbiRmsk $opt_noRmsk $opt_augustusSpecies $opt_noAugustus $opt_xenoRefSeq $opt_noXenoRefSeq $opt_ucscNames + $opt_dbName /; # Specify the steps supported with -continue / -stop: my $stepper = new HgStepManager( [ { name => 'download', func => \&doDownload }, { name => 'sequence', func => \&doSequence }, { name => 'assemblyGap', func => \&doAssemblyGap }, { name => 'chromAlias', func => \&doChromAlias }, { name => 'gatewayPage', func => \&doGatewayPage }, { name => 'cytoBand', func => \&doCytoBand }, { name => 'gc5Base', func => \&doGc5Base }, { name => 'repeatMasker', func => \&doRepeatMasker }, { name => 'simpleRepeat', func => \&doSimpleRepeat }, { name => 'allGaps', func => \&doAllGaps }, { name => 'idKeys', func => \&doIdKeys }, @@ -66,30 +67,31 @@ # Option defaults: my $dbHost = 'hgwdev'; my $sourceDir = "/hive/data/outside/ncbi/genomes"; my $species = ""; # usually found in asmId_assembly_report.txt my $ftpDir = ""; # will be determined from given asmId my $rmskSpecies = ""; my $noRmsk = 0; # when RepeatMasker is not possible, such as bacteria my $ncbiRmsk = 0; # when =1 call doRepeatMasker.pl # with -ncbiRmsk=path.out.gz and -liftSpec=... my $augustusSpecies = "human"; my $xenoRefSeq = "/hive/data/genomes/asmHubs/xenoRefSeq"; my $noAugustus = 0; # bacteria do *not* create an augustus track my $noXenoRefSeq = 0; # bacteria do *not* create a xenoRefSeq track my $ucscNames = 0; # default 'FALSE' (== 0) +my $dbName = ""; # default uses NCBI asmId for name, can specify: abcDef1 my $workhorse = "hgwdev"; # default workhorse when none chosen my $fileServer = "hgwdev"; # default when none chosen my $bigClusterHub = "ku"; # default when none chosen my $smallClusterHub = "ku"; # default when none chosen my $base = $0; $base =~ s/^(.*\/)?//; # key is original accession name from the remove.dups.list, value is 1 my %dupAccessionList; sub usage { # Usage / help / self-documentation: my ($status, $detailed) = @_; # Basic help (for incorrect usage): @@ -97,30 +99,31 @@ usage: $base [options] asmId required arguments: asmId - assembly identifier at NCBI FTP site, examples: - GCF_000001405.32_GRCh38.p6 GCF_000001635.24_GRCm38.p4 etc.. options: "; print STDERR $stepper->getOptionHelp(); print STDERR <<_EOF_ -buildDir dir Construct assembly hub in dir instead of default $HgAutomate::clusterData/asmHubs/refseqBuild/GC[AF]/123/456/789/asmId/ -sourceDir dir Find assembly in dir instead of default: $sourceDir/GC[AF]/123/456/789/asmId -ucscNames Translate NCBI/INSDC/RefSeq names to UCSC names default is to use the given NCBI/INSDC/RefSeq names + -dbName <name> name for UCSC style database name, e.g. 'abcDef1' -species <name> use this species designation if there is no asmId_assembly_report.txt with an 'Organism name:' entry to obtain species -rmskSpecies <name> to override default 'species' name for repeat masker the default is found in the asmId_asssembly_report.txt e.g. -rmskSpecies=viruses -noRmsk when RepeatMasker is not possible, such as bacteria -ncbiRmsk use NCBI rm.out.gz file instead of local cluster run for repeat masking -augustusSpecies <human|chicken|zebrafish> default 'human' -noAugustus do *not* create the Augustus gene track -noXenoRefSeq do *not* create the Xeno RefSeq gene track -xenoRefSeq </path/to/xenoRefSeqMrna> - location of xenoRefMrna.fa.gz expanded directory of mrnas/ and xenoRefMrna.sizes, default $xenoRefSeq @@ -191,30 +194,31 @@ # Command line args: asmId my ( $asmId); # Other: my ($buildDir, $secondsStart, $secondsEnd, $assemblySource); sub checkOptions { # Make sure command line options are valid/supported. my $ok = GetOptions(@HgStepManager::optionSpec, 'buildDir=s', 'sourceDir=s', 'rmskSpecies=s', 'noRmsk', 'ncbiRmsk', 'augustusSpecies=s', 'xenoRefSeq=s', + 'dbName=s', 'noXenoRefSeq', 'noAugustus', 'ucscNames', @HgAutomate::commonOptionSpec, ); &usage(1) if (!$ok); &usage(0, 1) if ($opt_help); &HgAutomate::processCommonOptions(); my $err = $stepper->processOptions(); usage(1) if ($err); $dbHost = $opt_dbHost if ($opt_dbHost); } ######################################################################### ######################################################################### @@ -842,35 +846,38 @@ my $agpSource = "$nonNucAsm/unlocalized_scaffolds/AGP"; my $agpOutput = "$runDir/$asmId.nonNucUnlocalized.agp.gz"; my $agpNames = "$runDir/$asmId.nonNucUnlocalized.names"; my $fastaOut = "$runDir/$asmId.nonNucUnlocalized.fa.gz"; $partsDone += 1; if (needsUpdate($nonNucChr2scaf, $agpOutput)) { unlocalizedAgp($nonNucChr2scaf, $agpSource, $agpOutput, $agpNames); `touch -r $nonNucChr2scaf $agpOutput`; } if (needsUpdate($twoBitFile, $fastaOut)) { unlocalizedFasta($nonNucChr2scaf, $twoBitFile, $fastaOut); `touch -r $twoBitFile $fastaOut`; } } + my $defaultName = $asmId; + $defaultName = $dbName if (length($dbName)); + $bossScript->add(<<_EOF_ export asmId="$asmId" +export dbName="$defaultName" - -if [ -s ../\$asmId.chrom.sizes ]; then +if [ -s ../\$dbName.chrom.sizes ]; then printf "sequence step previously completed\\n" 1>&2 exit 0 fi _EOF_ ); ### construct sequence when no AGP files exist if (0 == $partsDone) { printf STDERR "# partsDone is zero, creating fake AGP\n"; if ($ucscNames) { $bossScript->add(<<_EOF_ twoBitToFa ../download/\$asmId.2bit stdout | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" | gzip -c > \$asmId.fa.gz hgFakeAgp -singleContigs -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz zgrep "^>" \$asmId.fa.gz | sed -e 's/>//;' | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\\)/.\\1/;' | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names @@ -880,92 +887,92 @@ $bossScript->add(<<_EOF_ twoBitToFa ../download/\$asmId.2bit stdout | gzip -c > \$asmId.fa.gz hgFakeAgp -singleContigs -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz twoBitInfo ../download/\$asmId.2bit stdout | cut -f1 \\ | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" \\ | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\$\\)/.\\1/;' \\ | awk '{printf "%s\\t%s\\n", \$1, \$2}' | sort > \$asmId.fake.names _EOF_ ); } } else { printf STDERR "partsDone: %d\n", $partsDone; } $bossScript->add(<<_EOF_ -zcat *.agp.gz | gzip > ../\$asmId.agp.gz -faToTwoBit *.fa.gz ../\$asmId.2bit -faToTwoBit -noMask *.fa.gz ../\$asmId.unmasked.2bit -twoBitDup ../\$asmId.unmasked.2bit > \$asmId.dups.txt +zcat *.agp.gz | gzip > ../\$dbName.agp.gz +faToTwoBit *.fa.gz ../\$dbName.2bit +faToTwoBit -noMask *.fa.gz ../\$dbName.unmasked.2bit +twoBitDup ../\$dbName.unmasked.2bit > \$asmId.dups.txt if [ -s "\$asmId.dups.txt" ]; then - printf "ERROR: duplicate sequences found in ../\$asmId.unmasked.2bit\\n" 1>&2 + printf "ERROR: duplicate sequences found in ../\$dbName.unmasked.2bit\\n" 1>&2 cat \$asmId.dups.txt 1>&2 awk '{print \$1}' \$asmId.dups.txt > \$asmId.remove.dups.list - mv ../\$asmId.unmasked.2bit ../\$asmId.unmasked.dups.2bit - twoBitToFa ../\$asmId.unmasked.dups.2bit stdout | faSomeRecords -exclude \\ + mv ../\$dbName.unmasked.2bit ../\$dbName.unmasked.dups.2bit + twoBitToFa ../\$dbName.unmasked.dups.2bit stdout | faSomeRecords -exclude \\ stdin \$asmId.remove.dups.list stdout | gzip -c > \$asmId.noDups.fasta.gz - rm -f ../\$asmId.2bit ../\$asmId.unmasked.2bit - faToTwoBit \$asmId.noDups.fasta.gz ../\$asmId.2bit - faToTwoBit -noMask \$asmId.noDups.fasta.gz ../\$asmId.unmasked.2bit + rm -f ../\$dbName.2bit ../\$dbName.unmasked.2bit + faToTwoBit \$asmId.noDups.fasta.gz ../\$dbName.2bit + faToTwoBit -noMask \$asmId.noDups.fasta.gz ../\$dbName.unmasked.2bit fi gzip -f \$asmId.dups.txt -touch -r ../download/\$asmId.2bit ../\$asmId.2bit -touch -r ../download/\$asmId.2bit ../\$asmId.unmasked.2bit -touch -r ../download/\$asmId.2bit ../\$asmId.agp.gz -twoBitInfo ../\$asmId.2bit stdout | sort -k2nr > ../\$asmId.chrom.sizes -touch -r ../\$asmId.2bit ../\$asmId.chrom.sizes +touch -r ../download/\$asmId.2bit ../\$dbName.2bit +touch -r ../download/\$asmId.2bit ../\$dbName.unmasked.2bit +touch -r ../download/\$asmId.2bit ../\$dbName.agp.gz +twoBitInfo ../\$dbName.2bit stdout | sort -k2nr > ../\$dbName.chrom.sizes +touch -r ../\$dbName.2bit ../\$dbName.chrom.sizes # verify everything is there twoBitInfo ../download/\$asmId.2bit stdout | sort -k2nr > source.\$asmId.chrom.sizes -export newTotal=`ave -col=2 ../\$asmId.chrom.sizes | grep "^total"` +export newTotal=`ave -col=2 ../\$dbName.chrom.sizes | grep "^total"` export oldTotal=`ave -col=2 source.\$asmId.chrom.sizes | grep "^total"` if [ "\$newTotal" != "\$oldTotal" ]; then printf "# ERROR: sequence construction error: not same totals source vs. new:\\n" 1>&2 printf "# \$newTotal != \$oldTotal\\n" 1>&2 exit 255 fi rm source.\$asmId.chrom.sizes -export checkAgp=`checkAgpAndFa ../\$asmId.agp.gz ../\$asmId.2bit 2>&1 | tail -1` +export checkAgp=`checkAgpAndFa ../\$dbName.agp.gz ../\$dbName.2bit 2>&1 | tail -1` if [ "\$checkAgp" != "All AGP and FASTA entries agree - both files are valid" ]; then - printf "# ERROR: checkAgpAndFa \$asmId.agp.gz \$asmId.2bit failing\\n" 1>&2 + printf "# ERROR: checkAgpAndFa \$dbName.agp.gz \$dbName.2bit failing\\n" 1>&2 exit 255 fi _EOF_ ); if ($ucscNames) { $bossScript->add(<<_EOF_ -join -t\$'\\t' <(sort ../\$asmId.chrom.sizes) <(sort \${asmId}*.names) | awk '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$3,\$2,\$1,\$2}' > \$asmId.ncbiToUcsc.lift -join -t\$'\\t' <(sort ../\$asmId.chrom.sizes) <(sort \${asmId}*.names) | awk '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$1,\$2,\$3,\$2}' > \$asmId.ucscToNcbi.lift +join -t\$'\\t' <(sort ../\$dbName.chrom.sizes) <(sort \${asmId}*.names) | awk '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$3,\$2,\$1,\$2}' > \$asmId.ncbiToUcsc.lift +join -t\$'\\t' <(sort ../\$dbName.chrom.sizes) <(sort \${asmId}*.names) | awk '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$1,\$2,\$3,\$2}' > \$asmId.ucscToNcbi.lift export c0=`cat \$asmId.ncbiToUcsc.lift | wc -l` export c1=`cat \$asmId.ucscToNcbi.lift | wc -l` -export c2=`cat ../\$asmId.chrom.sizes | wc -l` +export c2=`cat ../\$dbName.chrom.sizes | wc -l` # verify all names are accounted for if [ "\$c0" -ne "\$c2" ]; then printf "# ERROR: not all names accounted for in \$asmId.ncbiToUcsc.lift" 1>&2 exit 255 fi if [ "\$c1" -ne "\$c2" ]; then printf "# ERROR: not all names accounted for in \$asmId.ucscToNcbi.lift" 1>&2 exit 255 fi _EOF_ ); } $bossScript->add(<<_EOF_ -twoBitToFa ../\$asmId.2bit stdout | faCount stdin | gzip -c > \$asmId.faCount.txt.gz -touch -r ../\$asmId.2bit \$asmId.faCount.txt.gz +twoBitToFa ../\$dbName.2bit stdout | faCount stdin | gzip -c > \$asmId.faCount.txt.gz +touch -r ../\$dbName.2bit \$asmId.faCount.txt.gz zgrep -P "^total\t" \$asmId.faCount.txt.gz > \$asmId.faCount.signature.txt -touch -r ../\$asmId.2bit \$asmId.faCount.signature.txt +touch -r ../\$dbName.2bit \$asmId.faCount.signature.txt _EOF_ ); $bossScript->execute(); } # doSequence ######################################################################### # * step: assemblyGap [workhorse] sub doAssemblyGap { my $runDir = "$buildDir/trackData/assemblyGap"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "construct assembly and gap tracks from AGP file"; my $bossScript = newBash HgRemoteScript("$runDir/doAssemblyGap.bash", $workhorse, $runDir, $whatItDoes); @@ -2023,46 +2030,50 @@ if (length($species) < 1) { if (-s "$asmReport") { $species = `grep -i "organism name:" $asmReport`; chomp $species; $species =~ s/.*organism\s+name:\s+//i; $species =~ s/\s+\(.*//; } else { die "ERROR: no -species specified and can not find $asmReport"; } if (length($species) < 1) { die "ERROR: no -species specified and can not find Organism name: in $asmReport"; } } +$dbName = $opt_dbName ? $opt_dbName : $dbName; $rmskSpecies = $opt_rmskSpecies ? $opt_rmskSpecies : $species; $augustusSpecies = $opt_augustusSpecies ? $opt_augustusSpecies : $augustusSpecies; $xenoRefSeq = $opt_xenoRefSeq ? $opt_xenoRefSeq : $xenoRefSeq; $ucscNames = $opt_ucscNames ? 1 : $ucscNames; # '1' == 'TRUE' $noAugustus = $opt_noAugustus ? 1 : $noAugustus; # '1' == 'TRUE' $noXenoRefSeq = $opt_noXenoRefSeq ? 1 : $noXenoRefSeq; # '1' == 'TRUE' $workhorse = $opt_workhorse ? $opt_workhorse : $workhorse; $bigClusterHub = $opt_bigClusterHub ? $opt_bigClusterHub : $bigClusterHub; $smallClusterHub = $opt_smallClusterHub ? $opt_smallClusterHub : $smallClusterHub; $fileServer = $opt_fileServer ? $opt_fileServer : $fileServer; $ncbiRmsk = $opt_ncbiRmsk ? 1 : 0; $noRmsk = $opt_noRmsk ? 1 : 0; die "can not find assembly source directory\n$assemblySource" if ( ! -d $assemblySource); printf STDERR "# buildDir: %s\n", $buildDir; printf STDERR "# sourceDir %s\n", $sourceDir; +if (length($dbName)) { +printf STDERR "# dbName %s - building in /hive/data/genomes/%s\n", $dbName, $dbName; +} printf STDERR "# augustusSpecies %s\n", $augustusSpecies; printf STDERR "# xenoRefSeq %s\n", $xenoRefSeq; printf STDERR "# assemblySource: %s\n", $assemblySource; printf STDERR "# rmskSpecies %s\n", $rmskSpecies; printf STDERR "# augustusSpecies %s\n", $augustusSpecies; printf STDERR "# ncbiRmsk %s\n", $ncbiRmsk ? "TRUE" : "FALSE"; printf STDERR "# ucscNames %s\n", $ucscNames ? "TRUE" : "FALSE"; printf STDERR "# noAugustus %s\n", $noAugustus ? "TRUE" : "FALSE"; printf STDERR "# noXenoRefSeq %s\n", $noXenoRefSeq ? "TRUE" : "FALSE"; printf STDERR "# noRmsk %s\n", $noRmsk ? "TRUE" : "FALSE"; # Do everything. $stepper->execute(); # Tell the user anything they should know.