f9ce722d011fb87c9f61a4d892b9140ef7fa9047 hiram Mon Sep 12 20:38:56 2022 -0700 now working through step chromAlias and fixup that procedure to work correctly on ucsc names refs #29811 diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index 637607e..650231e 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -68,30 +68,31 @@ # Option defaults: my $dbHost = 'hgwdev'; my $sourceDir = "/hive/data/outside/ncbi/genomes"; my $species = ""; # usually found in asmId_assembly_report.txt my $ftpDir = ""; # will be determined from given asmId my $rmskSpecies = ""; my $noRmsk = 0; # when RepeatMasker is not possible, such as bacteria my $ncbiRmsk = 0; # when =1 call doRepeatMasker.pl # with -ncbiRmsk=path.out.gz and -liftSpec=... my $augustusSpecies = "human"; my $xenoRefSeq = "/hive/data/genomes/asmHubs/xenoRefSeq"; my $noAugustus = 0; # bacteria do *not* create an augustus track my $noXenoRefSeq = 0; # bacteria do *not* create a xenoRefSeq track my $ucscNames = 0; # default 'FALSE' (== 0) my $dbName = ""; # default uses NCBI asmId for name, can specify: abcDef1 +my $defaultName = ""; # will be asmId or dbName if present my $workhorse = "hgwdev"; # default workhorse when none chosen my $fileServer = "hgwdev"; # default when none chosen my $bigClusterHub = "ku"; # default when none chosen my $smallClusterHub = "ku"; # default when none chosen my $base = $0; $base =~ s/^(.*\/)?//; # key is original accession name from the remove.dups.list, value is 1 my %dupAccessionList; sub usage { # Usage / help / self-documentation: my ($status, $detailed) = @_; # Basic help (for incorrect usage): @@ -182,31 +183,31 @@ 2. $HgAutomate::clusterData/\$db/chrom.sizes contains all sequence names and sizes from \$db.2bit. 3. The \$db.2bit files have already been distributed to cluster-scratch (/scratch/hg or /iscratch, /san etc). 4. template? " if ($detailed); print "\n"; exit $status; } # Globals: # Command line args: asmId my ( $asmId); # Other: -my ($buildDir, $secondsStart, $secondsEnd, $assemblySource); +my ($buildDir, $secondsStart, $secondsEnd, $assemblySource, $asmReport); sub checkOptions { # Make sure command line options are valid/supported. my $ok = GetOptions(@HgStepManager::optionSpec, 'buildDir=s', 'sourceDir=s', 'rmskSpecies=s', 'noRmsk', 'ncbiRmsk', 'augustusSpecies=s', 'xenoRefSeq=s', 'dbName=s', 'noXenoRefSeq', 'noAugustus', 'ucscNames', @@ -846,33 +847,30 @@ my $agpSource = "$nonNucAsm/unlocalized_scaffolds/AGP"; my $agpOutput = "$runDir/$asmId.nonNucUnlocalized.agp.gz"; my $agpNames = "$runDir/$asmId.nonNucUnlocalized.names"; my $fastaOut = "$runDir/$asmId.nonNucUnlocalized.fa.gz"; $partsDone += 1; if (needsUpdate($nonNucChr2scaf, $agpOutput)) { unlocalizedAgp($nonNucChr2scaf, $agpSource, $agpOutput, $agpNames); `touch -r $nonNucChr2scaf $agpOutput`; } if (needsUpdate($twoBitFile, $fastaOut)) { unlocalizedFasta($nonNucChr2scaf, $twoBitFile, $fastaOut); `touch -r $twoBitFile $fastaOut`; } } - my $defaultName = $asmId; - $defaultName = $dbName if (length($dbName)); - $bossScript->add(<<_EOF_ export asmId="$asmId" export dbName="$defaultName" if [ -s ../\$dbName.chrom.sizes ]; then printf "sequence step previously completed\\n" 1>&2 exit 0 fi _EOF_ ); ### construct sequence when no AGP files exist if (0 == $partsDone) { printf STDERR "# partsDone is zero, creating fake AGP\n"; @@ -965,31 +963,31 @@ ); $bossScript->execute(); } # doSequence ######################################################################### # * step: assemblyGap [workhorse] sub doAssemblyGap { my $runDir = "$buildDir/trackData/assemblyGap"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "construct assembly and gap tracks from AGP file"; my $bossScript = newBash HgRemoteScript("$runDir/doAssemblyGap.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ -export asmId=$asmId +export asmId="$defaultName" if [ ../../\$asmId.agp.gz -nt \$asmId.assembly.bb ]; then zcat ../../\$asmId.agp.gz | grep -v "^#" | awk '\$5 != "N" && \$5 != "U"' \\ | awk '{printf "%s\\t%d\\t%d\\t%s\\t0\\t%s\\n", \$1, \$2-1, \$3, \$6, \$9}' \\ | sort -k1,1 -k2,2n > \$asmId.assembly.bed zcat ../../\$asmId.agp.gz | grep -v "^#" | awk '\$5 == "N" || \$5 == "U"' \\ | awk '{printf "%s\\t%d\\t%d\\t%s\\n", \$1, \$2-1, \$3, \$8}' \\ | sort -k1,1 -k2,2n > \$asmId.gap.bed bedToBigBed -extraIndex=name -verbose=0 \$asmId.assembly.bed \\ ../../\$asmId.chrom.sizes \$asmId.assembly.bb touch -r ../../\$asmId.agp.gz \$asmId.assembly.bb \$HOME/kent/src/hg/utils/automation/genbank/nameToIx.pl \\ @@ -1015,31 +1013,31 @@ $bossScript->execute(); } # assemblyGap ######################################################################### # * step: chromAlias [workhorse] sub doChromAlias { my $runDir = "$buildDir/trackData/chromAlias"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "construct asmId.chromAlias.txt for alias name recognition"; my $bossScript = newBash HgRemoteScript("$runDir/doChromAlias.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export buildDir=$buildDir -export asmId=$asmId +export asmId="$defaultName" \$HOME/kent/src/hg/utils/automation/asmHubChromAlias.pl \\ \${asmId} | sort > \${asmId}.chromAlias.txt \$HOME/kent/src/hg/utils/automation/aliasTextToBed.pl \\ -chromSizes=\$buildDir/\$asmId.chrom.sizes \\ -aliasText=\${asmId}.chromAlias.txt \\ -aliasBed=\${asmId}.chromAlias.bed \\ -aliasAs=\${asmId}.chromAlias.as \\ -aliasBigBed=\${asmId}.chromAlias.bb bigBedToBed -header \${asmId}.chromAlias.bb test.chromAlias.bed \$HOME/kent/src/hg/utils/automation/aliasBedToCt.pl \\ test.chromAlias.bed \${asmId}.chromAlias.bb . @@ -1076,34 +1074,34 @@ my $photoJpg = "noPhoto"; my $photoCredit = "noPhoto"; my $photoLink = ""; my $speciesNoBlank = $species; $speciesNoBlank =~ s/ /_/g; if ( -s "$runDir/../photo/$speciesNoBlank.jpg" ) { $photoJpg = "../photo/${speciesNoBlank}.jpg"; $photoCredit = "../photo/photoCredits.txt"; $photoLink = "rm -f ${speciesNoBlank}.jpg; ln -s ../photo/${speciesNoBlank}.jpg ." } else { printf STDERR "# gatewayPage: warning: no photograph available\n"; } $bossScript->add(<<_EOF_ export asmId=$asmId +export asmReport="$asmReport" \$HOME/kent/src/hg/utils/automation/asmHubGatewayPage.pl \\ - ../download/\${asmId}_assembly_report.txt \\ - ../\${asmId}.chrom.sizes \\ + \${asmReport} ../\${asmId}.chrom.sizes \\ $photoJpg $photoCredit \\ > \$asmId.description.html 2> \$asmId.names.tab \$HOME/kent/src/hg/utils/automation/genbank/buildStats.pl \\ ../\$asmId.chrom.sizes 2> \$asmId.build.stats.txt $photoLink _EOF_ ); $bossScript->execute(); } # gatewayPage ######################################################################### # * step: cytoBand [workhorse] sub doCytoBand { my $runDir = "$buildDir/trackData/cytoBand"; &HgAutomate::mustMkdir($runDir); @@ -2012,61 +2010,64 @@ my @partNames = split('_', $asmId); $ftpDir = sprintf("%s/%s/%s/%s/%s", $partNames[0], substr($partNames[1],0,3), substr($partNames[1],3,3), substr($partNames[1],6,3), $asmId); # Force debug and verbose until this is looking pretty solid: # $opt_debug = 1; # $opt_verbose = 3 if ($opt_verbose < 3); # Establish what directory we will work in. $buildDir = $opt_buildDir ? $opt_buildDir : "$HgAutomate::clusterData/asmHubs/refseqBuild/$ftpDir"; $sourceDir = $opt_sourceDir ? $opt_sourceDir : $sourceDir; $assemblySource = $opt_sourceDir ? "$opt_sourceDir" : "$sourceDir/$ftpDir"; -my $asmReport = "$assemblySource/${asmId}_assembly_report.txt"; +$asmReport = "$assemblySource/${asmId}_assembly_report.txt"; $species = $opt_species ? $opt_species : $species; if (length($species) < 1) { if (-s "$asmReport") { $species = `grep -i "organism name:" $asmReport`; chomp $species; $species =~ s/.*organism\s+name:\s+//i; $species =~ s/\s+\(.*//; } else { die "ERROR: no -species specified and can not find $asmReport"; } if (length($species) < 1) { die "ERROR: no -species specified and can not find Organism name: in $asmReport"; } } $dbName = $opt_dbName ? $opt_dbName : $dbName; $rmskSpecies = $opt_rmskSpecies ? $opt_rmskSpecies : $species; $augustusSpecies = $opt_augustusSpecies ? $opt_augustusSpecies : $augustusSpecies; $xenoRefSeq = $opt_xenoRefSeq ? $opt_xenoRefSeq : $xenoRefSeq; $ucscNames = $opt_ucscNames ? 1 : $ucscNames; # '1' == 'TRUE' $noAugustus = $opt_noAugustus ? 1 : $noAugustus; # '1' == 'TRUE' $noXenoRefSeq = $opt_noXenoRefSeq ? 1 : $noXenoRefSeq; # '1' == 'TRUE' $workhorse = $opt_workhorse ? $opt_workhorse : $workhorse; $bigClusterHub = $opt_bigClusterHub ? $opt_bigClusterHub : $bigClusterHub; $smallClusterHub = $opt_smallClusterHub ? $opt_smallClusterHub : $smallClusterHub; $fileServer = $opt_fileServer ? $opt_fileServer : $fileServer; $ncbiRmsk = $opt_ncbiRmsk ? 1 : 0; $noRmsk = $opt_noRmsk ? 1 : 0; +$defaultName = $asmId; +$defaultName = $dbName if (length($dbName)); + die "can not find assembly source directory\n$assemblySource" if ( ! -d $assemblySource); printf STDERR "# buildDir: %s\n", $buildDir; printf STDERR "# sourceDir %s\n", $sourceDir; if (length($dbName)) { printf STDERR "# dbName %s - building in /hive/data/genomes/%s\n", $dbName, $dbName; } printf STDERR "# augustusSpecies %s\n", $augustusSpecies; printf STDERR "# xenoRefSeq %s\n", $xenoRefSeq; printf STDERR "# assemblySource: %s\n", $assemblySource; printf STDERR "# rmskSpecies %s\n", $rmskSpecies; printf STDERR "# augustusSpecies %s\n", $augustusSpecies; printf STDERR "# ncbiRmsk %s\n", $ncbiRmsk ? "TRUE" : "FALSE"; printf STDERR "# ucscNames %s\n", $ucscNames ? "TRUE" : "FALSE"; printf STDERR "# noAugustus %s\n", $noAugustus ? "TRUE" : "FALSE";