b2b24fc1972e32973020518288369e2582a7ffcf hiram Wed Sep 30 12:39:12 2020 -0700 fixup errors in building a hub without UCSC names refs #24386 diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index 0e93e6c..2a6f7b4 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -358,34 +358,39 @@ my $chrN = $accToChr{$acc}; next if (exists($chrNDone{$chrN})); $chrNDone{$chrN} = 1; my $agpFile = "$agpSource/chr$chrN.unlocalized.scaf.agp.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; while (my $line = <FH>) { if ($line !~ m/^#/) { chomp $line; my (@a) = split('\t', $line); my $ncbiAcc = $a[0]; next if (exists($dupAccessionList{$ncbiAcc})); my $acc = $ncbiAcc; $acc =~ s/\./v/ if ($ucscNames); die "ERROR: chrN $chrN not correct for $acc" if ($accToChr{$acc} ne $chrN); - my $ucscName = "${acc}"; - $ucscName = "chr${chrN}_${acc}_random" if ($ucscNames); + my $ucscName = "$ncbiAcc"; + $ucscName = "chr${chrN}_${acc}_random"; + $ucscName =~ s/\./v/; printf NAMES "%s\t%s\n", $ucscName, $ncbiAcc; + if ($ucscNames) { printf AGP "%s", $ucscName; # begin AGP line with accession name + } else { + printf AGP "%s", $ncbiAcc; # begin AGP line with accession name + } for (my $i = 1; $i < scalar(@a); ++$i) { # the rest of the AGP line printf AGP "\t%s", $a[$i]; } printf AGP "\n"; } } close (FH); } close (AGP); close (NAMES); } # sub unlocalizedAgp($$$$) ######################################################################### # process unlocalized NCBI fasta sequence, use UCSC chrom names if requested sub unlocalizedFasta($$$) { @@ -547,37 +552,39 @@ # process NCBI unplaced AGP file, perhaps translate into UCSC naming scheme # the agpNames result file is a naming correspondence file for later use # optional chrPrefix can be "chrUn_" for assemblies that have other chr names sub unplacedAgp($$$$) { my ($agpFile, $agpOutput, $agpNames, $chrPrefix) = @_; open (AGP, "|gzip -c >$agpOutput") or die "can not write to $agpOutput"; open (NAMES, "|sort -u >$agpNames") or die "can not write to $agpNames"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; while (my $line = <FH>) { if ($line =~ m/^#/) { print AGP $line; } else { my ($ncbiAcc, undef) = split('\s+', $line, 2); next if (exists($dupAccessionList{$ncbiAcc})); - if ($ucscNames) { my $ucscAcc = $ncbiAcc; $ucscAcc =~ s/\./v/; printf NAMES "%s%s\t%s\n", $chrPrefix, $ucscAcc, $ncbiAcc; + if ($ucscNames) { $line =~ s/\./v/; - } printf AGP "%s%s", $chrPrefix, $line; + } else { + printf AGP "%s", $line; + } } } close (FH); close (NAMES); close (AGP); } # sub unplacedAgp($$$$) ######################################################################### # process NCBI unplaced FASTA file, perhaps translate into UCSC naming scheme # optional chrPrefix can be "chrUn_" for assemblies that have other chr names sub unplacedFasta($$$$) { my ($agpFile, $twoBitFile, $chrPrefix, $fastaOut) = @_; my %contigName; # key is NCBI contig name, value is UCSC contig name open (FH, "zcat $agpFile|") or die "can not read $agpFile"; @@ -595,32 +602,36 @@ foreach my $ctg (sort keys %contigName) { if (! exists($dupAccessionList{$ctg})) { printf $fh "%s\n", $ctg; } } close $fh; open (FA, "|gzip -c > $fastaOut") or die "can not write to $fastaOut"; open (FH, "twoBitToFa -seqList=$tmpFile $twoBitFile stdout|") or die "can not twoBitToFa $twoBitFile"; while (my $line = <FH>) { if ($line =~ m/^>/) { chomp $line; $line =~ s/^>//; $line =~ s/ .*//; die "ERROR: twoBitToFa $twoBitFile returns unknown acc $line" if (! exists($contigName{$line})); + if ($ucscNames) { printf FA ">%s%s\n", $chrPrefix, $contigName{$line}; } else { + printf FA ">%s\n", $contigName{$line}; + } + } else { print FA $line; } } close(FH); close(FA); } # sub unplacedFasta($$$$) ######################################################################### ######################################################################### # do Steps section ######################################################################### ######################################################################### # * step: download [workhorse] sub doDownload { my $runDir = "$buildDir/download"; @@ -731,31 +742,31 @@ ########### alternate sequences ############## my $assemblyStructure = "$buildDir/download/${asmId}_assembly_structure"; my $altCount = `find -L "${assemblyStructure}" -type f | grep "/alt_scaffolds/alt_scaffold_placement.txt" | wc -l`; chomp $altCount; if ($altCount > 0) { $partsDone += 1; ++$otherChrParts; altSequences($assemblyStructure); } ########### unplaced sequence ################ my $unplacedScafAgp = "$primaryAssembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; if ( -s $unplacedScafAgp ) { my $chrPrefix = ""; # no prefix if no other chrom parts - $chrPrefix = "chrUn_" if ($otherChrParts); + $chrPrefix = "chrUn_" if ($otherChrParts && ! $ucscNames); my $agpOutput = "$runDir/$asmId.unplaced.agp.gz"; my $agpNames = "$runDir/$asmId.unplaced.names"; $partsDone += 1; if (needsUpdate($unplacedScafAgp, $agpOutput)) { unplacedAgp($unplacedScafAgp, $agpOutput, $agpNames, $chrPrefix); `touch -r $unplacedScafAgp $agpOutput`; } my $fastaOut = "$runDir/$asmId.unplaced.fa.gz"; if (needsUpdate($twoBitFile, $fastaOut)) { unplacedFasta($unplacedScafAgp, $twoBitFile, $chrPrefix, $fastaOut); `touch -r $twoBitFile $fastaOut`; } } @@ -1791,30 +1802,36 @@ $bossScript->add(<<_EOF_ printf "to be done\\n" 1>&2 _EOF_ ); $bossScript->execute(); } # doCleanup ######################################################################### # main # Prevent "Suspended (tty input)" hanging: &HgAutomate::closeStdin(); # Make sure we have valid options and exactly 1 argument: &checkOptions(); +if (scalar(@ARGV) != 1) { + printf STDERR "ERROR: can not find 1 argument in ARGV, instead: %d\n", scalar(@ARGV); + for (my $i = 0; $i < scalar(@ARGV); ++$i) { + printf "# ARGV[%d] : '%s'\n", $i, $ARGV[$i]; + } +} &usage(1) if (scalar(@ARGV) != 1); $secondsStart = `date "+%s"`; chomp $secondsStart; # expected command line arguments after options are processed ($asmId) = @ARGV; # yes, there can be more than two fields separated by _ # but in this case, we only care about the first two: # GC[AF]_123456789.3_assembly_Name # 0 1 2 3 .... my @partNames = split('_', $asmId); $ftpDir = sprintf("%s/%s/%s/%s/%s", $partNames[0], substr($partNames[1],0,3), substr($partNames[1],3,3), substr($partNames[1],6,3), $asmId); @@ -1854,30 +1871,32 @@ $bigClusterHub = $opt_bigClusterHub ? $opt_bigClusterHub : $bigClusterHub; $smallClusterHub = $opt_smallClusterHub ? $opt_smallClusterHub : $smallClusterHub; $fileServer = $opt_fileServer ? $opt_fileServer : $fileServer; $asmHubName = $opt_asmHubName ? $opt_asmHubName : $asmHubName; $ncbiRmsk = $opt_ncbiRmsk ? 1 : 0; die "can not find assembly source directory\n$assemblySource" if ( ! -d $assemblySource); printf STDERR "# buildDir: %s\n", $buildDir; printf STDERR "# sourceDir %s\n", $sourceDir; printf STDERR "# augustusSpecies %s\n", $augustusSpecies; printf STDERR "# xenoRefSeq %s\n", $xenoRefSeq; printf STDERR "# assemblySource: %s\n", $assemblySource; printf STDERR "# asmHubName %s\n", $asmHubName; printf STDERR "# rmskSpecies %s\n", $rmskSpecies; printf STDERR "# augustusSpecies %s\n", $augustusSpecies; +printf STDERR "# ncbiRmsk %s\n", $ncbiRmsk ? "TRUE" : "FALSE"; +printf STDERR "# ucscNames %s\n", $ucscNames ? "TRUE" : "FALSE"; # Do everything. $stepper->execute(); # Tell the user anything they should know. my $stopStep = $stepper->getStopStep(); my $upThrough = ($stopStep eq 'cleanup') ? "" : " (through the '$stopStep' step)"; $secondsEnd = `date "+%s"`; chomp $secondsEnd; my $elapsedSeconds = $secondsEnd - $secondsStart; my $elapsedMinutes = int($elapsedSeconds/60); $elapsedSeconds -= $elapsedMinutes * 60;