b9032792e3f2d0e704ecaf56b1fac45457acaa37 hiram Fri Feb 7 14:54:23 2020 -0800 eliminate : in chr names and correctly reading dup list where needed refs #23891 diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index 0b27f9a..cdd137e 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -232,30 +232,31 @@ } } return 1; } ######################################################################### # read chr2acc file, return name correspondence in given hash pointer sub readChr2Acc($$) { my ($chr2acc, $accToChr) = @_; open (FH, "<$chr2acc") or die "can not read $chr2acc"; while (my $line = <FH>) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $acc) = split('\t', $line); $chrN =~ s/ /_/g; # some assemblies have spaces in chr names ... + $chrN =~ s/:/_/g; # one assembly GCF_002910315.2 had : in a chr name $accToChr->{$acc} = $chrN; } close (FH); } ######################################################################### # process NCBI AGP file into UCSC naming scheme # the agpNames result file is a naming correspondence file for later use sub compositeAgp($$$$) { my ($chr2acc, $agpSource, $agpOutput, $agpNames) = @_; my %accToChr; readChr2Acc($chr2acc, \%accToChr); open (AGP, "|gzip -c >$agpOutput") or die "can not write to $agpOutput"; open (NAMES, "|sort -u >$agpNames") or die "can not write to $agpNames"; @@ -648,50 +649,50 @@ faSomeRecords -exclude \${asmId}_genomic.fna.dups.gz \\ \$asmId.remove.dups.list stdout | gzip -c > \${asmId}_genomic.fna.gz rm -f \$asmId.2bit faToTwoBit \${asmId}_genomic.fna.gz \$asmId.2bit fi gzip -f \$asmId.dups.txt touch -r \${asmId}_genomic.fna.gz \$asmId.2bit else printf "# download step previously completed\\n" 1>&2 exit 0 fi _EOF_ ); $bossScript->execute(); - readDupsList(); - } # doDownload ######################################################################### # * step: sequence [workhorse] sub doSequence { my $runDir = "$buildDir/sequence"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "process source files into 2bit sequence and agp"; my $bossScript = newBash HgRemoteScript("$runDir/doSequence.bash", $workhorse, $runDir, $whatItDoes); my $twoBitFile = "$buildDir/download/$asmId.2bit"; my $otherChrParts = 0; # to see if this is unplaced scaffolds only my $primaryAssembly = "$buildDir/download/${asmId}_assembly_structure/Primary_Assembly"; my $partsDone = 0; + readDupsList(); + ########### Assembled chromosomes ################ my $chr2acc = "$primaryAssembly/assembled_chromosomes/chr2acc"; if ( -s $chr2acc ) { ++$otherChrParts; my $agpSource = "$primaryAssembly/assembled_chromosomes/AGP"; my $agpOutput = "$runDir/$asmId.chr.agp.gz"; my $agpNames = "$runDir/$asmId.chr.names"; my $fastaOut = "$runDir/$asmId.chr.fa.gz"; $partsDone += 1; if (needsUpdate($chr2acc, $agpOutput)) { compositeAgp($chr2acc, $agpSource, $agpOutput, $agpNames); `touch -r $chr2acc $agpOutput`; } if (needsUpdate($twoBitFile, $fastaOut)) { compositeFasta($chr2acc, $twoBitFile, $fastaOut);