d2ea6b76f1d69dafb40ebdb6348fb7fbcc5c6e49 hiram Fri Oct 8 15:12:35 2021 -0700 manage the names for single sequences in bacteria refs #28303 diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index 8c3e1ef..f8434ef 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -253,48 +253,50 @@ } return 1; } ######################################################################### # read chr2acc file, return name correspondence in given hash pointer sub readChr2Acc($$) { my ($chr2acc, $accToChr) = @_; open (FH, "<$chr2acc") or die "can not read $chr2acc"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $acc) = split('\t', $line); $chrN =~ s/ /_/g; # some assemblies have spaces in chr names ... $chrN =~ s/:/_/g; # one assembly GCF_002910315.2 had : in a chr name + $chrN = "chr" if ($chrN eq "na"); # seen in bacteria with only one chr $accToChr->{$acc} = $chrN; } close (FH); } ######################################################################### # process NCBI AGP file into UCSC naming scheme # the agpNames result file is a naming correspondence file for later use sub compositeAgp($$$$) { my ($chr2acc, $agpSource, $agpOutput, $agpNames) = @_; my %accToChr; readChr2Acc($chr2acc, \%accToChr); open (AGP, "|gzip -c >$agpOutput") or die "can not write to $agpOutput"; open (NAMES, "|sort -u >$agpNames") or die "can not write to $agpNames"; foreach my $acc (keys %accToChr) { my $chrN = $accToChr{$acc}; my $ncbiChr = "chr${chrN}"; + $ncbiChr = "chr" if ($chrN eq "chr"); # seen in bacteria with only 1 chr open (FH, "zcat $agpSource/${ncbiChr}.comp.agp.gz|") or die "can not read $ncbiChr.comp.agp.gz"; while (my $line = ) { if ($line =~ m/^#/) { print AGP $line; } else { $ncbiChr = "chrM" if ($ncbiChr eq "chrMT"); printf NAMES "%s\t%s\n", $ncbiChr, $acc; $line =~ s/^$acc/$ncbiChr/ if ($ucscNames); print AGP $line; } } close (FH); } close (AGP); close (NAMES);