0f9627d32b1a95d56c1078595924d387375e487d hiram Tue Sep 13 14:37:39 2022 -0700 the chromAlias needs to know about the NCBI asmId refs #29811 diff --git src/hg/utils/automation/asmHubChromAlias.pl src/hg/utils/automation/asmHubChromAlias.pl index 6fbdfba..ffa9187 100755 --- src/hg/utils/automation/asmHubChromAlias.pl +++ src/hg/utils/automation/asmHubChromAlias.pl @@ -1,43 +1,44 @@ #!/usr/bin/env perl use strict; use warnings; my $argc = scalar(@ARGV); -if ($argc != 1) { - printf STDERR "usage: asmHubChromAlias.pl asmId > asmId.chromAlias.tab\n\n"; +if ($argc != 2) { + printf STDERR "usage: asmHubChromAlias.pl asmId ncbiAsmId > asmId.chromAlias.tab\n\n"; printf STDERR "where asmId is something like: GCF_000001735.4_TAIR10.1\n"; printf STDERR "Outputs a tab file suitable for processing with ixIxx to\n"; printf STDERR "create an index file to use in an assembly hub.\n"; printf STDERR "This command assumes it is in a work directory in the\n"; printf STDERR "assembly hub: .../asmId/trackData/chromAlias/\n"; printf STDERR "and ../../download/ and ../../sequence/ have\n"; printf STDERR "been created and processed for the hub build.\n"; exit 255; } my %ucscToRefSeq; # key is UCSC sequence name, value is RefSeq name my %ucscToGenbank; # key is UCSC sequence name, value is GenBank name my $ucscNames = 0; # == 1 if sequence is UCSC names, == 0 if NCBI names my $dupCount = 0; my %dupToSequence; # key is duplicate name, value is target sequence name # to manage duplicates coming in from assembly report my $sequenceCount = 0; my %sequenceSizes; # key is sequence name, value is sequence size my $asmId = shift; +my $ncbiAsmId = shift; my %aliasOut; # key is source name, value is a hash pointer where # key is alias name and value is chrom name # typical source names: ucsc, refseq, genbank, ensembl, assembly, ncbi # the argument is the name of the 'native' source name # where 'native' are the names in the assembly itself, thus the # assembly names. The other sources are aliases to that. sub showAlias($) { my ($native) = @_; # second line is the column headers printf "# %s", $native; foreach my $source (sort keys %aliasOut) { printf "\t%s", $source if ($source ne $native); } @@ -123,31 +124,31 @@ # already done, verify it is equivalent to previous request if (defined($hashPtr->{$alias})) { if ($sequence ne $hashPtr->{$alias}) { printf STDERR "ERROR: additional alias '%s:%s' does not match previous '%s'\n", $alias, $sequence, $hashPtr->{$alias}; exit 255; } return; } $hashPtr->{$alias} = $sequence; return; } # asmSource - is this a genbank or refseq assembly my $asmSource = "genbank"; my $isRefSeq = 0; # == 0 for Genbank assembly, == 1 for RefSeq assembly -if ($asmId =~ m/^GCF/) { +if ($ncbiAsmId =~ m/^GCF/) { # printf STDERR "# processing a RefSeq assembly\n"; $isRefSeq = 1; $asmSource = "refseq"; } else { # printf STDERR "# processing a GenBank assembly\n"; } my $twoBit = "../../$asmId.2bit"; open (FH, "twoBitInfo $twoBit stdout|") or die "can not twoBitInfo $twoBit stdout"; while (my $line = ) { chomp $line; my ($name, $size) = split('\s+', $line); $sequenceSizes{$name} = $size; ++$sequenceCount;