c270fe6ea9ae2701f1d41dc3b4180ac511dff187 hiram Mon Sep 21 14:55:35 2020 -0700 initial script works only on araTha refs #24396 diff --git src/hg/utils/automation/asmHubChromAlias.pl src/hg/utils/automation/asmHubChromAlias.pl new file mode 100755 index 0000000..9147569 --- /dev/null +++ src/hg/utils/automation/asmHubChromAlias.pl @@ -0,0 +1,161 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +my $argc = scalar(@ARGV); +if ($argc != 1) { + printf STDERR "usage: asmHubChromAlias.pl asmId > asmId.chromAlias.tab\n\n"; + printf STDERR "where asmId is something like: GCF_000001735.4_TAIR10.1\n"; + printf STDERR "Outputs a tab file suitable for processing with ixIxx to\n"; + printf STDERR "create an index file to use in an assembly hub.\n"; + printf STDERR "This command assumes it is in a work directory in the\n"; + printf STDERR "assembly hub: .../asmId/trackData/chromAlias/\n"; + printf STDERR "and .../asmId/downloads/ and .../asmId/sequence/ have\n"; + printf STDERR "been created and processed for the hub build.\n"; + exit 255; +} + +my %aliasOut; # key is alias name, value is sequence name in this assembly + +sub showAlias() { + my %chromIndex; # key is sequence name in assembly, value + # is a tab separated list of aliases + foreach my $alias (sort keys %aliasOut) { + my $name = $aliasOut{$alias}; +# printf "%s\t%s\n", $alias, $aliasOut{$alias}; + if (defined($chromIndex{$name})) { + $chromIndex{$name} .= "\t" . $alias; + } else { + $chromIndex{$name} = $alias; + } + } + foreach my $name (sort keys %chromIndex) { + printf "%s\t%s\n", $name, $chromIndex{$name}; + } +} + +# given an alias and a sequence name, add to result or verify identical +# to previous add +sub addAlias($$) { + my ($alias, $sequence) = @_; + # do not need to add the sequence name itself + return if ($alias eq $sequence); + # already done, verify it is equivalent to previous request + if (defined($aliasOut{$alias})) { + if ($sequence ne $aliasOut{$alias}) { + printf STDERR "ERROR: additional alias '%s:%s' does not match previous '%s'\n", $alias, $sequence, $aliasOut{$alias}; + exit 255; + } + return; + } + $aliasOut{$alias} = $sequence; + return; +} + +sub addVariations($$) { + my ($alias, $sequence) = @_; + addAlias($alias, $sequence); + return; + addAlias(lc($alias), $sequence); + my $noDotName = $alias; + $noDotName =~ s/\..*//; + addAlias($noDotName, $sequence); + addAlias(lc($noDotName), $sequence); + return; +} + +my $asmId = shift; + +my $twoBit = "../../$asmId.2bit"; + +my %sequenceNames; # key is sequence name, value is sequence size + +open (FH, "twoBitInfo $twoBit stdout|") or die "can not twoBitInfo $twoBit stdout"; +while (my $line = <FH>) { + chomp $line; + my ($name, $size) = split('\s+', $line); + $sequenceNames{$name} = $size; + addVariations($name, $name); +} + +close (FH); + +# foreach my $sequence (sort keys %sequenceNames) { +# printf "# %s\t%d\n", $sequence, $sequenceNames{$sequence}; +# } + +my %chrNames; # key is sequence name, value is 'chr' UCSC chromosome name +open (FH, "cat ../../sequence/*.names|") or die "can not cat ../../sequence/*.names"; +while (my $line = <FH>) { + chomp $line; + my ($chr, $name) = split('\s+', $line); + $chrNames{$name} = $chr; + addVariations($chr, $name); +} +close (FH); + +# foreach my $sequence (sort keys %chrNames) { +# printf "%s\t%s\n", $chrNames{$sequence}, $sequence; +# } + +my %chr2acc; # key is sequence name, value is NCBI chromosome name +open (FH, "grep -h -v '^#' ../../download/${asmId}_assembly_structure/*/*/chr2acc|") or die "can not gep chr2acc files"; +while (my $line = <FH>) { + chomp $line; + my ($chr, $name) = split('\s+', $line); + $chr2acc{$name} = $chr; + addVariations($chr, $name); +} +close (FH); + +# foreach my $sequence (sort keys %chr2acc) { +# printf "%s\t%s\n", $chr2acc{$sequence}, $sequence; +# } + +my %gbk2acc; # key is refseq name, value is genbank accession +open (FH, "grep -v '^#' ../../download/${asmId}_assembly_report.txt | cut -d\$'\t' -f5,7|") or die "can not grep assembly_report"; +while (my $line = <FH>) { + chomp $line; + my ($gbk, $name) = split('\s+', $line); + next if ($name eq "na"); # may not be any RefSeq names + $gbk2acc{$name} = $gbk; + addVariations($gbk, $name); +} +close (FH); + +showAlias(); +exit 0; + +foreach my $sequence (sort keys %gbk2acc) { + printf "%s\t%s\n", $gbk2acc{$sequence}, $sequence; +} + +my %seq2gbk; # key is genbank name, value is sequence name +open (FH, "grep -v '^#' ../../download/${asmId}_assembly_report.txt | cut -d\$'\t' -f1,5|") or die "can not grep assembly_report"; +while (my $line = <FH>) { + chomp $line; + my ($seq, $name) = split('\s+', $line); + next if ($name eq "na"); # may not be any genbank name + $seq2gbk{$name} = $seq; +} +close (FH); + +foreach my $sequence (sort keys %seq2gbk) { + printf "%s\t%s\n", $seq2gbk{$sequence}, $sequence; +} + +__END__ + +export asmId="GCF_000001735.4_TAIR10.1" + +grep -v "^#" ../../download/${asmId}_assembly_report.txt \ + | cut -d$'\t' -f1,5 + +grep -v "^#" ../../download/${asmId}_assembly_report.txt \ + | cut -d$'\t' -f1,7 + +grep -v "^#" ../../download/${asmId}_assembly_report.txt \ + | cut -d$'\t' -f5,7 + +2 assembled-molecule 2 Chromosome CP002685.1 = NC_003071.7 Primary Assembly 19698289 na