9909807c4830a971e6493793a4d48b3ff42e3a1e hiram Fri Dec 8 07:39:14 2023 -0800 do not make any alias for UCSC db names refs #28930 diff --git src/hg/gar/mkAlias.pl src/hg/gar/mkAlias.pl index d230944..26f1d44 100755 --- src/hg/gar/mkAlias.pl +++ src/hg/gar/mkAlias.pl @@ -1,205 +1,208 @@ #!/usr/bin/env perl use strict; use warnings; my %alreadyDone; # a record of lc(names) already used, can not have # duplicate alias names pointing to different destinations # value is lc(destination) my %ambiguous; # alias names that match to more than one target, don't use # them. key is lc(alias) value is destination my %goodToGo; # key is lc(alias), value is aliasdestination neither lc() # any ambiguous ones have been removed if so discovered later my %protectedAliases; # key is the lc(alias) name, value is their destination # if any other aliases come in after they are in this list, # don't use them or knock anything out if they are ambiguous # these aliases are pointing to database browser instances # on the RR, they take precedence over any other relationships +## establish a list of active database browsers +my %rrActive; # key is db name from hgcentral where active=1, value is 1 +my %rrCiActive; # key is lc(db) name for case insensitive matches sub outOne($$) { my ($alias, $destination) = @_; my $lcAlias = lc($alias); my $lcDest = lc($destination); return if ($lcAlias eq $lcDest); # no identity functions return if (defined($ambiguous{$lcAlias})); # ignore ambiguous aliases return if (defined($ambiguous{$lcDest})); # ignore ambiguous aliases - if (defined($protectedAliases{lc($alias)})) { + return if (defined($rrCiActive{$lcAlias})); # no aliases for UCSC db names + if (defined($protectedAliases{$lcAlias})) { ## warning if ambiguous if ($protectedAliases{lc($alias)} ne lc($destination)) { printf STDERR "# WARNING: protected and ambiguous ? %s -> %s %s\n", $alias, $destination, $protectedAliases{lc($alias)}; } return; } if (defined($alreadyDone{$lcAlias})) { # check for ambiguous dups if ($alreadyDone{$lcAlias} ne $lcDest) { $ambiguous{$lcAlias} = $destination; $ambiguous{$lcDest} = $alias; printf STDERR "# ambiguous %s %s %s\n", $alias, $destination, $alreadyDone{$lcAlias}; delete($goodToGo{$lcDest}) if (defined($goodToGo{$lcDest})); delete($goodToGo{$lcAlias}) if (defined($goodToGo{$lcAlias})); } # else it is not ambiguous, only an exact duplicate } else { # this is a new one $alreadyDone{$lcAlias} = $lcDest; $alreadyDone{$lcDest} = $lcAlias; # no circular definitions $goodToGo{$lcAlias} = sprintf("%s\t%s", $alias, $destination); } } # just for verficatiuon, see if any ambiguous names come in here # there should not be anything like that sub priorityAmbiguous($$) { my ($id, $db) = @_; if ($protectedAliases{lc($id)} ne lc($db)) { printf STDERR "# ERROR: ambiguous protectedAlias: %s -> %s != %s\n", $id, $db, $protectedAliases{lc($id)}; exit 255; } return; } # special relationships for database browsers on the RR, they take # precedence over all other relationships, check for ambiguous confusion sub dbPriority($$) { my ($asmId, $db) = @_; if (defined($protectedAliases{lc($asmId)})) { priorityAmbiguous($asmId, $db); } outOne($asmId, $db); $protectedAliases{lc($asmId)} = lc($db); my @a = split('_', $asmId, 3); my $accession = "$a[0]_$a[1]"; if (defined($protectedAliases{lc($accession)})) { priorityAmbiguous($accession, $db); } outOne($accession, $db); $protectedAliases{lc($accession)} = lc($db); if (defined($protectedAliases{lc($a[2])})) { priorityAmbiguous($a[2], $db); } outOne($a[2], $db); $protectedAliases{lc($a[2])} = lc($db); } -## establish a list of active database browsers -my %rrActive; # key is db name from hgcentral where active=1, value is 1 open (FH, "hgsql -N -hgenome-centdb -e 'select name from dbDb where active=1;' hgcentral|") or die "can not hgsql select from dbDb.hgcentral"; while (my $dbName = ) { chomp $dbName; $rrActive{$dbName} = 1; + $rrCiActive{lc($dbName)} = 1; } close (FH); printf STDERR "# first preference, specific high priority database assemblies\n"; # first preference are the high priority database assemblies # these should override GenArk hubs or any other relations, these are # manually verified dbPriority("GCA_009914755.4_T2T-CHM13v2.0", "hs1"); dbPriority("GCF_009914755.1_T2T-CHM13v2.0", "hs1"); dbPriority("GCF_000001405.40_GRCh38.p14", "hg38"); dbPriority("GCA_000001405.29_GRCh38.p14", "hg38"); dbPriority("GCA_000001405.14_GRCh37.p13", "hg19"); dbPriority("GCF_000001405.25_GRCh37.p13", "hg19"); dbPriority("GCA_000001635.1_MGSCv37", "mm9"); dbPriority("GCF_000001635.18_MGSCv37", "mm9"); dbPriority("GCA_000001635.8_GRCm38.p6", "mm10"); dbPriority("GCF_000001635.26_GRCm38.p6", "mm10"); dbPriority("GCA_000001635.9_GRCm39", "mm39"); dbPriority("GCF_000001635.27_GRCm39", "mm39"); # second priority, see if we can add more aliases for equivalent # active browsers on the RR printf STDERR "# second priority, any NCBI name perfect match to active RR database\n"; open (FH, "hgsql -N -e 'SELECT source,destination FROM asmEquivalent WHERE ((sourceAuthority=\"refseq\" OR sourceAuthority=\"genbank\") AND destinationAuthority=\"ucsc\") AND ABS(matchCount-sourceCount)<1;' hgFixed|") or die "can not SELECT from asmEquivalent"; while (my $line=) { chomp $line; my ($asmId, $db) = split('\t', $line); if (defined($rrActive{$db})) { dbPriority($asmId, $db); } # printf STDERR "# '%s'\t'%s'\n", $asmId, $db; } close (FH); # third priority, any NCBI name with a match count difference of 1 matching # any active RR database, but do NOT override or conflict with any of the # previously established relationships. printf STDERR "# third priority, any NCBI name with a close match count\n"; open (FH, "hgsql -N -e 'SELECT source,destination FROM asmEquivalent WHERE ((sourceAuthority=\"refseq\" OR sourceAuthority=\"genbank\") AND destinationAuthority=\"ucsc\") AND ABS(matchCount-sourceCount)=1;' hgFixed|") or die "can not SELECT from asmEquivalent"; while (my $line=) { chomp $line; my ($asmId, $db) = split('\t', $line); next if (defined($protectedAliases{lc($asmId)})); if (defined($rrActive{$db})) { printf STDERR "# secondary %s -> %s\n", $asmId, $db; dbPriority($asmId, $db); } } close (FH); printf STDERR "# fourth GenArk hubs\n"; # fourth preference will be GenArk hubs open (FH, "grep -v '^#' UCSC_GI.assemblyHubList.txt|grep -v GCA_009914755.4|") or die "can not read UCSC_GI.assemblyHubList.txt"; while (my $line = ) { chomp $line; my ($accession, $asmName, $rest) = split('\t', $line, 3); my $asmId = "${accession}_${asmName}"; outOne($accession, $accession); outOne($asmName, $accession); outOne($asmId, $accession); } close (FH); printf STDERR "# fifth equivalent genbank assemblies\n"; # fourth set, equivalent genbank to refseq only one difference and # destination is an assembly hub # extract ordered list of asmId from GenArk listing: `grep -v "^#" UCSC_GI.assemblyHubList.txt | \ awk -F\$'\\t' '{printf "%s_%s\\n", \$1,\$2}' | sort -u > genArk.asmId.list`; # and extract genbank to refseq equivalents with only one difference: # join with genArk list to select only those for a destination: `hgsql -N -e 'SELECT source,destination FROM asmEquivalent WHERE (sourceAuthority="genbank" AND destinationAuthority="refseq") AND ABS(matchCount-sourceCount)<2;' hgFixed | sort -k2 > asmEquiv.gbk.rs.list`; open (FH, "join -t\$'\\t' -1 2 asmEquiv.gbk.rs.list genArk.asmId.list | awk '{printf \"%s\\t%s\\n\", \$2, \$1}'|") or die "can not join asmEquiv.gbk.rs.list genArk.asmId.list"; while (my $line = ) { chomp $line; my ($src, $dest) = split('\t', $line); my @d = split('_', $dest, 3); my $genArk = "$d[0]_$d[1]"; my @a = split('_', $src, 3); my $accession = "$a[0]_$a[1]"; outOne($accession, $genArk); outOne($src, $genArk); } close (FH); if ( 0 == 1 ) { printf STDERR "# fifth no equivalents\n"; # fifth set are those we have no equivalent, can be requested open (FH, "awk -v FS=\$'\t' -v OFS=\$'\t' '\$2 = \"request\"' primates.tableData.txt mammals.tableData.txt birds.tableData.txt fish.tableData.txt fungi.tableData.txt invertebrate.tableData.txt plants.tableData.txt vertebrate.tableData.txt|") or die "can not read *.tableData.txt"; while (my $line = ){ my (undef, undef, $comName, $sciName, $asmId, $rest) = split('\t', $line, 6); my ($gcX, $id, $name) = split('_', $asmId, 3); my $accession = "${gcX}_$id"; outOne($accession, "n/a"); outOne($asmId, "n/a"); outOne($name, "n/a"); } close (FH); } # all done, output the table foreach my $lcAlias(sort keys %goodToGo) { printf "%s\n", $goodToGo{$lcAlias}; }