199b2d4113d9498515030061888af66cbf7e8246 hiram Thu Mar 26 17:40:17 2020 -0700 adding some of the scripts that do the comparisons refs #24547 diff --git src/hg/makeDb/doc/assemblyEquivalence/exactTableTsv.pl src/hg/makeDb/doc/assemblyEquivalence/exactTableTsv.pl new file mode 100755 index 0000000..ad53d52 --- /dev/null +++ src/hg/makeDb/doc/assemblyEquivalence/exactTableTsv.pl @@ -0,0 +1,56 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +open (FH, "ls *.exact.txt|") or die "can not ls *.exact.txt"; +while (my $line = <FH>) { + chomp $line; + my ($sourceAuthority, $destinationAuthority, undef) = split('\.', $line, 3); + open (EX, "<$line") or die "can not read $line"; + while (my $equiv = <EX>) { + chomp $equiv; + my ($source, $sourceCount, undef, $destination, $destinationCount, undef) = + split('\s+', $equiv); + # in the case of these exact matches, the counts are all the same + if ($sourceCount != $destinationCount) { + printf "ERROR: exact matches expect source=destination counts: %d != %d\n", $sourceCount, $destinationCount; + exit 255; + } + printf "%s\t%s\t%s\t%s\t%d\t%d\t%d\n", $source, $destination, + $sourceAuthority, $destinationAuthority, $sourceCount, $sourceCount, $destinationCount; + } + close (EX); +} +close (FH); + +__END__ + +ensembl.genbank.exact.txt +Acanthochromis_polyacanthus.ASM210954v1 30414 30414 GCA_002109545.1_ASM210954v1 30414 30414 +ensembl.refseq.exact.txt +ensembl.ucscDb.exact.txt + +genbank.ensembl.exact.txt +genbank.refseq.exact.txt +genbank.ucscDb.exact.txt + +refseq.ensembl.exact.txt +refseq.genbank.exact.txt +refseq.ucscDb.exact.txt + +ucscDb.ensembl.exact.txt +ucscDb.genbank.exact.txt +ucscDb.refseq.exact.txt +table asmEquivalent +"Equivalence relationship of assembly versions, Ensembl: UCSC, NCBI genbank/refseq" + ( + string source; "assembly name" + string destination; "equivalent assembly name" + enum ("ensembl", "ucsc", "genbank", "refseq") sourceAuthority; "origin of source assembly" + enum ("ensembl", "ucsc", "genbank", "refseq") destinationAuthority; "origin of equivalent assembly" + uint matchCount; "number of exactly matching sequences" + uint sourceCount; "number of sequences in source assembly" + uint destinationCount; "number of sequences in equivalent assembly" + ) +