5909d2770c119549fd088df7eb210679e6b93b27
hiram
  Wed Nov 9 15:11:08 2022 -0800
update to Ensembl v107 plus rapidRelease no redmine

diff --git src/hg/makeDb/doc/assemblyEquivalence/update.2022-10-18.txt src/hg/makeDb/doc/assemblyEquivalence/update.2022-10-18.txt
new file mode 100644
index 0000000..ffa52b5
--- /dev/null
+++ src/hg/makeDb/doc/assemblyEquivalence/update.2022-10-18.txt
@@ -0,0 +1,446 @@
+
+idKeys for Ensembl assemblies were
+calculated in:  /hive/data/outside/ensembl/genomes/release-107/idKeys/keys/
+
+and Ensembl rapidRelease in:
+/hive/data/outside/ensembl/genomes/rapidRelease/idKeys/GCA/123/456/789/GCA_123456789.1.idKeys.txt
+
+New assemblies have been built for genbano, refseq and ucsc
+*AND* now including all virus and bacteria assemblies in both
+genbank and refseq.  Should see a large jump in the size of
+the resulting table.
+
+Working in:
+mkdir /hive/data/inside/assemblyEquivalence/2022-10-18
+cd /hive/data/inside/assemblyEquivalence/2022-10-18
+ln -s /cluster/home/hiram/kent/src/hg/makeDb/doc/assemblyEquivalence/exact.sh .
+
+mkdir ensembl refseq genbank ucsc rapidRelease
+
+### XXX ### The refseq and genbank key setup takes a very long time
+###         since it now includes virus and bacteria
+
+###########  refseq keys  ##################
+cd refseq
+
+find /hive/data/genomes/asmHubs/refseqBuild/GCF \
+    -maxdepth 4 -mindepth 4 -type d | while read buildDir
+do
+  asmId=`basename "${buildDir}"`
+  keySig="${buildDir}/idKeys/${asmId}.keySignature.txt"
+  idKeysDir=`dirname "${keySig}"`
+  if [ -s "${keySig}" ]; then
+     idKeys="$idKeysDir/$asmId.idKeys.txt"
+     fullCount=`cat $idKeys | wc -l`
+     uniqueCount=`cut -f1 $idKeys | sort -u | wc -l`
+     keySig=`cat "${keySig}"`
+  printf "%s\t%s\t%d\t%d\n" "${keySig}" "${asmId}" "${fullCount}" "${uniqueCount}"
+  fi
+done | sort -k1,1 > refseq.keySignatures.txt
+
+    #  real    115m40.888s
+
+###########  genbank keys  ##################
+cd ../genbank
+
+find /hive/data/genomes/asmHubs/genbankBuild/GCA \
+    -maxdepth 4 -mindepth 4 -type d | while read buildDir
+do
+  asmId=`basename "${buildDir}"`
+  keySig="${buildDir}/idKeys/${asmId}.keySignature.txt"
+  idKeysDir=`dirname "${keySig}"`
+  if [ -s "${keySig}" ]; then
+     idKeys="$idKeysDir/$asmId.idKeys.txt"
+     fullCount=`cat $idKeys | wc -l`
+     uniqueCount=`cut -f1 $idKeys | sort -u | wc -l`
+     keySig=`cat "${keySig}"`
+ printf "%s\t%s\t%d\t%d\n" "${keySig}" "${asmId}" "${fullCount}" "${uniqueCount}"
+  fi
+done | sort -k1,1 > genbank.keySignatures.txt
+
+###########  UCSC keys  ##################
+cd ../ucsc
+
+ls -d /hive/data/genomes/* | while read dbDir
+do
+  db=`basename "${dbDir}"`
+  keySig="${dbDir}/bed/idKeys/${db}.keySignature.txt"
+  idKeysDir=`dirname "${keySig}"`
+  if [ -s "${keySig}" ]; then
+     idKeys="$idKeysDir/$db.idKeys.txt"
+     fullCount=`cat $idKeys | wc -l`
+     uniqueCount=`cut -f1 $idKeys | sort -u | wc -l`
+     keySig=`cat "${keySig}"`
+   printf "%s\t%s\t%d\t%d\n" "${keySig}" "${db}" "${fullCount}" "${uniqueCount}"
+  fi
+done | sort -k1,1 > ucsc.keySignatures.txt
+
+    # real    0m41.590s
+
+###########  Ensembl Rapid Release keys  ##################
+
+cd ../rapidRelease
+find /hive/data/outside/ensembl/genomes/rapidRelease/idKeys/GCA \
+    -maxdepth 4 -mindepth 4 -type f | grep keySignature.txt | while read keySig
+do
+  asmId=`basename "${keySig}" | sed -e 's/.keySignature.txt//;'`
+  idKeys=`echo "${keySig}" | sed -e 's/keySignature/idKeys/;'`
+  if [ -s "${keySig}" ]; then
+     fullCount=`grep -c . $idKeys`
+     uniqueCount=`cut -f1 $idKeys | sort -u | wc -l`
+     keySig=`cat "${keySig}"`
+ printf "%s\t%s\t%d\t%d\n" "${keySig}" "${asmId}" "${fullCount}" "${uniqueCount}"
+  fi
+done | sort -k1,1 > rapidRelease.keySignatures.txt
+
+    #	real    0m36.132s
+
+
+###########  Ensembl keys  ##################
+cd ../ensembl
+
+# does not always work, use the wget instead
+# rsync -av --stats \
+# rsync://ftp.ensembl.org/ensembl/pub/release-107/species_EnsemblVertebrates.txt \
+#    ./
+
+wget --timestamping  \
+ftp://ftp.ensembl.org/pub/release-107/species_EnsemblVertebrates.txt
+
+# extract the GCx assembly names:
+
+awk -F$'\t' '{printf "%s_%s\n", $6,$5}' species_EnsemblVertebrates.txt \
+   | grep "^G" | sort > ensembl.v107.asmId.txt
+
+# compare this to the ones we have on hand here
+
+cut -f2 ../genbank/genbank.keySignatures.txt  | sort > genbank.asmId.here.txt
+
+wc -l ensembl.v107.asmId.txt genbank.asmId.here.txt | sed -e 's/^/# /;'
+#      301 ensembl.v107.asmId.txt
+#  1441571 genbank.asmId.here.txt
+
+comm -12 ensembl.v107.asmId.txt genbank.asmId.here.txt | wc -l
+#	294
+
+# will need to catch up the assembly hubs for these missing 8:
+comm -23 ensembl.v107.asmId.txt genbank.asmId.here.txt
+GCA_000001215.4_BDGP6.32
+GCA_000004035.1_Meug_1.0
+GCA_000090745.2_AnoCar2.0v2
+GCA_000224145.1_KH
+GCA_000409795.2_ChlSab1.1
+GCA_008746955.1_ASM874695v1
+GCA_900186095.1_CHOK1GS_HDv1
+
+# checking the listings, it isn't clear all these are needed, appears to
+# be outdated versions at Ensembl, and perhaps only two new ones for
+# GCA_002776525.2_ASM277652v2 - assembly status: replaced GCA_002776525.3_ASM277652v3
+# GCA_008746955.1_ASM874695v1 -> GCA_008746955.1_CAU_Wild1.0
+
+# add a new find for this current one
+
+for ensVer in 107 105 104 103 101 100 99
+do
+  printf "# running version %s\n" "${ensVer}" 1>&2
+  find -L /hive/data/outside/ensembl/genomes/release-${ensVer}/idKeys -type f \
+    | grep keySignature.txt | while read K
+  do
+    idKeysDir=`dirname "${K}"`
+    id=`basename "${K}" | sed -e 's/.keySignature.txt//;'`
+    idKeys="$idKeysDir/$id.idKeys.txt"
+    fullCount=`cat $idKeys | wc -l`
+    uniqueCount=`cut -f1 $idKeys | sort -u | wc -l`
+    keySig=`cat "${K}"`
+   printf "%s\t%s\t%d\t%d\n" "${keySig}" "${id}" "${fullCount}" "${uniqueCount}"
+  done | sort -k1,1 > ensembl-${ensVer}.keySignatures.txt
+done
+
+    #	real    2m19.537s
+
+# put them all together:
+
+sort -u ensembl-10?.keySignatures.txt \
+   ensembl-99.keySignatures.txt > ensembl.keySignatures.txt
+
+# something is odd with five of these.  Their names changed between
+# versions but the sum stayed the same.  Can't have this.
+# Find them via
+cut -f1 ensembl.keySignatures.txt | sort | uniq -c | sort -rn | head
+
+      2 7850e2d5dabb6134fdc9d7083f1a3a54
+      2 2f66b2c181e8de7668a9d3657666aa9b
+      2 2e374245cf85b32eddf186cf50f28f82
+      2 298de0770996a131c6f90a49ebebebb8
+      2 0eec31acf1f1120af29d62874cdd9952
+      2 009e19516d917fa9cdae10de74e8c3de
+
+egrep -h "7850e2d5dabb6134fdc9d7083f1a3a54|2f66b2c181e8de7668a9d3657666aa9b|2e374245cf85b32eddf186cf50f28f82|298de0770996a131c6f90a49ebebebb8|0eec31acf1f1120af29d62874cdd9952|009e19516d917fa9cdae10de74e8c3de" *.txt | cut -f2 | sort | uniq -c
+      5 Anolis_carolinensis.AnoCar2.0
+      4 Anolis_carolinensis.AnoCar2.0v2
+      3 Canis_familiaris.CanFam3.1
+      5 Canis_lupus_familiaris.CanFam3.1
+      3 Cyprinus_carpio_german_mirror.German_Mirror_carp_1.0
+      8 Cyprinus_carpio_germanmirror.German_Mirror_carp_1.0
+      3 Cyprinus_carpio_hebao_red.Hebao_red_carp_1.0
+      8 Cyprinus_carpio_hebaored.Hebao_red_carp_1.0
+      4 Drosophila_melanogaster.BDGP6.28
+      5 Drosophila_melanogaster.BDGP6.32
+      7 Gallus_gallus.GRCg6a
+      2 Gallus_gallus_gca000002315v5.GRCg6a
+
+    # Eliminate the older names:
+     mv ensembl.keySignatures.txt ensembl.keySignatures.txt.0 
+
+sort -u ensembl-10?.keySignatures.txt \
+  ensembl-99.keySignatures.txt \
+    | egrep -v -w "Anolis_carolinensis.AnoCar2.0|Canis_familiaris.CanFam3.1|Cyprinus_carpio_germanmirror.German_Mirror_carp_1.0|Cyprinus_carpio_hebaored.Hebao_red_carp_1.0|Drosophila_melanogaster.BDGP6.28|Gallus_gallus.GRCg6a" \
+      > ensembl.keySignatures.txt
+
+    # now it should be clean
+cut -f1 ensembl.keySignatures.txt | sort | uniq -c | sort -rn | head
+    # only count of 1 seen
+
+###########  Exact matches  ##################
+
+cd /hive/data/inside/assemblyEquivalence/2022-10-18
+./exact.sh
+
+~/kent/src/hg/makeDb/doc/assemblyEquivalence/exactTableTsv.pl \
+  | sort > table.2022-10-18.tsv
+wc -l table.*
+  317948 table.2022-10-18.tsv
+  363928 table.2022-02-07.tsv
+
+###########  Near matches  ##################
+
+mkdir /hive/data/inside/assemblyEquivalence/2022-10-18/notExact
+cd /hive/data/inside/assemblyEquivalence/2022-10-18/notExact
+
+sed -e 's/release-99/release-107/;' \
+  /cluster/home/hiram/kent/src/hg/makeDb/doc/assemblyEquivalence/runOne > runOne
+chmod +x runOne
+
+### XXX ### this isn't going to work with the big jump in number of
+### genbank/refseq assemblies
+
+time (~/kent/src/hg/makeDb/doc/assemblyEquivalence/allByAll.sh) > all.log 2>&1
+# real    107m11.347s
+
+sed -e 's/\tcount A .*//;' results/match.*.txt > notExact.table.2022-10-18.tsv
+
+ wc -l notExact.tab*
+  1258 notExact.table.2022-10-18.tsv
+  1239 notExact.table.2022-01-26.tsv
+  1738 notExact.table.2022-02-07.tsv
+
+###########  Table contents to load  ##################
+
+sort -u notExact.table.2022-10-18.tsv ../table.2022-10-18.tsv \
+    > hgFixed.asmEquivalent.tsv
+
+### Compare to existing:
+hgsql -N -e 'select * from asmEquivalent;' hgFixed | sort \
+   > existing.2022-10-18.tsv
+
+wc -l hgFixed*
+  319206 hgFixed.asmEquivalent.tsv
+  365666 hgFixed.asmEquivalent.tsv.2022-02-07
+
+
+wc -l hgFixed.asmEquivalent.tsv existing.2022-10-18.tsv
+  319206 hgFixed.asmEquivalent.tsv
+    2594 existing.2022-10-18.tsv
+
+# since we have so many new ones, will have to count identicals
+sort existing.2022-10-18.tsv hgFixed.asmEquivalent.tsv | uniq -c \
+  | sort -rn | awk '$1 > 1' | wc -l
+# 2308
+
+sort existing.2022-10-18.tsv hgFixed.asmEquivalent.tsv | uniq -c \
+  | sort -rn | awk '$1 > 1' | sed -e 's/^ \+//;' | cut -d' ' -f2- \
+    | sort > same.as.before
+
+wc -l same.as.b*
+  2290 same.as.before
+  2308 same.as.before.2022-02-07
+
+# so why are there so many missing that used to be counted ?
+
+sort existing.2022-10-18.tsv same.as.before | uniq -c | awk '$1 > 1' | wc -l
+# 2290
+
+sort existing.2022-10-18.tsv same.as.before | uniq -c | awk '$1 < 2' \
+   | sed -e 's/^ \+//;' | cut -d' ' -f2- | sort > why.gone.missing
+
+wc -l why*
+  304 why.gone.missing
+  286 why.gone.missing.2022-02-07
+
+### measure source authorities:
+cut -f3 hgFixed.asmEquivalent.tsv | sort | uniq -c | sort -n
+    708 ensembl
+    731 ucsc
+   1360 rapidRelease
+ 336537 refseq
+ 337454 genbank
+
+cut -f3 hgFixed.asmEquivalent.tsv | sort | uniq -c | sort -n
+### should be the same as destination authorities:
+    708 ensembl
+    731 ucsc
+   1360 rapidRelease
+ 336537 refseq
+ 337454 genbank
+
+cut -f3,4 hgFixed.asmEquivalent.tsv | sort | uniq -c | sort -n -u
+     24 rapidRelease    ucsc
+     81 ensembl rapidRelease
+     93 ensembl ucsc
+    208 ensembl refseq
+    263 rapidRelease    refseq
+    272 refseq  ucsc
+    326 ensembl genbank
+    342 genbank ucsc
+    992 genbank rapidRelease
+ 335794 genbank refseq
+
+### previously:
+
+cut -f3 existing.2022-10-18.tsv | sort | uniq -c | sort -n
+    571 ensembl
+    612 ucsc
+ 182165 refseq
+ 182318 genbank
+
+cut -f3,4 existing.2022-10-18.tsv | sort | uniq -c | sort -n -u
+     91 ensembl ucsc
+    187 ensembl refseq
+    237 refseq  ucsc
+    284 genbank ucsc
+    293 ensembl genbank
+ 181741 genbank refseq
+
+### How about the important ones:
+
+egrep "mm10|mm39|hg38|hg19" hgFixed.asmEquivalent.tsv
+
+GCA_000001405.14_GRCh37.p13     hg19    genbank ucsc    297     297     298
+GCA_000001405.15_GRCh38 tarIhg38        genbank ucsc    454     455     455
+GCA_000001405.28_GRCh38.p13     hg38    genbank ucsc    640     640     640
+GCA_000001635.8_GRCm38.p6       mm10    genbank ucsc    239     239     239
+GCA_000001635.9_GRCm39  mm39    genbank ucsc    61      61      61
+GCF_000001405.25_GRCh37.p13     hg19    refseq  ucsc    297     297     298
+GCF_000001405.26_GRCh38 tarIhg38        refseq  ucsc    454     455     455
+GCF_000001635.26_GRCm38.p6      mm10    refseq  ucsc    239     239     239
+GCF_000001635.27_GRCm39 mm39    refseq  ucsc    61      61      61
+Mus_musculus.GRCm39     mm39    ensembl ucsc    61      61      61
+hg19    GCA_000001405.14_GRCh37.p13     ucsc    genbank 297     298     297
+hg19    GCF_000001405.25_GRCh37.p13     ucsc    refseq  297     298     297
+hg38    GCA_000001405.28_GRCh38.p13     ucsc    genbank 640     640     640
+mm10    GCA_000001635.8_GRCm38.p6       ucsc    genbank 239     239     239
+mm10    GCF_000001635.26_GRCm38.p6      ucsc    refseq  239     239     239
+mm39    GCA_000001635.9_GRCm39  ucsc    genbank 61      61      61
+mm39    GCF_000001635.27_GRCm39 ucsc    refseq  61      61      61
+mm39    Mus_musculus.GRCm39     ucsc    ensembl 61      61      61
+tarIhg38        GCA_000001405.15_GRCh38 ucsc    genbank 454     455     455
+tarIhg38        GCF_000001405.26_GRCh38 ucsc    refseq  454     455     455
+
+### There are a lot more this time.
+wc -l hgFixed.asmEquivalent.tsv existing.2022-10-18.tsv
+#   676790 hgFixed.asmEquivalent.tsv
+#   365666 existing.2022-10-18.tsv
+
+join hgFixed.asmEquivalent.tsv existing.2022-10-18.tsv | wc -l
+         382994
+
+### probably should *not* be losing any from before:
+join -v 2 hgFixed.asmEquivalent.tsv existing.2022-10-18.tsv | wc -l
+      6
+
+join -t$'\t' -v 2 hgFixed.asmEquivalent.tsv existing.2022-10-18.tsv  | cut -f1-4
+
+GCA_000233375.4_ICSASG_v2       Salmo_salar.ICSASG_v2   genbank ensembl
+GCA_000738735.5_ASM73873v5      GCF_000738735.5_ASM73873v5      genbank refseq
+Gallus_gallus.GRCg6a    GCA_000002315.5_GRCg6a  ensembl genbank
+Gallus_gallus.GRCg6a    GCF_000002315.5_GRCg6a  ensembl refseq
+Gallus_gallus.GRCg6a    galGal6 ensembl ucsc
+Salmo_salar.ICSASG_v2   GCA_000233375.4_ICSASG_v2       ensembl genbank
+
+# Ensembl changed the name of their galGal6:
+grep galGal6 hgFixed.asmEquivalent.tsv | cut -f1-4 | grep ensembl
+
+Gallus_gallus_gca000002315v5.GRCg6a     galGal6 ensembl ucsc
+galGal6 Gallus_gallus_gca000002315v5.GRCg6a     ucsc    ensembl
+
+# and new assembly for Salmo_salar:
+GCA_905237065.2_Ssal_v3.1       Salmo_salar.Ssal_v3.1   genbank ensembl
+GCF_905237065.1_Ssal_v3.1       Salmo_salar.Ssal_v3.1   refseq  ensembl
+
+join -t$'\t' -v 2 hgFixed.asmEquivalent.tsv existing.2022-10-18.tsv \
+   | cut -f3 | sort | uniq -c
+
+      4 ensembl
+      2 genbank
+
+join -t$'\t' -v 2 hgFixed.asmEquivalent.tsv existing.2022-10-18.tsv    | cut -f4 | sort | uniq -c
+      1 ensembl
+      2 genbank
+      2 refseq
+      1 ucsc
+
+# if not 0, investigate.  Sometimes a new assembly is now an
+# exact match to something where it was a near match before to
+# a previous assembly of that organism.
+# In this case, it is the different names same idKeys that were eliminated
+#  this time that weren't taken care of before:
+
+    join -t$'\t' -v 2 hgFixed.asmEquivalent.tsv existing.2022-10-18.tsv \
+       | cut -f1-3
+GCA_000233375.4_ICSASG_v2       Salmo_salar.ICSASG_v2   genbank
+GCA_000738735.5_ASM73873v5      GCF_000738735.5_ASM73873v5      genbank
+Gallus_gallus.GRCg6a    GCA_000002315.5_GRCg6a  ensembl
+Gallus_gallus.GRCg6a    GCF_000002315.5_GRCg6a  ensembl
+Gallus_gallus.GRCg6a    galGal6 ensembl
+Salmo_salar.ICSASG_v2   GCA_000233375.4_ICSASG_v2       ensembl
+
+### there should be some new ones
+join -v 1 hgFixed.asmEquivalent.tsv existing.2022-10-18.tsv | wc -l
+         306571
+
+# I believe the large number of new ones this time is because I went to the
+# trouble of running up the nearMiss for genbank vs. refseq where I did not
+# before because there were so many.
+
+### There should be no duplicate equivalents:
+cut -f1,2 hgFixed.asmEquivalent.tsv | sort | uniq -c | sort -rn | head
+      1 zonAlb1 Zonotrichia_albicollis.Zonotrichia_albicollis-1.0.1
+      1 zonAlb1 GCF_000385455.1_Zonotrichia_albicollis-1.0.1
+      1 zonAlb1 GCA_000385455.1_Zonotrichia_albicollis-1.0.1
+      1 zaiEbo1 GCF_000848505.1_ViralProj14703
+      1 zaiEbo1 GCA_000848505.1_ViralProj14703
+... etc ...
+# none of them indicate '2' they are all unique
+
+#### To load up new table contents:
+hgLoadSqlTab hgFixed asmEquivalent ~/kent/src/hg/lib/asmEquivalent.sql \
+   hgFixed.asmEquivalent.tsv
+
+hgsql -N -e 'select * from asmEquivalent;' hgFixed \
+   | sort > updated.2022-10-18.tsv
+
+wc -l updated* existing*
+  676790 updated.2022-10-18.tsv
+  365666 existing.2022-10-18.tsv
+
+# Previously:
+
+wc -l ../../2022-01-18/notExact/updated.* ../../2022-01-18/notExact/existing.* 
+  319206 ../../2022-01-18/notExact/updated.2022-01-18.tsv
+  365666 ../../2022-01-18/notExact/updated.2022-02-07.tsv
+  2594 ../../2022-01-18/notExact/existing.2022-01-18.tsv
+
+
+wc -l ../../2021-05-11/notExact/updated.* ../../2021-05-11/notExact/existing.*
+  2594 ../../2021-05-11/notExact/updated.2021-05-11.tsv
+  2546 ../../2021-05-11/notExact/existing.2021-05-11.tsv