4a372a204622bf9a38b0cadf2a22a9fb0d09b570
hiram
  Wed Jun 2 13:51:23 2021 -0700
for Ensembl v104 update refs #27616

diff --git src/hg/makeDb/doc/assemblyEquivalence/update.2021-05-11.txt src/hg/makeDb/doc/assemblyEquivalence/update.2021-05-11.txt
new file mode 100644
index 0000000..656d577
--- /dev/null
+++ src/hg/makeDb/doc/assemblyEquivalence/update.2021-05-11.txt
@@ -0,0 +1,309 @@
+
+idKeys for Ensembl assemblies were
+calculated in:  /hive/data/outside/ensembl/genomes/release-104/
+
+New assemblies have been built for genbank, refseq and ucsc
+
+Working in:
+mkdir /hive/data/inside/assemblyEquivalence/2021-05-11
+cd /hive/data/inside/assemblyEquivalence/2021-05-11
+
+mkdir ensembl refseq genbank ucsc
+
+###########  refseq keys  ##################
+cd refseq
+
+find /hive/data/genomes/asmHubs/refseqBuild/GCF \
+    -maxdepth 4 -mindepth 4 -type d | while read buildDir
+do
+  asmId=`basename "${buildDir}"`
+  keySig="${buildDir}/idKeys/${asmId}.keySignature.txt"
+  idKeysDir=`dirname "${keySig}"`
+  if [ -s "${keySig}" ]; then
+     idKeys="$idKeysDir/$asmId.idKeys.txt"
+     fullCount=`cat $idKeys | wc -l`
+     uniqueCount=`cut -f1 $idKeys | sort -u | wc -l`
+     keySig=`cat "${keySig}"`
+  printf "%s\t%s\t%d\t%d\n" "${keySig}" "${asmId}" "${fullCount}" "${uniqueCount}"
+  fi
+done | sort -k1,1 > refseq.keySignatures.txt
+
+###########  genbank keys  ##################
+cd ../genbank
+
+find /hive/data/genomes/asmHubs/genbankBuild/GCA \
+    -maxdepth 4 -mindepth 4 -type d | while read buildDir
+do
+  asmId=`basename "${buildDir}"`
+  keySig="${buildDir}/idKeys/${asmId}.keySignature.txt"
+  idKeysDir=`dirname "${keySig}"`
+  if [ -s "${keySig}" ]; then
+     idKeys="$idKeysDir/$asmId.idKeys.txt"
+     fullCount=`cat $idKeys | wc -l`
+     uniqueCount=`cut -f1 $idKeys | sort -u | wc -l`
+     keySig=`cat "${keySig}"`
+ printf "%s\t%s\t%d\t%d\n" "${keySig}" "${asmId}" "${fullCount}" "${uniqueCount}"
+  fi
+done | sort -k1,1 > genbank.keySignatures.txt
+
+###########  UCSC keys  ##################
+cd ../ucsc
+
+ls -d /hive/data/genomes/* | while read dbDir
+do
+  db=`basename "${dbDir}"`
+  keySig="${dbDir}/bed/idKeys/${db}.keySignature.txt"
+  idKeysDir=`dirname "${keySig}"`
+  if [ -s "${keySig}" ]; then
+     idKeys="$idKeysDir/$db.idKeys.txt"
+     fullCount=`cat $idKeys | wc -l`
+     uniqueCount=`cut -f1 $idKeys | sort -u | wc -l`
+     keySig=`cat "${keySig}"`
+   printf "%s\t%s\t%d\t%d\n" "${keySig}" "${db}" "${fullCount}" "${uniqueCount}"
+  fi
+done | sort -k1,1 > ucsc.keySignatures.txt
+
+cd /hive/data/inside/assemblyEquivalence/2021-05-11
+ln -s /cluster/home/hiram/kent/src/hg/makeDb/doc/assemblyEquivalence/exact.sh .
+
+###########  Ensembl keys  ##################
+cd ../ensembl
+
+# does not always work, use the wget instead
+# rsync -av --stats \
+# rsync://ftp.ensembl.org/ensembl/pub/release-104/species_EnsemblVertebrates.txt \
+#    ./
+
+wget --timestamping  \
+ftp://ftp.ensembl.org/pub/release-104/species_EnsemblVertebrates.txt
+
+# extract the GCx assembly names:
+
+awk -F$'\t' '{printf "%s_%s\n", $6,$5}' species_EnsemblVertebrates.txt \
+   | grep "^G" | sort > ensembl.v104.asmId.txt
+
+# compare this to the ones we have on hand here
+
+cut -f2 ../genbank/genbank.keySignatures.txt  | sort > genbank.asmId.here.txt
+
+wc -l ensembl.v104.asmId.txt genbank.asmId.here.txt | sed -e 's/^/# /;'
+#   295 ensembl.v104.asmId.txt
+#  1448 genbank.asmId.here.txt
+
+comm -12 ensembl.v104.asmId.txt genbank.asmId.here.txt | wc -l
+#	288
+
+# will need to catch up the assembly hubs for these missing 7:
+comm -23 ensembl.v104.asmId.txt genbank.asmId.here.txt
+# GCA_000001215.4_BDGP6.32
+# GCA_000004035.1_Meug_1.0
+# GCA_000090745.2_AnoCar2.0v2
+# GCA_000224145.1_KH
+# GCA_000409795.2_ChlSab1.1
+# GCA_013347765.1_ASM1334776v1
+# GCA_900186095.1_CHOK1GS_HDv1
+# checking the listings, it isn't clear all these are needed, appears to
+# be outdated versions at Ensembl, and perhaps only one new one for
+# GCA_013347765.1_ASM1334776v1
+# it exists as a GCF assembly
+
+find -L /hive/data/outside/ensembl/genomes/release-104/idKeys -type f \
+  | grep keySignature.txt | while read K
+do
+  idKeysDir=`dirname "${K}"`
+  id=`basename "${K}" | sed -e 's/.keySignature.txt//;'`
+  idKeys="$idKeysDir/$id.idKeys.txt"
+  fullCount=`cat $idKeys | wc -l`
+  uniqueCount=`cut -f1 $idKeys | sort -u | wc -l`
+  keySig=`cat "${K}"`
+  printf "%s\t%s\t%d\t%d\n" "${keySig}" "${id}" "${fullCount}" "${uniqueCount}"
+done | sort -k1,1 > ensembl-104.keySignatures.txt
+
+find -L /hive/data/outside/ensembl/genomes/release-103/idKeys -type f \
+  | grep keySignature.txt | while read K
+do
+  idKeysDir=`dirname "${K}"`
+  id=`basename "${K}" | sed -e 's/.keySignature.txt//;'`
+  idKeys="$idKeysDir/$id.idKeys.txt"
+  fullCount=`cat $idKeys | wc -l`
+  uniqueCount=`cut -f1 $idKeys | sort -u | wc -l`
+  keySig=`cat "${K}"`
+  printf "%s\t%s\t%d\t%d\n" "${keySig}" "${id}" "${fullCount}" "${uniqueCount}"
+done | sort -k1,1 > ensembl-103.keySignatures.txt
+
+find -L /hive/data/outside/ensembl/genomes/release-101/idKeys -type f \
+  | grep keySignature.txt | while read K
+do
+  idKeysDir=`dirname "${K}"`
+  id=`basename "${K}" | sed -e 's/.keySignature.txt//;'`
+  idKeys="$idKeysDir/$id.idKeys.txt"
+  fullCount=`cat $idKeys | wc -l`
+  uniqueCount=`cut -f1 $idKeys | sort -u | wc -l`
+  keySig=`cat "${K}"`
+  printf "%s\t%s\t%d\t%d\n" "${keySig}" "${id}" "${fullCount}" "${uniqueCount}"
+done | sort -k1,1 > ensembl-101.keySignatures.txt
+
+find /hive/data/outside/ensembl/genomes/release-100/idKeys -type f \
+  | grep keySignature.txt | while read K
+do
+  idKeysDir=`dirname "${K}"`
+  id=`basename "${K}" | sed -e 's/.keySignature.txt//;'`
+  idKeys="$idKeysDir/$id.idKeys.txt"
+  fullCount=`cat $idKeys | wc -l`
+  uniqueCount=`cut -f1 $idKeys | sort -u | wc -l`
+  keySig=`cat "${K}"`
+  printf "%s\t%s\t%d\t%d\n" "${keySig}" "${id}" "${fullCount}" "${uniqueCount}"
+done | sort -k1,1 > ensembl-100.keySignatures.txt
+
+find /hive/data/outside/ensembl/genomes/release-99/idKeys -type f \
+  | grep keySignature.txt | while read K
+do
+  idKeysDir=`dirname "${K}"`
+  id=`basename "${K}" | sed -e 's/.keySignature.txt//;'`
+  idKeys="$idKeysDir/$id.idKeys.txt"
+  fullCount=`cat $idKeys | wc -l`
+  uniqueCount=`cut -f1 $idKeys | sort -u | wc -l`
+  keySig=`cat "${K}"`
+  printf "%s\t%s\t%d\t%d\n" "${keySig}" "${id}" "${fullCount}" "${uniqueCount}"
+done | sort -k1,1 > ensembl-99.keySignatures.txt
+
+# something is odd with three of these.  Their names changed between
+# versions but the sum stayed the same.  Can't have this.
+# Find them via
+cut -f1 ensembl.keySignatures.txt | sort | uniq -c | sort -rn | head
+      2 2e374245cf85b32eddf186cf50f28f82
+      2 298de0770996a131c6f90a49ebebebb8
+      2 0eec31acf1f1120af29d62874cdd9952
+      1 ffb6ffd6b48667dd28ac3a2364b6c10e
+egrep -h "2e374245cf85b32eddf186cf50f28f82|298de0770996a131c6f90a49ebebebb8|0eec31acf1f1120af29d62874cdd9952" *.txt | cut -f2 | sort | uniq -c
+      5 Anolis_carolinensis.AnoCar2.0
+      2 Anolis_carolinensis.AnoCar2.0v2
+      2 Canis_familiaris.CanFam3.1
+      5 Canis_lupus_familiaris.CanFam3.1
+      4 Drosophila_melanogaster.BDGP6.28
+      3 Drosophila_melanogaster.BDGP6.32
+
+    # Eliminate the old ones:
+sort -u ensembl-10?.keySignatures.txt \
+  ensembl-99.keySignatures.txt \
+    | egrep -v -w "Anolis_carolinensis.AnoCar2.0|Canis_familiaris.CanFam3.1|Drosophila_melanogaster.BDGP6.28" \
+      > ensembl.keySignatures.txt
+
+###########  Exact matches  ##################
+
+cd /hive/data/inside/assemblyEquivalence/2021-05-11
+./exact.sh
+
+~/kent/src/hg/makeDb/doc/assemblyEquivalence/exactTableTsv.pl \
+  | sort > table.2021-05-11.tsv
+
+###########  Near matches  ##################
+
+mkdir /hive/data/inside/assemblyEquivalence/2021-05-11/notExact
+cd /hive/data/inside/assemblyEquivalence/2021-05-11/notExact
+
+sed -e 's/release-99/release-104/;' \
+  /cluster/home/hiram/kent/src/hg/makeDb/doc/assemblyEquivalence/runOne > runOne
+chmod +x runOne
+
+time (~/kent/src/hg/makeDb/doc/assemblyEquivalence/allByAll.sh) > all.log 2>&1
+# real    17m52.128s
+
+sed -e 's/\tcount A .*//;' results/match.*.txt > notExact.table.2021-05-11.tsv
+wc -l notExact.table.2021-05-11.tsv
+# 1200 notExact.table.2021-05-11.tsv
+
+###########  Table contents to load  ##################
+
+sort -u notExact.table.2021-05-11.tsv ../table.2021-05-11.tsv \
+    > hgFixed.asmEquivalent.tsv
+
+### Compare to existing:
+hgsql -N -e 'select * from asmEquivalent;' hgFixed | sort \
+   > existing.2021-05-11.tsv
+
+wc -l hgFixed.asmEquivalent.tsv existing.2021-05-11.tsv
+  2594 hgFixed.asmEquivalent.tsv
+  2546 existing.2021-05-11.tsv
+
+### How about the important ones:
+
+egrep "mm10|mm39|hg38|hg19" hgFixed.asmEquivalent.tsv
+
+GCA_000001405.27_GRCh38.p12     hg38    genbank ucsc    595     595     595
+GCA_000001635.8_GRCm38.p6       mm10    genbank ucsc    239     239     239
+GCA_000001635.9_GRCm39  mm39    genbank ucsc    61      61      61
+GCF_000001635.26_GRCm38.p6      mm10    refseq  ucsc    239     239     239
+GCF_000001635.27_GRCm39 mm39    refseq  ucsc    61      61      61
+Mus_musculus.GRCm39     mm39    ensembl ucsc    61      61      61
+hg38    GCA_000001405.27_GRCh38.p12     ucsc    genbank 595     595     595
+mm10    GCA_000001635.8_GRCm38.p6       ucsc    genbank 239     239     239
+mm10    GCF_000001635.26_GRCm38.p6      ucsc    refseq  239     239     239
+mm39    GCA_000001635.9_GRCm39  ucsc    genbank 61      61      61
+mm39    GCF_000001635.27_GRCm39 ucsc    refseq  61      61      61
+mm39    Mus_musculus.GRCm39     ucsc    ensembl 61      61      61
+
+### most should be identical to before
+join hgFixed.asmEquivalent.tsv existing.2021-05-11.tsv | wc -l
+         5473
+
+### probably should *not* be losing any from before:
+join -v 2 hgFixed.asmEquivalent.tsv existing.2021-05-11.tsv | wc -l
+      7
+# if not 0, investigate.  Sometimes a new assembly is now an
+# exact match to something where it was a near match before to
+# a previous assembly of that organism.
+# In this case, it is the different names same idKeys that were eliminated
+#  this time that weren't taken care of before:
+
+    join -t$'\t' -v 2 hgFixed.asmEquivalent.tsv existing.2021-05-11.tsv \
+       | cut -f1-3
+Anolis_carolinensis.AnoCar2.0   GCA_000090745.2_AnoCar2.0       ensembl
+Anolis_carolinensis.AnoCar2.0   GCF_000090745.1_AnoCar2.0       ensembl
+Anolis_carolinensis.AnoCar2.0   anoCar2 ensembl
+Canis_familiaris.CanFam3.1      GCF_000002285.3_CanFam3.1       ensembl
+Canis_familiaris.CanFam3.1      canFam3 ensembl
+Drosophila_melanogaster.BDGP6.28        GCF_000001215.4_Release_6_plus_ISO1_MT ensembl
+Drosophila_melanogaster.BDGP6.28        dm6     ensembl
+
+### note previous comments:
+### ### Showing 5 this time:
+### ### This is because the nearMiss calculation is only taking place on the
+### ### most recent Ensembl release, not all the past ones.  These two assemblies
+### ### have new version in ens104 release, thus these older Ensembl assemblies
+### ### are not being checked for near miss.  For now, going to leave these
+### ### missing from the table.  Can't maintain all older Ensembl equivalents.
+
+### Esox_lucius.Eluc_V3 GCA_000721915.3_Eluc_V3 ensembl genbank 1018 1019 1018
+### GCA_000721915.3_Eluc_V3 Esox_lucius.Eluc_V3 genbank ensembl 1018 1018 1019
+### Sarcophilus_harrisii.DEVIL7.0 GCA_000189315.1_Devil_ref_v7.0 ensembl genbank 35974 35975 35974
+### Sarcophilus_harrisii.DEVIL7.0 GCF_000189315.1_Devil_ref_v7.0 ensembl refseq 35974 35975 35974
+### Sarcophilus_harrisii.DEVIL7.0 sarHar1 ensembl ucsc 35974 35975 35974
+
+### there should be some new ones
+join -v 1 hgFixed.asmEquivalent.tsv existing.2021-05-11.tsv | wc -l
+         42
+
+### There should be no duplicate equivalents:
+cut -f1,2 hgFixed.asmEquivalent.tsv | sort | uniq -c | sort -rn | head
+      1 zonAlb1 Zonotrichia_albicollis.Zonotrichia_albicollis-1.0.1
+      1 zonAlb1 GCF_000385455.1_Zonotrichia_albicollis-1.0.1
+      1 zonAlb1 GCA_000385455.1_Zonotrichia_albicollis-1.0.1
+      1 xipMac1 GCA_000241075.1_Xiphophorus_maculatus-4.4.2
+      1 xenTro9 Xenopus_tropicalis.Xenopus_tropicalis_v9.1
+... etc ...
+
+#### To load up new table contents:
+hgLoadSqlTab hgFixed asmEquivalent ~/kent/src/hg/lib/asmEquivalent.sql \
+   hgFixed.asmEquivalent.tsv
+
+hgsql -N -e 'select * from asmEquivalent;' hgFixed \
+   | sort > updated.2021-05-11.tsv
+
+wc -l updated.2021-05-11.tsv existing.2021-05-11.tsv
+  2594 updated.2021-05-11.tsv
+  2546 existing.2021-05-11.tsv
+
+# Previously:
+  2546 updated.2021-03-10.tsv
+  2344 existing.2021-03-10.tsv