18bbec9e1a2ecbd3430515d8818ce2361f960535 hiram Mon Feb 7 16:15:39 2022 -0800 better matching to historical NCBI assemblies no redmine diff --git src/hg/makeDb/doc/assemblyEquivalence/update.2022-01-18.txt src/hg/makeDb/doc/assemblyEquivalence/update.2022-01-18.txt index 0529366..4de8450 100644 --- src/hg/makeDb/doc/assemblyEquivalence/update.2022-01-18.txt +++ src/hg/makeDb/doc/assemblyEquivalence/update.2022-01-18.txt @@ -217,130 +217,151 @@ 4 Drosophila_melanogaster.BDGP6.32 # Eliminate the older names: sort -u ensembl-10?.keySignatures.txt \ ensembl-99.keySignatures.txt \ | egrep -v -w "Anolis_carolinensis.AnoCar2.0|Canis_familiaris.CanFam3.1|Cyprinus_carpio_germanmirror.German_Mirror_carp_1.0|Cyprinus_carpio_hebaored.Hebao_red_carp_1.0|Drosophila_melanogaster.BDGP6.28" \ > ensembl.keySignatures.txt ########### Exact matches ################## cd /hive/data/inside/assemblyEquivalence/2022-01-18 ./exact.sh ~/kent/src/hg/makeDb/doc/assemblyEquivalence/exactTableTsv.pl \ | sort > table.2022-01-18.tsv +wc -l table.* + 317948 table.2022-01-18.tsv + 363928 table.2022-02-07.tsv ########### Near matches ################## mkdir /hive/data/inside/assemblyEquivalence/2022-01-18/notExact cd /hive/data/inside/assemblyEquivalence/2022-01-18/notExact sed -e 's/release-99/release-105/;' \ /cluster/home/hiram/kent/src/hg/makeDb/doc/assemblyEquivalence/runOne > runOne chmod +x runOne ### XXX ### this isn't going to work with the big jump in number of ### genbank/refseq assemblies time (~/kent/src/hg/makeDb/doc/assemblyEquivalence/allByAll.sh) > all.log 2>&1 -# real 17m52.128s +# real 107m11.347s sed -e 's/\tcount A .*//;' results/match.*.txt > notExact.table.2022-01-18.tsv -wc -l notExact.table.2022-01-18.tsv -# 1258 notExact.table.2022-01-18.tsv + + wc -l notExact.tab* + 1258 notExact.table.2022-01-18.tsv + 1239 notExact.table.2022-01-26.tsv + 1738 notExact.table.2022-02-07.tsv ########### Table contents to load ################## sort -u notExact.table.2022-01-18.tsv ../table.2022-01-18.tsv \ > hgFixed.asmEquivalent.tsv ### Compare to existing: hgsql -N -e 'select * from asmEquivalent;' hgFixed | sort \ > existing.2022-01-18.tsv +wc -l hgFixed* + 319206 hgFixed.asmEquivalent.tsv + 365666 hgFixed.asmEquivalent.tsv.2022-02-07 + + wc -l hgFixed.asmEquivalent.tsv existing.2022-01-18.tsv 319206 hgFixed.asmEquivalent.tsv 2594 existing.2022-01-18.tsv # since we have so many new ones, will have to count identicals sort existing.2022-01-18.tsv hgFixed.asmEquivalent.tsv | uniq -c \ | sort -rn | awk '$1 > 1' | wc -l -# 2290 +# 2308 sort existing.2022-01-18.tsv hgFixed.asmEquivalent.tsv | uniq -c \ | sort -rn | awk '$1 > 1' | sed -e 's/^ \+//;' | cut -d' ' -f2- \ | sort > same.as.before -wc -l same.as.before -# 2290 +wc -l same.as.b* + 2290 same.as.before + 2308 same.as.before.2022-02-07 # so why are there so many missing that used to be counted ? sort existing.2022-01-18.tsv same.as.before | uniq -c | awk '$1 > 1' | wc -l # 2290 sort existing.2022-01-18.tsv same.as.before | uniq -c | awk '$1 < 2' \ | sed -e 's/^ \+//;' | cut -d' ' -f2- | sort > why.gone.missing -wc -l why.gone.missing -# 304 why.gone.missing - - +wc -l why* + 304 why.gone.missing + 286 why.gone.missing.2022-02-07 ### How about the important ones: egrep "mm10|mm39|hg38|hg19" hgFixed.asmEquivalent.tsv GCA_000001405.28_GRCh38.p13 hg38 genbank ucsc 640 640 640 GCA_000001635.8_GRCm38.p6 mm10 genbank ucsc 239 239 239 GCA_000001635.9_GRCm39 mm39 genbank ucsc 61 61 61 GCF_000001405.39_GRCh38.p13 hg38 refseq ucsc 639 639 640 GCF_000001635.26_GRCm38.p6 mm10 refseq ucsc 239 239 239 GCF_000001635.27_GRCm39 mm39 refseq ucsc 61 61 61 Mus_musculus.GRCm39 mm39 ensembl ucsc 61 61 61 hg38 GCA_000001405.28_GRCh38.p13 ucsc genbank 640 640 640 hg38 GCF_000001405.39_GRCh38.p13 ucsc refseq 639 640 639 mm10 GCA_000001635.8_GRCm38.p6 ucsc genbank 239 239 239 mm10 GCF_000001635.26_GRCm38.p6 ucsc refseq 239 239 239 mm39 GCA_000001635.9_GRCm39 ucsc genbank 61 61 61 mm39 GCF_000001635.27_GRCm39 ucsc refseq 61 61 61 mm39 Mus_musculus.GRCm39 ucsc ensembl 61 61 61 ### most should be identical to before join hgFixed.asmEquivalent.tsv existing.2022-01-18.tsv | wc -l - 5057 + 5168 ### probably should *not* be losing any from before: join -v 2 hgFixed.asmEquivalent.tsv existing.2022-01-18.tsv | wc -l - 134 + 118 join -v 2 hgFixed.asmEquivalent.tsv existing.2022-01-18.tsv \ | cut -d' ' -f3 | sort | uniq -c - 6 ensembl - 76 genbank - 41 refseq - 11 ucsc + 4 ensembl + 74 genbank + 35 refseq + 5 ucsc + +join -v 2 hgFixed.asmEquivalent.tsv existing.2022-01-18.tsv | awk '$3 == "ucsc"' +croPor0 GCA_000768395.1_Cpor_2.0 ucsc genbank 21892 23365 23365 +melGal1 GCA_000146605.2_Turkey_2.01 ucsc genbank 5890 5891 5891 +neoSch1 GCA_002201575.1_ASM220157v1 ucsc genbank 7871 7872 7872 +neoSch1 GCF_002201575.1_ASM220157v1 ucsc refseq 7872 7872 7873 +vicPac2 GCA_000164845.3_Vicugna_pacos-2.0.2 ucsc genbank 276609 276611 276724 join -v 2 hgFixed.asmEquivalent.tsv existing.2022-01-18.tsv | cut -d' ' -f4 | sort | uniq -c - 30 ensembl - 35 genbank - 49 refseq - 20 ucsc + 29 ensembl + 32 genbank + 41 refseq + 16 ucsc + +The first time through this revealed that we need to include the 'historical' +assemblies from NCBI to match the older UCSC assemblies. + +Not sure what all these new ones are this time. -Examined a couple (canFam3, xenTro9) appeared to have moved to a more exact -match from genbank to refseq. # if not 0, investigate. Sometimes a new assembly is now an # exact match to something where it was a near match before to # a previous assembly of that organism. # In this case, it is the different names same idKeys that were eliminated # this time that weren't taken care of before: join -t$'\t' -v 2 hgFixed.asmEquivalent.tsv existing.2022-01-18.tsv \ | cut -f1-3 Anolis_carolinensis.AnoCar2.0 GCA_000090745.2_AnoCar2.0 ensembl Anolis_carolinensis.AnoCar2.0 GCF_000090745.1_AnoCar2.0 ensembl Anolis_carolinensis.AnoCar2.0 anoCar2 ensembl Canis_familiaris.CanFam3.1 GCF_000002285.3_CanFam3.1 ensembl Canis_familiaris.CanFam3.1 canFam3 ensembl Drosophila_melanogaster.BDGP6.28 GCF_000001215.4_Release_6_plus_ISO1_MT ensembl Drosophila_melanogaster.BDGP6.28 dm6 ensembl @@ -368,24 +389,28 @@ 1 zonAlb1 Zonotrichia_albicollis.Zonotrichia_albicollis-1.0.1 1 zonAlb1 GCF_000385455.1_Zonotrichia_albicollis-1.0.1 1 zonAlb1 GCA_000385455.1_Zonotrichia_albicollis-1.0.1 1 xipMac1 GCA_000241075.1_Xiphophorus_maculatus-4.4.2 1 xenTro9 Xenopus_tropicalis.Xenopus_tropicalis_v9.1 ... etc ... #### To load up new table contents: hgLoadSqlTab hgFixed asmEquivalent ~/kent/src/hg/lib/asmEquivalent.sql \ hgFixed.asmEquivalent.tsv hgsql -N -e 'select * from asmEquivalent;' hgFixed \ | sort > updated.2022-01-18.tsv wc -l updated.2022-01-18.tsv existing.2022-01-18.tsv +wc -l updated* existing* 319206 updated.2022-01-18.tsv + 365666 updated.2022-02-07.tsv 2594 existing.2022-01-18.tsv + 302 existing.ensembl.list + 252 existing.ucsc.list # Previously: 2594 updated.2022-01-18.tsv 2546 existing.2022-01-18.tsv 2546 updated.2021-03-10.tsv 2344 existing.2021-03-10.tsv