0d25abead58ce0220a18d62dee344be325f280f9 hiram Fri Mar 12 12:22:57 2021 -0800 updates to Ensembl v103 release and new assemblies for genbank, refseq, and ucsc refs #27194 diff --git src/hg/makeDb/doc/assemblyEquivalence/update.2021-03-10.txt src/hg/makeDb/doc/assemblyEquivalence/update.2021-03-10.txt new file mode 100644 index 0000000..32cf673 --- /dev/null +++ src/hg/makeDb/doc/assemblyEquivalence/update.2021-03-10.txt @@ -0,0 +1,246 @@ + +idKeys for Ensembl assemblies were +calculated in: /hive/data/outside/ensembl/genomes/release-103/ + +New assemblies have been built for genbank, refseq and ucsc + +Working in: +mkdir /hive/data/inside/assemblyEquivalence/2021-03-10 +cd /hive/data/inside/assemblyEquivalence/2021-03-10 + +mkdir ensembl refseq genbank ucsc + +########### refseq keys ################## +cd ../refseq + +find /hive/data/genomes/asmHubs/refseqBuild/GCF \ + -maxdepth 4 -mindepth 4 -type d | while read buildDir +do + asmId=`basename "${buildDir}"` + keySig="${buildDir}/idKeys/${asmId}.keySignature.txt" + idKeysDir=`dirname "${keySig}"` + if [ -s "${keySig}" ]; then + idKeys="$idKeysDir/$asmId.idKeys.txt" + fullCount=`cat $idKeys | wc -l` + uniqueCount=`cut -f1 $idKeys | sort -u | wc -l` + keySig=`cat "${keySig}"` + printf "%s\t%s\t%d\t%d\n" "${keySig}" "${asmId}" "${fullCount}" "${uniqueCount}" + fi +done | sort -k1,1 > refseq.keySignatures.txt + +########### genbank keys ################## +cd ../genbank + +find /hive/data/genomes/asmHubs/genbankBuild/GCA \ + -maxdepth 4 -mindepth 4 -type d | while read buildDir +do + asmId=`basename "${buildDir}"` + keySig="${buildDir}/idKeys/${asmId}.keySignature.txt" + idKeysDir=`dirname "${keySig}"` + if [ -s "${keySig}" ]; then + idKeys="$idKeysDir/$asmId.idKeys.txt" + fullCount=`cat $idKeys | wc -l` + uniqueCount=`cut -f1 $idKeys | sort -u | wc -l` + keySig=`cat "${keySig}"` + printf "%s\t%s\t%d\t%d\n" "${keySig}" "${asmId}" "${fullCount}" "${uniqueCount}" + fi +done | sort -k1,1 > genbank.keySignatures.txt + +########### UCSC keys ################## +cd ../ucsc + +ls -d /hive/data/genomes/* | while read dbDir +do + db=`basename "${dbDir}"` + keySig="${dbDir}/bed/idKeys/${db}.keySignature.txt" + idKeysDir=`dirname "${keySig}"` + if [ -s "${keySig}" ]; then + idKeys="$idKeysDir/$db.idKeys.txt" + fullCount=`cat $idKeys | wc -l` + uniqueCount=`cut -f1 $idKeys | sort -u | wc -l` + keySig=`cat "${keySig}"` + printf "%s\t%s\t%d\t%d\n" "${keySig}" "${db}" "${fullCount}" "${uniqueCount}" + fi +done | sort -k1,1 > ucsc.keySignatures.txt + +cd /hive/data/inside/assemblyEquivalence/2021-03-10 +ln -s /cluster/home/hiram/kent/src/hg/makeDb/doc/assemblyEquivalence/exact.sh . + +########### Ensembl keys ################## +cd ensembl + +# does not always work, use the wget instead +# rsync -av --stats \ +# rsync://ftp.ensembl.org/ensembl/pub/release-103/species_EnsemblVertebrates.txt \ +# ./ + +wget --timestamping \ +ftp://ftp.ensembl.org/pub/release-103/species_EnsemblVertebrates.txt + +# extract the GCx assembly names: + +awk -F$'\t' '{printf "%s_%s\n", $6,$5}' species_EnsemblVertebrates.txt \ + | grep "^G" | sort > ensembl.v103.asmId.txt + +# compare this to the ones we have on hand here + +cut -f2 ../genbank/genbank.keySignatures.txt | sort > genbank.asmId.here.txt + +wc -l ensembl.v103.asmId.txt genbank.asmId.here.txt | sed -e 's/^/# /;' +# 295 ensembl.v103.asmId.txt +# 1428 genbank.asmId.here.txt + +comm -12 ensembl.v103.asmId.txt genbank.asmId.here.txt | wc -l +# 270 + +# will need to catch up the missing 25: +comm -23 ensembl.v103.asmId.txt genbank.asmId.here.txt + +find -L /hive/data/outside/ensembl/genomes/release-103/idKeys -type f \ + | grep keySignature.txt | while read K +do + idKeysDir=`dirname "${K}"` + id=`basename "${K}" | sed -e 's/.keySignature.txt//;'` + idKeys="$idKeysDir/$id.idKeys.txt" + fullCount=`cat $idKeys | wc -l` + uniqueCount=`cut -f1 $idKeys | sort -u | wc -l` + keySig=`cat "${K}"` + printf "%s\t%s\t%d\t%d\n" "${keySig}" "${id}" "${fullCount}" "${uniqueCount}" +done | sort -k1,1 > ensembl-103.keySignatures.txt + +find /hive/data/outside/ensembl/genomes/release-101/idKeys -type f \ + | grep keySignature.txt | while read K +do + idKeysDir=`dirname "${K}"` + id=`basename "${K}" | sed -e 's/.keySignature.txt//;'` + idKeys="$idKeysDir/$id.idKeys.txt" + fullCount=`cat $idKeys | wc -l` + uniqueCount=`cut -f1 $idKeys | sort -u | wc -l` + keySig=`cat "${K}"` + printf "%s\t%s\t%d\t%d\n" "${keySig}" "${id}" "${fullCount}" "${uniqueCount}" +done | sort -k1,1 > ensembl-101.keySignatures.txt + +find /hive/data/outside/ensembl/genomes/release-100/idKeys -type f \ + | grep keySignature.txt | while read K +do + idKeysDir=`dirname "${K}"` + id=`basename "${K}" | sed -e 's/.keySignature.txt//;'` + idKeys="$idKeysDir/$id.idKeys.txt" + fullCount=`cat $idKeys | wc -l` + uniqueCount=`cut -f1 $idKeys | sort -u | wc -l` + keySig=`cat "${K}"` + printf "%s\t%s\t%d\t%d\n" "${keySig}" "${id}" "${fullCount}" "${uniqueCount}" +done | sort -k1,1 > ensembl-100.keySignatures.txt + +find /hive/data/outside/ensembl/genomes/release-99/idKeys -type f \ + | grep keySignature.txt | while read K +do + idKeysDir=`dirname "${K}"` + id=`basename "${K}" | sed -e 's/.keySignature.txt//;'` + idKeys="$idKeysDir/$id.idKeys.txt" + fullCount=`cat $idKeys | wc -l` + uniqueCount=`cut -f1 $idKeys | sort -u | wc -l` + keySig=`cat "${K}"` + printf "%s\t%s\t%d\t%d\n" "${keySig}" "${id}" "${fullCount}" "${uniqueCount}" +done | sort -k1,1 > ensembl-99.keySignatures.txt + +sort -u ensembl-103.keySignatures.txt ensembl-101.keySignatures.txt \ + ensembl-100.keySignatures.txt \ + ensembl-99.keySignatures.txt > ensembl.keySignatures.txt + +########### Exact matches ################## + +./exact.sh + +~/kent/src/hg/makeDb/doc/assemblyEquivalence/exactTableTsv.pl \ + | sort > table.2021-03-10.tsv + +########### Near matches ################## + +mkdir /hive/data/inside/assemblyEquivalence/2021-03-10/notExact +cd /hive/data/inside/assemblyEquivalence/2021-03-10/notExact + +sed -e 's/release-99/release-103/;' \ + /cluster/home/hiram/kent/src/hg/makeDb/doc/assemblyEquivalence/runOne > runOne +chmod +x runOne + +time (~/kent/src/hg/makeDb/doc/assemblyEquivalence/allByAll.sh) > all.log 2>&1 +# real 22m56.438s + +sed -e 's/\tcount A .*//;' results/match.*.txt > notExact.table.2021-03-10.tsv + +########### Table contents to load ################## + +sort -u notExact.table.2021-03-10.tsv ../table.2021-03-10.tsv \ + > hgFixed.asmEquivalent.tsv + +### Compare to existing: +hgsql -N -e 'select * from asmEquivalent;' hgFixed | sort \ + > existing.2021-03-10.tsv + +wc -l hgFixed.asmEquivalent.tsv existing.2021-03-10.tsv + 2546 hgFixed.asmEquivalent.tsv + 2344 existing.2021-03-10.tsv + +### How about the important ones: + +egrep "mm10|mm39|hg38|hg19" hgFixed.asmEquivalent.tsv +GCA_000001405.27_GRCh38.p12 hg38 genbank ucsc 595 595 595 +GCF_000001635.27_GRCm39 mm39 refseq ucsc 61 61 61 +Mus_musculus.GRCm39 mm39 ensembl ucsc 61 61 61 +hg38 GCA_000001405.27_GRCh38.p12 ucsc genbank 595 595 595 +mm39 GCF_000001635.27_GRCm39 ucsc refseq 61 61 61 +mm39 Mus_musculus.GRCm39 ucsc ensembl 61 61 61 + +### most should be identical to before +join hgFixed.asmEquivalent.tsv existing.2021-03-10.tsv | wc -l + 5144 + +### probably should *not* be losing any from before: +join -v 2 hgFixed.asmEquivalent.tsv existing.2021-03-10.tsv | wc -l + 0 +# if not 0, investigate. Sometimes a new assembly is now an +# exact match to something where it was a near match before to +# a previous assembly of that organism. + +### Showing 5 this time: +### This is because the nearMiss calculation is only taking place on the +### most recent Ensembl release, not all the past ones. These two assemblies +### have new version in ens103 release, thus these older Ensembl assemblies +### are not being checked for near miss. For now, going to leave these +### missing from the table. Can't maintain all older Ensembl equivalents. + +Esox_lucius.Eluc_V3 GCA_000721915.3_Eluc_V3 ensembl genbank 1018 1019 1018 +GCA_000721915.3_Eluc_V3 Esox_lucius.Eluc_V3 genbank ensembl 1018 1018 1019 +Sarcophilus_harrisii.DEVIL7.0 GCA_000189315.1_Devil_ref_v7.0 ensembl genbank 35974 35975 35974 +Sarcophilus_harrisii.DEVIL7.0 GCF_000189315.1_Devil_ref_v7.0 ensembl refseq 35974 35975 35974 +Sarcophilus_harrisii.DEVIL7.0 sarHar1 ensembl ucsc 35974 35975 35974 + + +### there should be some new ones +join -v 1 hgFixed.asmEquivalent.tsv existing.2021-03-10.tsv | wc -l + 165 + +### There should be no duplicate equivalents: +cut -f1,2 hgFixed.asmEquivalent.tsv | sort | uniq -c | sort -rn | head + 1 zonAlb1 Zonotrichia_albicollis.Zonotrichia_albicollis-1.0.1 + 1 zonAlb1 GCF_000385455.1_Zonotrichia_albicollis-1.0.1 + 1 zonAlb1 GCA_000385455.1_Zonotrichia_albicollis-1.0.1 + 1 xipMac1 GCA_000241075.1_Xiphophorus_maculatus-4.4.2 + 1 xenTro9 Xenopus_tropicalis.Xenopus_tropicalis_v9.1 +... etc ... + +#### To load up new table contents: +hgLoadSqlTab hgFixed asmEquivalent ~/kent/src/hg/lib/asmEquivalent.sql \ + hgFixed.asmEquivalent.tsv + +hgsql -N -e 'select * from asmEquivalent;' hgFixed \ + | sort > updated.2021-03-10.tsv + +wc -l updated.2021-03-10.tsv existing.2021-03-10.tsv + 2546 updated.2021-03-10.tsv + 2344 existing.2021-03-10.tsv + +# Previously: + 2344 updated.2021-03-10.tsv + 2320 existing.2021-03-10.tsv