199b2d4113d9498515030061888af66cbf7e8246 hiram Thu Mar 26 17:40:17 2020 -0700 adding some of the scripts that do the comparisons refs #24547 diff --git src/hg/makeDb/doc/assemblyEquivalence/README.txt src/hg/makeDb/doc/assemblyEquivalence/README.txt index b3bed6b..457df39 100644 --- src/hg/makeDb/doc/assemblyEquivalence/README.txt +++ src/hg/makeDb/doc/assemblyEquivalence/README.txt @@ -63,85 +63,269 @@ The keySignature.txt is a single md5sum of all the individual sequence md5sums in the idKeys.txt file. For example: cat Vicugna_pacos.vicPac1.keySignature.txt 84f72c82109632aa712d85fe9d94d237 head Vicugna_pacos.vicPac1.idKeys.txt 000052eec7a560c419098fe75ce59ccd scaffold_62780 00008c4565756b4429fabffa2e374b2c scaffold_15650 00012aaa1678d5651863671f53c66e3e scaffold_138339 00014ee45a4daceca84ba8d39c05fa93 scaffold_24830 00016bdfe2b6c9cd9d3145bc59057443 scaffold_195552 ... etc ... ##### Gather the Ensembl keySignatures #################################### -mkdir /hive/data/inside/assemblyEquivalence/Ensembl -cd /hive/data/inside/assemblyEquivalence/Ensembl +mkdir /hive/data/inside/assemblyEquivalence/ensembl +cd /hive/data/inside/assemblyEquivalence/ensembl # There is a single file with all the name correspondences that will be useful: time rsync -av --stats rsync://ftp.ensembl.org/ensembl/pub/release-99/species_EnsemblVertebrates.txt ./ real 0m1.644s find /hive/data/outside/ensembl/genomes/release-99/idKeys -type f \ | grep keySignature.txt | while read K do + idKeysDir=`dirname "${K}"` id=`basename "${K}" | sed -e 's/.keySignature.txt//;'` + idKeys="$idKeysDir/$id.idKeys.txt" + fullCount=`cat $idKeys | wc -l` + uniqueCount=`cut -f1 $idKeys | sort -u | wc -l` keySig=`cat "${K}"` - printf "%s\t%s\n" "${keySig}" "${id}" -done | sort -k1,1 > ens99.keySignatures.txt + printf "%s\t%s\t%d\t%d\n" "${keySig}" "${id}" "${fullCount}" "${uniqueCount}" +done | sort -k1,1 > ensembl.keySignatures.txt ##### Gather the UCSC database genomes keySignatures ###################### mkdir /hive/data/inside/assemblyEquivalence/ucscDb cd /hive/data/inside/assemblyEquivalence/ucscDb ### Avoiding an expensive 'find' operation on the hive directories, just ### check for specific named files: ls -d /hive/data/genomes/* | while read dbDir do db=`basename "${dbDir}"` keySig="${dbDir}/bed/idKeys/${db}.keySignature.txt" + idKeysDir=`dirname "${keySig}"` if [ -s "${keySig}" ]; then + idKeys="$idKeysDir/$db.idKeys.txt" + fullCount=`cat $idKeys | wc -l` + uniqueCount=`cut -f1 $idKeys | sort -u | wc -l` keySig=`cat "${keySig}"` - printf "%s\t%s\n" "${keySig}" "${db}" + printf "%s\t%s\t%d\t%d\n" "${keySig}" "${db}" "${fullCount}" "${uniqueCount}" fi done | sort -k1,1 > ucscDb.keySignatures.txt ##### Gather the NCBI Refseq keySignatures ###################### +##### There are two types of 'idKeys' in the NCBI assemblies +##### one at the top level: GCx/012/345/678/accessionId/idKeys/ +##### one at the build level: GCx/012/345/678/accessionId/trackData/idKeys/ +##### the top level one is for the 'original' assembly from NCBI, +##### the build level one is for the assembly hub which has duplicate +##### contigs removed. These are different key signatures when +##### duplicates have been removed. mkdir /hive/data/inside/assemblyEquivalence/refseq cd /hive/data/inside/assemblyEquivalence/refseq ### the maxdepth and mindepth seems to help speed up the find a lot, rather ### then letting run it without any limits: find /hive/data/genomes/asmHubs/refseqBuild/GCF \ -maxdepth 4 -mindepth 4 -type d | while read buildDir do asmId=`basename "${buildDir}"` - keySig="${buildDir}/trackData/idKeys/${asmId}.keySignature.txt" + keySig="${buildDir}/idKeys/${asmId}.keySignature.txt" + idKeysDir=`dirname "${keySig}"` if [ -s "${keySig}" ]; then + idKeys="$idKeysDir/$asmId.idKeys.txt" + fullCount=`cat $idKeys | wc -l` + uniqueCount=`cut -f1 $idKeys | sort -u | wc -l` keySig=`cat "${keySig}"` - printf "%s\t%s\n" "${keySig}" "${asmId}" + printf "%s\t%s\t%d\t%d\n" "${keySig}" "${asmId}" "${fullCount}" "${uniqueCount}" fi done | sort -k1,1 > refseq.keySignatures.txt ##### Gather the NCBI genbank keySignatures ###################### mkdir /hive/data/inside/assemblyEquivalence/genbank cd /hive/data/inside/assemblyEquivalence/genbank find /hive/data/genomes/asmHubs/genbankBuild/GCA \ -maxdepth 4 -mindepth 4 -type d | while read buildDir do asmId=`basename "${buildDir}"` - keySig="${buildDir}/trackData/idKeys/${asmId}.keySignature.txt" + keySig="${buildDir}/idKeys/${asmId}.keySignature.txt" + idKeysDir=`dirname "${keySig}"` if [ -s "${keySig}" ]; then + idKeys="$idKeysDir/$asmId.idKeys.txt" + fullCount=`cat $idKeys | wc -l` + uniqueCount=`cut -f1 $idKeys | sort -u | wc -l` keySig=`cat "${keySig}"` - printf "%s\t%s\n" "${keySig}" "${asmId}" + printf "%s\t%s\t%d\t%d\n" "${keySig}" "${asmId}" "${fullCount}" "${uniqueCount}" fi done | sort -k1,1 > genbank.keySignatures.txt ######################################################################## +### construct exact matches +######################################################################## + +cd /hive/data/inside/assemblyEquivalence + +~/kent/src/hg/makeDb/doc/assemblyEquivalence/exact.sh + +## this exact.sh script constructs the files: +## ensembl.genbank.exact.txt genbank.refseq.exact.txt refseq.ucsc.exact.txt +## ensembl.refseq.exact.txt genbank.ucsc.exact.txt ucsc.ensembl.exact.txt +## ensembl.ucsc.exact.txt refseq.ensembl.exact.txt ucsc.genbank.exact.txt +## genbank.ensembl.exact.txt refseq.genbank.exact.txt ucsc.refseq.exact.txt + +~/kent/src/hg/makeDb/doc/assemblyEquivalence/exactTableTsv.pl \ + | sort > table.2020-03-20.tsv + +# for a first peek at the functioning of this table, load these +# exact matches: +hgLoadSqlTab hgFixed asmEquivalent \ + ~/kent/src/hg/lib/asmEquivalent.sql table.2020-03-20.tsv + +# what does this look like: +hgsql -e 'desc asmEquivalent' hgFixed +# Field Type Null Key Default Extra +# source varchar(255) NO MUL NULL +# destination varchar(255) NO MUL NULL +# sourceAuthority enum('ensembl','ucsc','genbank','refseq') NO NULL +# destinationAuthority enum('ensembl','ucsc','genbank','refseq') NO NULL +# matchCount int(10) unsigned NO NULL +# sourceCount int(10) unsigned NO NULL +# destinationCount int(10) unsigned NO NULL + +hgsql -e 'select count(*) from asmEquivalent' hgFixed +# +----------+ +# | count(*) | +# +----------+ +# | 858 | +# +----------+ + +hgsql -e 'show indexes from asmEquivalent' hgFixed + +# list of UCSC databases exactly equivalent to RefSeq assemblies +hgsql -e 'select destination from asmEquivalent where sourceAuthority="refseq" and destinationAuthority="ucsc";' hgFixed + +######################################################################## +### construct close matches +######################################################################## + +mkdir /hive/data/inside/assemblyEquivalence/notExact +cd /hive/data/inside/assemblyEquivalence/notExact +~/kent/src/hg/makeDb/doc/assemblyEquivalence/allByAll.sh +### the output of this script is summarized in ./results/ +### into two categories: match and notMatch. + +### This allByAll.sh script eliminates all the 'exact' matches as found +### above. For those assemblies remaining in each category that do not +### have an exact match: +### ensembl refseq genbank ucsc +### and then pairs them up according to how many unique sequences they +### have, where 'unique' means without duplicate contigs. If the number of +### unique sequences are close within +- 10 to each other, it runs a +### comparison procedure that checks how many of the sequences in each assembly +### match to each other. If the match is within %10 of the number of sequences, +### it calls that a 'match', anything outside %10 in common are +### called 'notMatch' + +### Finally, to get the lines that belong in the final asmEquivalent +### table in hgFixed: + +sed -e 's/\tcount A .*//;' results/match.*.txt > notExact.table.2020-03-26.tsv + +To load up both the exact and these notExact matches into the hgFixed table: + +sort notExact.table.2020-03-26.tsv ../table.2020-03-26.tsv \ + | hgLoadSqlTab hgFixed asmEquivalent ~/kent/src/hg/lib/asmEquivalent.sql stdin + +hgsql -e 'select count(*) from asmEquivalent;' hgFixed ++----------+ +| count(*) | ++----------+ +| 2036 | ++----------+ + +# To see some of the highest differences, sort by column 1 and by column 2 +# and check head or tail of such sorts, there shouldn't be any '0 0' columns +# as that would be an 'exact' match and that has already been ruled +# out: (this could be done with the loaded table too) + +awk -F$'\t' '{printf "%4d %4d\t%s\t%s\n", $5-$7,$6-$7,$1,$2}' notExact.table.2020-03-26.tsv | sort -k1,1n | head +-1473 0 GCA_000768395.1_Cpor_2.0 croPor0 +-1473 0 croPor0 GCA_000768395.1_Cpor_2.0 +-756 -2 GCA_000708225.1_ASM70822v1 nipNip0 +-754 2 nipNip0 GCA_000708225.1_ASM70822v1 +-454 -1 GCA_000231765.1_GadMor_May2010 gadMor1 +-453 1 gadMor1 GCA_000231765.1_GadMor_May2010 +-428 -1 GCA_003265705.1_ASM326570v1 GCF_003265705.1_ASM326570v1 +-427 1 GCF_003265705.1_ASM326570v1 GCA_003265705.1_ASM326570v1 +-412 -1 GCA_003584765.1_ASM358476v1 GCF_003584765.1_ASM358476v1 +-411 1 GCF_003584765.1_ASM358476v1 GCA_003584765.1_ASM358476v1 + +awk -F$'\t' '{printf "%4d %4d\t%s\t%s\n", $5-$7,$6-$7,$1,$2}' notExact.table.2020-03-26.tsv | sort -k2,2n | head +-141 -141 Otolemur_garnettii.OtoGar3 GCA_000181295.3_OtoGar3 +-141 -141 Otolemur_garnettii.OtoGar3 GCF_000181295.1_OtoGar3 +-141 -141 Otolemur_garnettii.OtoGar3 otoGar3 +-115 -113 vicPac2 GCA_000164845.3_Vicugna_pacos-2.0.2 +-109 -109 takFla1 GCA_000400755.1_version_1_of_Takifugu_flavidus_genome + -42 -42 Mustela_putorius_furo.MusPutFur1.0 GCF_000215625.1_MusPutFur1.0 + -42 -42 musFur1 GCF_000215625.1_MusPutFur1.0 + -41 -41 Mustela_putorius_furo.MusPutFur1.0 GCA_000215625.1_MusPutFur1.0 + -41 -41 musFur1 GCA_000215625.1_MusPutFur1.0 + -19 -19 gorGor4 GCA_000151905.3_gorGor4 + +awk -F$'\t' '{printf "%4d %4d\t%s\t%s\n", $5-$7,$6-$7,$1,$2}' notExact.table.2020-03-26.tsv | sort -k1,1n | tail + 0 16 Neolamprologus_brichardi.NeoBri1.0 neoBri1 + 0 19 GCA_000151905.3_gorGor4 gorGor4 + 0 41 GCA_000215625.1_MusPutFur1.0 Mustela_putorius_furo.MusPutFur1.0 + 0 41 GCA_000215625.1_MusPutFur1.0 musFur1 + 0 42 GCF_000215625.1_MusPutFur1.0 Mustela_putorius_furo.MusPutFur1.0 + 0 42 GCF_000215625.1_MusPutFur1.0 musFur1 + 0 109 GCA_000400755.1_version_1_of_Takifugu_flavidus_genome takFla1 + 0 141 GCA_000181295.3_OtoGar3 Otolemur_garnettii.OtoGar3 + 0 141 GCF_000181295.1_OtoGar3 Otolemur_garnettii.OtoGar3 + 0 141 otoGar3 Otolemur_garnettii.OtoGar3 + +awk -F$'\t' '{printf "%4d %4d\t%s\t%s\n", $5-$7,$6-$7,$1,$2}' notExact.table.2020-03-26.tsv | sort -k2,2n | tail + -1 19 Gorilla_gorilla.gorGor4 gorGor4 + 0 41 GCA_000215625.1_MusPutFur1.0 Mustela_putorius_furo.MusPutFur1.0 + 0 41 GCA_000215625.1_MusPutFur1.0 musFur1 + 0 42 GCF_000215625.1_MusPutFur1.0 Mustela_putorius_furo.MusPutFur1.0 + 0 42 GCF_000215625.1_MusPutFur1.0 musFur1 + 0 109 GCA_000400755.1_version_1_of_Takifugu_flavidus_genome takFla1 + -2 113 GCA_000164845.3_Vicugna_pacos-2.0.2 vicPac2 + 0 141 GCA_000181295.3_OtoGar3 Otolemur_garnettii.OtoGar3 + 0 141 GCF_000181295.1_OtoGar3 Otolemur_garnettii.OtoGar3 + 0 141 otoGar3 Otolemur_garnettii.OtoGar3 + +# This could be done with the loaded table too: + +hgsql -N -e 'select matchCount-sourceCount,matchCount-destinationCount,(matchCount-sourceCount)-(matchCount-destinationCount),source from asmEquivalent where matchCount-sourceCount!=0;' hgFixed | sort -k3,3n | tail +-1 -3 2 enhLutKen1 +-1 -3 2 manPen1 +-1 -3 2 serCan1 +-1 -3 2 serCan1 +-754 -756 2 GCA_000708225.1_ASM70822v1 +-1 -4 3 GCA_001541155.1_Algmis_Hirise_1.0 +-1 -9 8 xipMac1 +-1 -16 15 neoBri1 +-1 -20 19 gorGor4 +-2 -115 113 vicPac2 + +hgsql -N -e 'select matchCount-sourceCount,matchCount-destinationCount,(matchCount-sourceCount)-(matchCount-destinationCount),source from asmEquivalent where matchCount-sourceCount!=0;' hgFixed | sort -k3,3n | head +-141 0 -141 GCA_000181295.3_OtoGar3 +-141 0 -141 GCF_000181295.1_OtoGar3 +-141 0 -141 otoGar3 +-115 -2 -113 GCA_000164845.3_Vicugna_pacos-2.0.2 +-109 0 -109 GCA_000400755.1_version_1_of_Takifugu_flavidus_genome +-42 0 -42 GCF_000215625.1_MusPutFur1.0 +-42 0 -42 GCF_000215625.1_MusPutFur1.0 +-41 0 -41 GCA_000215625.1_MusPutFur1.0 +-41 0 -41 GCA_000215625.1_MusPutFur1.0 +-19 0 -19 GCA_000151905.3_gorGor4