caa7d37fe23e92494b9c694c2e91c8d98e3a846c hiram Thu Mar 19 12:51:40 2020 -0700 document the procedure refs #24547 diff --git src/hg/makeDb/doc/assemblyEquivalence/README.txt src/hg/makeDb/doc/assemblyEquivalence/README.txt new file mode 100644 index 0000000..b3bed6b --- /dev/null +++ src/hg/makeDb/doc/assemblyEquivalence/README.txt @@ -0,0 +1,147 @@ +############################################################################# +Procedure to determine equivalence between assemblies + +The process is to compare md5sums of each assembly to determine +exact matches. When there isn't an exact match, them compare +the md5sums of each sequence to determine how many match with +each other. With enough matches, a partial equivalence can be +established. There are sometimes small differences due to: + 1. different or non-existing chrM/MT sequence between assemblies + 2. duplicated sequences are usually eliminated in UCSC assemblies + 3. elimination of sequences under a size limit, for example, + do not use any sequences under 100 bases in length + +The md5sums are calculated with the automatic script: doIdKeys.pl +which only needs a 2bit file to perform the calculation. There are +two files created by this process: + 1. .idKeys.txt - individual md5sum of each sequence + 2. .keySignature.txt - md5sum of all the individual idKeys sums + This is a single md5sum for the whole assembly + +There are five different sources of sequence that will be compared: +1. UCSC database browser genomes +2. NCBI RefSeq assemblies +3. NCBI Genbank assemblies +4. Ensembl assemblies +XXX 5. UCSC assembly hub genomes - maybe, this is the same ID as RefSeq/Genbank + +Priority will go to exact matches. If there is an exact match, +the partial matches will not be checked. When there is no exact match, +the closest partial match will be used. + +############################################################################# +Working directory: + +/hive/data/inside/assemblyEquivalence/ + +############################################################################# + +Ensembl version 99 sequences were downloaded to: + + /hive/data/outside/ensembl/genomes/release-99/fasta/ + +The 'idKeys' signatures for these sequences were calculated in: + + /hive/data/outside/ensembl/genomes/release-99/idKeys/ + +A typical run of idKeys in one of those directories, for +example: + + cat /hive/data/outside/ensembl/genomes/release-99/idKeys/Vicugna_pacos/run.sh + +set -beEu -o pipefail +cd /hive/data/outside/ensembl/genomes/release-99/idKeys/Vicugna_pacos +doIdKeys.pl -buildDir=`pwd` -twoBit=`pwd`/Vicugna_pacos.vicPac1.2bit "Vicugna_pacos.vicPac1" + +All of these were set up with scripts to run them in parallel, +the 2bit files was constructed from the fasta, then this +doIdKeys.pl procedure was run which creates the two files of interest: + +# -rw-rw-r-- 1 14164089 Jan 21 21:58 Vicugna_pacos.vicPac1.idKeys.txt +# -rw-rw-r-- 1 33 Jan 21 21:58 Vicugna_pacos.vicPac1.keySignature.txt + +The keySignature.txt is a single md5sum of all the individual sequence +md5sums in the idKeys.txt file. For example: + +cat Vicugna_pacos.vicPac1.keySignature.txt +84f72c82109632aa712d85fe9d94d237 + +head Vicugna_pacos.vicPac1.idKeys.txt +000052eec7a560c419098fe75ce59ccd scaffold_62780 +00008c4565756b4429fabffa2e374b2c scaffold_15650 +00012aaa1678d5651863671f53c66e3e scaffold_138339 +00014ee45a4daceca84ba8d39c05fa93 scaffold_24830 +00016bdfe2b6c9cd9d3145bc59057443 scaffold_195552 +... etc ... + +##### Gather the Ensembl keySignatures #################################### +mkdir /hive/data/inside/assemblyEquivalence/Ensembl +cd /hive/data/inside/assemblyEquivalence/Ensembl + +# There is a single file with all the name correspondences that will be useful: + +time rsync -av --stats rsync://ftp.ensembl.org/ensembl/pub/release-99/species_EnsemblVertebrates.txt ./ + +real 0m1.644s + +find /hive/data/outside/ensembl/genomes/release-99/idKeys -type f \ + | grep keySignature.txt | while read K +do + id=`basename "${K}" | sed -e 's/.keySignature.txt//;'` + keySig=`cat "${K}"` + printf "%s\t%s\n" "${keySig}" "${id}" +done | sort -k1,1 > ens99.keySignatures.txt + +##### Gather the UCSC database genomes keySignatures ###################### + +mkdir /hive/data/inside/assemblyEquivalence/ucscDb +cd /hive/data/inside/assemblyEquivalence/ucscDb + +### Avoiding an expensive 'find' operation on the hive directories, just +### check for specific named files: + +ls -d /hive/data/genomes/* | while read dbDir +do + db=`basename "${dbDir}"` + keySig="${dbDir}/bed/idKeys/${db}.keySignature.txt" + if [ -s "${keySig}" ]; then + keySig=`cat "${keySig}"` + printf "%s\t%s\n" "${keySig}" "${db}" + fi +done | sort -k1,1 > ucscDb.keySignatures.txt + +##### Gather the NCBI Refseq keySignatures ###################### + +mkdir /hive/data/inside/assemblyEquivalence/refseq +cd /hive/data/inside/assemblyEquivalence/refseq + +### the maxdepth and mindepth seems to help speed up the find a lot, rather +### then letting run it without any limits: +find /hive/data/genomes/asmHubs/refseqBuild/GCF \ + -maxdepth 4 -mindepth 4 -type d | while read buildDir +do + asmId=`basename "${buildDir}"` + keySig="${buildDir}/trackData/idKeys/${asmId}.keySignature.txt" + if [ -s "${keySig}" ]; then + keySig=`cat "${keySig}"` + printf "%s\t%s\n" "${keySig}" "${asmId}" + fi +done | sort -k1,1 > refseq.keySignatures.txt + +##### Gather the NCBI genbank keySignatures ###################### + +mkdir /hive/data/inside/assemblyEquivalence/genbank +cd /hive/data/inside/assemblyEquivalence/genbank + +find /hive/data/genomes/asmHubs/genbankBuild/GCA \ + -maxdepth 4 -mindepth 4 -type d | while read buildDir +do + asmId=`basename "${buildDir}"` + keySig="${buildDir}/trackData/idKeys/${asmId}.keySignature.txt" + if [ -s "${keySig}" ]; then + keySig=`cat "${keySig}"` + printf "%s\t%s\n" "${keySig}" "${asmId}" + fi +done | sort -k1,1 > genbank.keySignatures.txt + +########################################################################