e931fd3e5fdab83eeb110b2450a0f365eff170e2 angie Mon Oct 4 17:04:17 2021 -0700 Omit ORF1a from taxodium amino acid annotations (redundant with ORF1ab). Add optional baseProtobuf arg to updateCombinedTree.sh / makeNewMaskedVcf.sh to start with a custom protobuf (e.g. optimized) instead of eyesterday's protobuf. diff --git src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh index bb426e6..3791daa 100755 --- src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh +++ src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh @@ -1,60 +1,64 @@ #!/bin/bash set -beEu -x -o pipefail # Do not modify this script, modify the source tree copy: # kent/src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh usage() { - echo "usage: $0 prevDate today problematicSitesVcf" + echo "usage: $0 prevDate today problematicSitesVcf [baseProtobuf]" echo "This assumes that ncbi.latest and cogUk.latest links/directories have been updated." } -if [ $# != 3 ]; then +if [ $# != 3 && $# != 4]; then usage exit 1 fi prevDate=$1 today=$2 problematicSitesVcf=$3 +baseProtobuf=$4 ottoDir=/hive/data/outside/otto/sarscov2phylo ncbiDir=$ottoDir/ncbi.latest cogUkDir=$ottoDir/cogUk.latest cncbDir=$ottoDir/cncb.latest gisaidDir=/hive/users/angie/gisaid minReal=20000 ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit epiToPublic=$gisaidDir/epiToPublicAndDate.latest scriptDir=$(dirname "${BASH_SOURCE[0]}") source $scriptDir/util.sh mkdir -p $ottoDir/$today cd $ottoDir/$today prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.pb -prevMeta=$ottoDir/$prevDate/public-$prevDate.metadata.tsv.gz usherDir=~angie/github/usher usher=$usherDir/build/usher matUtils=$usherDir/build/matUtils renaming=oldAndNewNames +if [ "$baseProtobuf" == "" ]; then + baseProtobuf=$prevProtobufMasked +fi + # Make lists of sequences already in the tree. -$matUtils extract -i $prevProtobufMasked -u prevNames +$matUtils extract -i $baseProtobuf -u prevNames # Before updating the tree with new sequences, update the names used in the tree: # * Sequences that are already in the tree with EPI_ IDs, but that have been mapped to public IDs # * COG-UK sequences that are in GenBank. Per Sam Nicholls the isolate alone is enough to identify # them -- no need to match country & year which are sometimes incorrect at first. grep COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \ | tawk '{print $6, $4 "/" $6 "/" $3 "|" $1 "|" $3;}' \ | sed -re 's@COG-UK/@@g; s/United Kingdom://; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s/ //g;' \ | sort \ > cogUkInGenBankIsolateToNewName fastaNames $cogUkDir/cog_all.fasta.xz > cogUk.names grep -Fwf <(cut -f 1 cogUkInGenBankIsolateToNewName) cogUk.names \ > cogUkInGenBank set +o pipefail # From 2021-05-04 on there should be no more unrenamed COG-UK/ in prevNames, but doesn't hurt @@ -117,34 +121,34 @@ # And finally, sometimes there are duplicates due to country or date being corrected in COG-UK # metadata. grep -vFwf dupsToRemove prevTree.renaming \ | cut -f 2 | sort | uniq -c | awk '$1 > 1 {print $2;}' \ | cut -d/ -f 2 \ > cogUkIsolateStillDup grep -Fwf cogUkIsolateStillDup $cogUkDir/cog_metadata.csv \ | awk -F, '{print $1 "|" $5;}' \ > cogDupsLatestMetadata grep -Fwf cogUkIsolateStillDup prevNames \ | grep -vFwf dupsToRemove \ | grep -vFwf cogDupsLatestMetadata \ | cat \ >> dupsToRemove set -o pipefail -startingProtobuf=$prevProtobufMasked +startingProtobuf=$baseProtobuf if [ -s dupsToRemove ]; then startingProtobuf=prevDupTrimmed.pb - $matUtils extract -i $prevProtobufMasked -p -s dupsToRemove -o $startingProtobuf + $matUtils extract -i $baseProtobuf -p -s dupsToRemove -o $startingProtobuf fi if [ -s prevTree.renaming ]; then $matUtils mask -r prevTree.renaming -i $startingProtobuf -o prevRenamed.pb >& rename.out $matUtils extract -i prevRenamed.pb -u prevNames else ln -sf $startingProtobuf prevRenamed.pb fi # OK, now that the tree names are updated, figure out which seqs are already in there and # which need to be added. awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \ | grep -E '^[A-Z]{2}[0-9]{6}\.[0-9]' > prevGbAcc grep -E '^(England|Northern|Scotland|Wales)' prevNames \ | cut -d\| -f1 > prevCogUk