00842797b9fda733465b9bf732648a492d133cac angie Wed Jan 12 10:00:10 2022 -0800 Also mask 21302, 21304 and 21305 in Delta. Mask convergent deletion of 11291-11296 plus 3 bases to the left in Alpha, Beta and Gamma to prevent spurious 'SNVs' at deletion site from messing up the tree; may need to add this for Omicron eventually. diff --git src/hg/utils/otto/sarscov2phylo/maskDelta.sh src/hg/utils/otto/sarscov2phylo/maskDelta.sh index 30cba66..251119d 100755 --- src/hg/utils/otto/sarscov2phylo/maskDelta.sh +++ src/hg/utils/otto/sarscov2phylo/maskDelta.sh @@ -20,33 +20,59 @@ usherDir=~angie/github/usher matUtils=$usherDir/build/matUtils matOptimize=$usherDir/build/matOptimize # I wish there were a less hacky method to identify the node for Delta, but since mutation # paths can change as new samples are added, this is the most stable method I have at the moment: # make sample-paths, grep for basal sample IND/GBRC714b/2021 (USA/WI-CDC-FG-038252/2021 would also # work in case that one goes away for any reason), use the final node in path. samplePaths=$treeInPb.sample-paths $matUtils extract -i $treeInPb -S $samplePaths deltaNode=$(grep IND/GBRC714b/2021 $samplePaths | awk '{print $NF;}' | sed -re 's/:.*//;') # Delta has deletions at S:157-158 (22029-22034), ORF8:119-120 (28248-28253) and 28271. # Mask those locations and some adjacent bases where we get a ton of spurious "mutations". -maskFile=$(basename $treeInPb .pb).maskForDelta.tsv +maskFile=$(basename $treeInPb .pb).branchSpecificMask.tsv set +x for ((i=22027; $i <= 22034; i++)); do echo -e "N${i}N\t$deltaNode" done > $maskFile for ((i=28246; $i <= 28253; i++)); do echo -e "N${i}N\t$deltaNode" done >> $maskFile echo -e "N28271N\t$deltaNode" >> $maskFile set -x # S:95 (21846) is also very unreliably detected in Delta. Mask it off to avoid tree trouble, # like split AY.100. 
echo -e "N21846N\t$deltaNode" >> $maskFile +# These three sites are recommended for caution in the Problematic Sites set, and seem to have +# created a false lineage (AY.89) from samples that probably should be AY.4. AY.89 is being +# withdrawn (https://github.com/cov-lineages/pango-designation/issues/398); mask sites in Delta. +echo -e "N21302N\t$deltaNode" >> $maskFile +echo -e "N21304N\t$deltaNode" >> $maskFile +echo -e "N21305N\t$deltaNode" >> $maskFile + +# OK, not just Delta -- Alpha, Beta, and Gamma have a deletion that causes spurious "mutations", +# especially at 11296 and 11291, somewhat also at 11288. +# Omicron has ~ the same deletion but it aligns 5 bases to the left, probably because it was +# combined with an SNV (https://github.com/cov-lineages/pango-designation/issues/361). +alphaNode=$(grep Italy/TAA-1900553896/2021 $samplePaths \ + | awk '{print $(NF-1);}' | sed -re 's/:.*//;') +betaNode=$(grep SouthAfrica/CERI-KRISP-K012031/2021 $samplePaths \ + | awk '{print $NF;}' | sed -re 's/:.*//;') +gammaNode=$(grep France/PAC-IHU-5193-N1/2021 $samplePaths | awk '{print $NF;}' | sed -re 's/:.*//;') +set +x +for node in $alphaNode $betaNode $gammaNode; do + for ((i=11288; $i <= 11296; i++)); do + echo -e "N${i}N\t$node" + done +done >> $maskFile +set -x + time $matUtils mask -i $treeInPb \ -m $maskFile \ -o $treeOutPb + +rm $samplePaths