cabd81a0d3b5aecaa7ebc0f7ed86dd541c5330f4 angie Mon Nov 29 16:13:42 2021 -0800 Mask deleted bases (and adjacent bases) in the Delta branch to avoid bogus reported SNVs at those locations from erroneous genome assembly pipelines. diff --git src/hg/utils/otto/sarscov2phylo/maskDelta.sh src/hg/utils/otto/sarscov2phylo/maskDelta.sh new file mode 100755 index 0000000..6d767e5 --- /dev/null +++ src/hg/utils/otto/sarscov2phylo/maskDelta.sh @@ -0,0 +1,48 @@ +#!/bin/bash +set -beEu -x -o pipefail + +# Do not modify this script, modify the source tree copy: +# kent/src/hg/utils/otto/sarscov2phylo/maskDelta.sh + +usage() { + echo "usage: $0 treeIn.pb treeOut.pb" +} + +if [ $# != 2 ]; then + usage + exit 1 +fi + +treeInPb=$1 +treeOutPb=$2 + +ottoDir=/hive/data/outside/otto/sarscov2phylo + +usherDir=~angie/github/usher +matUtils=$usherDir/build/matUtils +matOptimize=$usherDir/build/matOptimize + +# I wish there were a less hacky method to identify the node for Delta, but since mutation +# paths can change as new samples are added, this is the most stable method I have at the moment: +# make sample-paths, grep for basal sample IND/GBRC714b/2021 (USA/WI-CDC-FG-038252/2021 would also +# work in case that one goes away for any reason), use the final node in path. +samplePaths=$treeInPb.sample-paths +$matUtils extract -i $treeInPb -S $samplePaths +deltaNode=$(grep IND/GBRC714b/2021 $samplePaths | awk '{print $NF;}' | sed -re 's/:.*//;') + +# Delta has deletions at S:157-158 (22029-22034), ORF8:119-120 (28248-28253) and 28271. +# Mask those locations and some adjacent bases where we get a ton of spurious "mutations". +maskFile=$(basename $treeInPb .pb).maskForDelta.tsv +set +x +for ((i=22027; $i <= 22034; i++)); do + echo -e "N${i}N\t$deltaNode" +done > $maskFile +for ((i=28246; $i <= 28253; i++)); do + echo -e "N${i}N\t$deltaNode" +done >> $maskFile +echo -e "N28271N\t$deltaNode" >> $maskFile +set -x + +time $matUtils mask -i $treeInPb \ + -m $maskFile \ + -o $treeOutPb