3bdcfd94b4905ad578c1ae99c9bb47fbb937893c
angie
Sat Mar 12 20:35:04 2022 -0800
Mask out deleted sites in BA.1 and BA.2

diff --git src/hg/utils/otto/sarscov2phylo/maskDelta.sh src/hg/utils/otto/sarscov2phylo/maskDelta.sh
index 3bc7072..2393232 100755
--- src/hg/utils/otto/sarscov2phylo/maskDelta.sh
+++ src/hg/utils/otto/sarscov2phylo/maskDelta.sh
@@ -47,37 +47,77 @@
 # like split AY.100.
 echo -e "N21846N\t$deltaNode" >> $maskFile
 
 # These three sites are recommended for caution in the Problematic Sites set, and seem to have
 # create a false lineage (AY.89) from samples that probably should be AY.4. AY.89 is being
 # withdrawn (https://github.com/cov-lineages/pango-designation/issues/398); mask sites in Delta.
 echo -e "N21302N\t$deltaNode" >> $maskFile
 echo -e "N21304N\t$deltaNode" >> $maskFile
 echo -e "N21305N\t$deltaNode" >> $maskFile
 
 # Mask flaky positions 28254 (ORF8:121) and 28461 (N:63) so that AY.96 is merged into AY.46
 # https://github.com/cov-lineages/pango-designation/issues/435
 echo -e "N28254N\t$deltaNode" >> $maskFile
 echo -e "N28461N\t$deltaNode" >> $maskFile
 
-# OK, not just Delta -- Alpha, Beta, and Gamma have a deletion that causes spurious "mutations",
+# OK, not just Delta -- Alpha, Beta, Gamma and BA.2 have a deletion that causes spurious "mutations",
 # especially at 11296 and 11291, somewhat also at 11288.
-# Omicron has ~ the same deletion but it aligns 5 bases to the left, probably because it was
-# combined with an SNV (https://github.com/cov-lineages/pango-designation/issues/361).
 alphaNode=$(grep Italy/TAA-1900553896/2021 $samplePaths \
     | awk '{print $(NF-1);}' | sed -re 's/:.*//;')
 betaNode=$(grep SouthAfrica/CERI-KRISP-K012031/2021 $samplePaths \
     | awk '{print $NF;}' | sed -re 's/:.*//;')
 gammaNode=$(grep France/PAC-IHU-5193-N1/2021 $samplePaths | awk '{print $NF;}' | sed -re 's/:.*//;')
+BA2Node=$(grep Germany/RP-RKI-I-517345/2022 $samplePaths \
+    | awk '{print $(NF-1);}' | sed -re 's/:.*//;')
 set +x
-for node in $alphaNode $betaNode $gammaNode; do
+for node in $alphaNode $betaNode $gammaNode $BA2Node; do
     for ((i=11288; $i <= 11296; i++)); do
         echo -e "N${i}N\t$node"
     done
 done >> $maskFile
 set -x
 
+# BA.1 has almost the same deletion but it aligns 5 bases to the left, probably because it was
+# combined with an SNV (https://github.com/cov-lineages/pango-designation/issues/361).
+BA1Node=$(grep England/DHSC-CYBJ4Y8/2022 $samplePaths \
+    | awk '{print $(NF-1);}' | sed -re 's/:.*//;')
+set +x
+for ((i=11283; $i <= 11291; i++)); do
+    echo -e "N${i}N\t$BA1Node"
+done >> $maskFile
+# BA.1 has several other deletions that cause the same problem.
+for ((i=6513; $i <= 6515; i++)); do
+    echo -e "N${i}N\t$BA1Node"
+done >> $maskFile
+for ((i=21765; $i <= 21770; i++)); do
+    echo -e "N${i}N\t$BA1Node"
+done >> $maskFile
+# There's a deletion 21987-21995 and then an insertion after 22204 and more messy bases after that.
+for ((i=21988; $i <= 22217; i++)); do
+    echo -e "N${i}N\t$BA1Node"
+done >> $maskFile
+# NOTE(review): 22194-22198 is already inside 21988-22217 above; this loop only emits duplicates.
+for ((i=22194; $i <= 22198; i++)); do
+    echo -e "N${i}N\t$BA1Node"
+done >> $maskFile
+for ((i=28362; $i <= 28370; i++)); do
+    echo -e "N${i}N\t$BA1Node"
+done >> $maskFile
+
+# BA.2 has some additional deletions.
+# These are masked at $BA2Node; 28362-28370 is masked for BA.1 separately, above.
+for ((i=21633; $i <= 21641; i++)); do
+    echo -e "N${i}N\t$BA2Node"
+done >> $maskFile
+for ((i=28362; $i <= 28370; i++)); do
+    echo -e "N${i}N\t$BA2Node"
+done >> $maskFile
+for ((i=29734; $i <= 29759; i++)); do
+    echo -e "N${i}N\t$BA2Node"
+done >> $maskFile
+set -x
+
 time $matUtils mask -i $treeInPb \
     -m $maskFile \
     -o $treeOutPb
 
 rm $samplePaths