03e6d9865fa98c05dda6bbf5d780b0a2b41d389c
angie
Tue Feb 22 11:11:08 2022 -0800
Remove B.1.1.529 from lineageTree.pb to match pangoLEARN behavior & avoid applying Omicron label to sequences of very poor quality -- requested by Rachel Colquhoun & Andrew Rambaut.
diff --git src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh
index a7ae840e..23a3144 100755
--- src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh
+++ src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh
@@ -23,36 +23,37 @@
 today=$(date +%F)
 cd $ottoDir/$buildDate
 # Get node ID for root of lineage A, used as reference/root by Pangolin:
 if [ ! -s clade-paths ]; then
     $matUtils extract -i gisaidAndPublic.$buildDate.masked.pb -C clade-paths
 fi
 lineageARoot=$(grep ^A$'\t' clade-paths | cut -f 2)
 # Reroot protobuf to lineage A:
 $matUtils extract -i gisaidAndPublic.$buildDate.masked.pb \
     --reroot $lineageARoot \
     -o gisaidAndPublic.$buildDate.masked.reroot.pb
-# Reroot pango.clade-mutations.tsv:
+# Reroot pango.clade-mutations.tsv; also remove B.1.1.529 (Rachel's request):
 grep -w ^A $scriptDir/pango.clade-mutations.tsv \
 | sed -re 's/T28144C( > )?//; s/C8782T( > )?//;' \
     > pango.clade-mutations.reroot.tsv
 grep -vw ^A $scriptDir/pango.clade-mutations.tsv \
 | sed -re 's/\t/\tT8782C > C28144T > /;' \
+| grep -vF B.1.1.529 \
     >> pango.clade-mutations.reroot.tsv
 # Mask additional bases at the beginning and end of the genome that pangolin masks after
 # aligning input sequences.
 for ((i=56; $i <= 265; i++)); do
     echo -e "N${i}N"
 done > maskPangoEnds
 for ((i=29674; $i < 29804; i++)); do
     echo -e "N${i}N"
 done >> maskPangoEnds
 $matUtils mask -i gisaidAndPublic.$buildDate.masked.reroot.pb \
     -m maskPangoEnds -o gisaidAndPublic.$buildDate.masked.reroot.pangoMasked.pb
 # Assign updated lineages on the rerooted & pango-masked tree, pango-only for pangolin:
 time $matUtils annotate -T 50 \