d1ab093492f89b1fe6130371898d1ac2d47ee85d angie Mon Nov 29 15:56:05 2021 -0800 Reroot to lineage A internal node, not leaf sequence. Mask entire untranslated beginning & end of genome like Pangolin does. diff --git src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh index 36125ad..a7ae840e 100755 --- src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh +++ src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh @@ -12,59 +12,75 @@ if [ $# != 1 ]; then usage exit 1 fi buildDate=$1 ottoDir=/hive/data/outside/otto/sarscov2phylo usherDir=~angie/github/usher matUtils=$usherDir/build/matUtils today=$(date +%F) cd $ottoDir/$buildDate -# Get full sample name for lineage A sequence used as reference/root by Pangolin: -wh4SampleName=$(grep LR757995 samples.$buildDate) +# Get node ID for root of lineage A, used as reference/root by Pangolin: +if [ ! -s clade-paths ]; then + $matUtils extract -i gisaidAndPublic.$buildDate.masked.pb -C clade-paths +fi +lineageARoot=$(grep ^A$'\t' clade-paths | cut -f 2) -# Reroot protobuf: +# Reroot protobuf to lineage A: $matUtils extract -i gisaidAndPublic.$buildDate.masked.pb \ - --reroot "$wh4SampleName" \ + --reroot $lineageARoot \ -o gisaidAndPublic.$buildDate.masked.reroot.pb # Reroot pango.clade-mutations.tsv: grep -w ^A $scriptDir/pango.clade-mutations.tsv \ | sed -re 's/T28144C( > )?//; s/C8782T( > )?//;' \ > pango.clade-mutations.reroot.tsv grep -vw ^A $scriptDir/pango.clade-mutations.tsv \ | sed -re 's/\t/\tT8782C > C28144T > /;' \ >> pango.clade-mutations.reroot.tsv -# Assign updated lineages on the rerooted tree, pango-only for pangolin: + +# Mask additional bases at the beginning and end of the genome that pangolin masks after +# aligning input sequences. +for ((i=56; $i <= 265; i++)); do + echo -e "N${i}N" +done > maskPangoEnds +for ((i=29674; $i < 29804; i++)); do + echo -e "N${i}N" +done >> maskPangoEnds +$matUtils mask -i gisaidAndPublic.$buildDate.masked.reroot.pb \ + -m maskPangoEnds -o gisaidAndPublic.$buildDate.masked.reroot.pangoMasked.pb + +# Assign updated lineages on the rerooted & pango-masked tree, pango-only for pangolin: time $matUtils annotate -T 50 \ - -i gisaidAndPublic.$buildDate.masked.reroot.pb \ + -i gisaidAndPublic.$buildDate.masked.reroot.pangoMasked.pb \ -M pango.clade-mutations.reroot.tsv \ -l \ -c lineageToName \ -f 0.95 \ -u mutations.pangoOnly \ -D details.pangoOnly \ -o gisaidAndPublic.$buildDate.masked.reroot.pangoOnly.pb \ >& annotate.pangoOnly.out set +o pipefail grep 'Could not' annotate.pangoOnly.out | cat +grep skip annotate.pangoOnly.out | cat set -o pipefail # Make a bunch of smaller trees and see how they do. mkdir -p /hive/users/angie/lineageTreeUpdate.$today cd /hive/users/angie/lineageTreeUpdate.$today for i in 0 1 2 3 4 5 6 7 8 9; do echo test.50.$i $matUtils extract -i $ottoDir/$buildDate/gisaidAndPublic.$buildDate.masked.reroot.pangoOnly.pb \ -r 50 -o test.50.$i.pb $matUtils mask -i test.50.$i.pb -S -o test.50.$i.simp.pb done # Run pangolin on each subtree # 7.5-14.5m each job, ~3.5hrs total: conda activate pangolin