c3ac1c095777767dff7ad31b94d4e71318a6db67
angie
  Tue Feb 22 10:55:09 2022 -0800
Use new scripts findDropoutContam.pl and findRefBackfill.pl to identify problematic sequences using nextclade annotations.  Exclude those sequences (with an exception for reported recombinants, via new file includeRecombinants.tsv) from the tree.

diff --git src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
index c85b2b0..02de702 100755
--- src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
+++ src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
@@ -151,70 +151,95 @@
 fi
 
 # OK, now that the tree names are updated, figure out which seqs are already in there and
 # which need to be added.
 awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
 | grep -E '^[A-Z]{2}[0-9]{6}\.[0-9]' > prevGbAcc
 grep -E '^(England|Northern|Scotland|Wales)' prevNames \
 | cut -d\| -f1 > prevCogUk
 awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
 | grep -E '^EPI_ISL_' > prevGisaid
 # Add public sequences that have been mapped to GISAID sequences to prevGisaid.
 grep -Fwf prevGbAcc $epiToPublic | cut -f 1 >> prevGisaid
 grep -Fwf prevCogUk $epiToPublic | cut -f 1 >> prevGisaid
 wc -l prev*
 
+# Exclude some sequences based on nextclade counts of reversions and other-clade mutations.
+zcat $gisaidDir/chunks/nextclade.full.tsv.gz \
+| $scriptDir/findDropoutContam.pl > gisaid.dropoutContam
+zcat $ncbiDir/nextclade.full.tsv.gz \
+| $scriptDir/findDropoutContam.pl > gb.dropoutContam
+zcat $cogUkDir/nextclade.full.tsv.gz \
+| $scriptDir/findDropoutContam.pl > cog.dropoutContam
+cut -f 1 *.dropoutContam \
+| awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \
+    > dropoutContam.ids
+# Also exclude sequences with unbelievably low numbers of mutations given sampling dates.
+zcat $gisaidDir/chunks/nextclade.full.tsv.gz | cut -f 1,5 \
+| awk -F\| '{ if ($3 == "") { print $1 "\t" $2; } else { print $2 "\t" $3; } }' \
+| $scriptDir/findRefBackfill.pl > gisaid.refBackfill
+zcat $ncbiDir/nextclade.full.tsv.gz | cut -f 1,5 | sort \
+| join -t $'\t' <(cut -f 1,3 $ncbiDir/ncbi_dataset.plusBioSample.tsv | sort) - \
+| $scriptDir/findRefBackfill.pl > gb.refBackfill
+zcat $cogUkDir/nextclade.full.tsv.gz | cut -f 1,5 | sort \
+| join -t $'\t' <(cut -d, -f 1,5 $cogUkDir/cog_metadata.csv | tr , $'\t' | sort) - \
+| $scriptDir/findRefBackfill.pl > cog.refBackfill
+cut -f 1 *.refBackfill > refBackfill.ids
+sort -u ../tooManyEpps.ids ../badBranchSeed.ids dropoutContam.ids refBackfill.ids \
+| grep -vFwf <(tail -n+4 $scriptDir/includeRecombinants.tsv | cut -f 1) \
+    > exclude.ids
+
 # Get new GenBank sequences with at least $minReal non-N bases.
 # Exclude seqs in the tree with EPI IDs that that have been mapped in the very latest $epiToPublic.
 set +o pipefail
 egrep $'\t''[A-Z][A-Z][0-9]{6}\.[0-9]+' $epiToPublic \
 | grep -Fwf prevGisaid - \
 | grep -vFwf prevGbAcc \
 | cat \
     >> prevGbAcc
 set -o pipefail
 xzcat $ncbiDir/genbank.fa.xz \
-| faSomeRecords -exclude stdin <(cat prevGbAcc ../tooManyEpps.ids ../badBranchSeed.ids) newGenBank.fa
+| faSomeRecords -exclude stdin <(cat prevGbAcc exclude.ids) newGenBank.fa
 faSize -veryDetailed newGenBank.fa \
 | tawk '$4 < '$minReal' {print $1;}' \
     > gbTooSmall
 # NCBI also includes NC_045512 in the download, but that's our reference, so... exclude that too.
 set +o pipefail
 fastaNames newGenBank.fa | grep NC_045512 >> gbTooSmall
 set -o pipefail
 faSomeRecords -exclude newGenBank.fa gbTooSmall newGenBank.filtered.fa
 faSize newGenBank.filtered.fa
 
 # Get new COG-UK sequences with at least $minReal non-N bases.
 # Also exclude cog_all.fasta sequences not found in cog_metadata.csv.
 comm -23 <(fastaNames $cogUkDir/cog_all.fasta.xz | sort) \
     <(cut -d, -f1 $cogUkDir/cog_metadata.csv | sort) \
     > cogFaNotMeta
 # Also exclude COG-UK sequences that have been added to GenBank (cogUkInGenBank, see above).
 xzcat $cogUkDir/cog_all.fasta.xz \
 | faSomeRecords -exclude stdin \
-    <(cat prevCogUk cogFaNotMeta cogUkInGenBank ../tooManyEpps.ids ../badBranchSeed.ids) newCogUk.fa
+    <(cat prevCogUk cogFaNotMeta cogUkInGenBank exclude.ids) newCogUk.fa
 faSize -veryDetailed newCogUk.fa \
 | tawk '$4 < '$minReal' {print $1;}' \
     > cogUkTooSmall
 faSomeRecords -exclude newCogUk.fa cogUkTooSmall newCogUk.filtered.fa
 faSize newCogUk.filtered.fa
 
 # Get new GISAID sequences with at least $minReal non-N bases.
 xzcat $gisaidDir/gisaid_fullNames_$today.fa.xz \
 | sed -re 's/^>.*\|(EPI_ISL_[0-9]+)\|.*/>\1/' \
-| faSomeRecords -exclude stdin <(cat prevGisaid ../tooManyEpps.ids ../badBranchSeed.ids) newGisaid.fa
+| faSomeRecords -exclude stdin <(cat prevGisaid exclude.ids) newGisaid.fa
 faSize -veryDetailed newGisaid.fa \
 | tawk '$4 < '$minReal' {print $1;}' \
     > gisaidTooSmall
 faSomeRecords -exclude newGisaid.fa gisaidTooSmall newGisaid.filtered.fa
 faSize newGisaid.filtered.fa
 
 # Exclude public-mapped sequences from newGisaid:
 set +o pipefail
 fastaNames newGisaid.filtered.fa \
 | grep -Fwf - $epiToPublic \
 | cut -f 1 \
     > newGisaid.public.names
 set -o pipefail
 if [ -s newGisaid.public.names ]; then
     faSomeRecords -exclude newGisaid.filtered.fa newGisaid.public.names tmp
@@ -270,22 +295,22 @@
     set -o pipefail
 fi
 if [ -s newGisaid.filtered.fa ]; then
     zcat $gisaidDir/metadata_batch_$today.tsv.gz \
     | grep -Fwf <(fastaNames newGisaid.filtered.fa) \
     | tawk '{print $3 "\t" $1 "|" $3 "|" $5;}' \
         >> $renaming
 fi
 wc -l $renaming
 
 # Make masked VCF
 tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \
     $problematicSitesVcf > mask.vcf
 time cat <(twoBitToFa $ref2bit stdout) $alignedFa \
 | faToVcf -maxDiff=1000 \
-    -excludeFile=<(cat ../tooManyEpps.ids ../badBranchSeed.ids) \
+    -excludeFile=exclude.ids \
     -verbose=2 stdin stdout \
 | vcfRenameAndPrune stdin $renaming stdout \
 | vcfFilter -excludeVcf=mask.vcf stdin \
 | gzip -c \
     > new.masked.vcf.gz