c3ac1c095777767dff7ad31b94d4e71318a6db67 angie Tue Feb 22 10:55:09 2022 -0800 Use new scripts findDropoutContam.pl and findRefBackfill.pl to identify problematic sequences using nextclade annotations. Exclude those sequences (with an exception for reported recombinants, via new file includeRecombinants.tsv) from the tree. diff --git src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh index c85b2b0..02de702 100755 --- src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh +++ src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh @@ -151,70 +151,95 @@ fi # OK, now that the tree names are updated, figure out which seqs are already in there and # which need to be added. awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \ | grep -E '^[A-Z]{2}[0-9]{6}\.[0-9]' > prevGbAcc grep -E '^(England|Northern|Scotland|Wales)' prevNames \ | cut -d\| -f1 > prevCogUk awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \ | grep -E '^EPI_ISL_' > prevGisaid # Add public sequences that have been mapped to GISAID sequences to prevGisaid. grep -Fwf prevGbAcc $epiToPublic | cut -f 1 >> prevGisaid grep -Fwf prevCogUk $epiToPublic | cut -f 1 >> prevGisaid wc -l prev* +# Exclude some sequences based on nextclade counts of reversions and other-clade mutations. +zcat $gisaidDir/chunks/nextclade.full.tsv.gz \ +| $scriptDir/findDropoutContam.pl > gisaid.dropoutContam +zcat $ncbiDir/nextclade.full.tsv.gz \ +| $scriptDir/findDropoutContam.pl > gb.dropoutContam +zcat $cogUkDir/nextclade.full.tsv.gz \ +| $scriptDir/findDropoutContam.pl > cog.dropoutContam +cut -f 1 *.dropoutContam \ +| awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \ + > dropoutContam.ids +# Also exclude sequences with unbelievably low numbers of mutations given sampling dates. +zcat $gisaidDir/chunks/nextclade.full.tsv.gz | cut -f 1,5 \ +| awk -F\| '{ if ($3 == "") { print $1 "\t" $2; } else { print $2 "\t" $3; } }' \ +| $scriptDir/findRefBackfill.pl > gisaid.refBackfill +zcat $ncbiDir/nextclade.full.tsv.gz | cut -f 1,5 | sort \ +| join -t $'\t' <(cut -f 1,3 $ncbiDir/ncbi_dataset.plusBioSample.tsv | sort) - \ +| $scriptDir/findRefBackfill.pl > gb.refBackfill +zcat $cogUkDir/nextclade.full.tsv.gz | cut -f 1,5 | sort \ +| join -t $'\t' <(cut -d, -f 1,5 $cogUkDir/cog_metadata.csv | tr , $'\t' | sort) - \ +| $scriptDir/findRefBackfill.pl > cog.refBackfill +cut -f 1 *.refBackfill > refBackfill.ids +sort -u ../tooManyEpps.ids ../badBranchSeed.ids dropoutContam.ids refBackfill.ids \ +| grep -vFwf <(tail -n+4 $scriptDir/includeRecombinants.tsv | cut -f 1) \ + > exclude.ids + # Get new GenBank sequences with at least $minReal non-N bases. # Exclude seqs in the tree with EPI IDs that that have been mapped in the very latest $epiToPublic. set +o pipefail egrep $'\t''[A-Z][A-Z][0-9]{6}\.[0-9]+' $epiToPublic \ | grep -Fwf prevGisaid - \ | grep -vFwf prevGbAcc \ | cat \ >> prevGbAcc set -o pipefail xzcat $ncbiDir/genbank.fa.xz \ -| faSomeRecords -exclude stdin <(cat prevGbAcc ../tooManyEpps.ids ../badBranchSeed.ids) newGenBank.fa +| faSomeRecords -exclude stdin <(cat prevGbAcc exclude.ids) newGenBank.fa faSize -veryDetailed newGenBank.fa \ | tawk '$4 < '$minReal' {print $1;}' \ > gbTooSmall # NCBI also includes NC_045512 in the download, but that's our reference, so... exclude that too. set +o pipefail fastaNames newGenBank.fa | grep NC_045512 >> gbTooSmall set -o pipefail faSomeRecords -exclude newGenBank.fa gbTooSmall newGenBank.filtered.fa faSize newGenBank.filtered.fa # Get new COG-UK sequences with at least $minReal non-N bases. # Also exclude cog_all.fasta sequences not found in cog_metadata.csv. comm -23 <(fastaNames $cogUkDir/cog_all.fasta.xz | sort) \ <(cut -d, -f1 $cogUkDir/cog_metadata.csv | sort) \ > cogFaNotMeta # Also exclude COG-UK sequences that have been added to GenBank (cogUkInGenBank, see above). xzcat $cogUkDir/cog_all.fasta.xz \ | faSomeRecords -exclude stdin \ - <(cat prevCogUk cogFaNotMeta cogUkInGenBank ../tooManyEpps.ids ../badBranchSeed.ids) newCogUk.fa + <(cat prevCogUk cogFaNotMeta cogUkInGenBank exclude.ids) newCogUk.fa faSize -veryDetailed newCogUk.fa \ | tawk '$4 < '$minReal' {print $1;}' \ > cogUkTooSmall faSomeRecords -exclude newCogUk.fa cogUkTooSmall newCogUk.filtered.fa faSize newCogUk.filtered.fa # Get new GISAID sequences with at least $minReal non-N bases. xzcat $gisaidDir/gisaid_fullNames_$today.fa.xz \ | sed -re 's/^>.*\|(EPI_ISL_[0-9]+)\|.*/>\1/' \ -| faSomeRecords -exclude stdin <(cat prevGisaid ../tooManyEpps.ids ../badBranchSeed.ids) newGisaid.fa +| faSomeRecords -exclude stdin <(cat prevGisaid exclude.ids) newGisaid.fa faSize -veryDetailed newGisaid.fa \ | tawk '$4 < '$minReal' {print $1;}' \ > gisaidTooSmall faSomeRecords -exclude newGisaid.fa gisaidTooSmall newGisaid.filtered.fa faSize newGisaid.filtered.fa # Exclude public-mapped sequences from newGisaid: set +o pipefail fastaNames newGisaid.filtered.fa \ | grep -Fwf - $epiToPublic \ | cut -f 1 \ > newGisaid.public.names set -o pipefail if [ -s newGisaid.public.names ]; then faSomeRecords -exclude newGisaid.filtered.fa newGisaid.public.names tmp @@ -270,22 +295,22 @@ set -o pipefail fi if [ -s newGisaid.filtered.fa ]; then zcat $gisaidDir/metadata_batch_$today.tsv.gz \ | grep -Fwf <(fastaNames newGisaid.filtered.fa) \ | tawk '{print $3 "\t" $1 "|" $3 "|" $5;}' \ >> $renaming fi wc -l $renaming # Make masked VCF tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \ $problematicSitesVcf > mask.vcf time cat <(twoBitToFa $ref2bit stdout) $alignedFa \ | faToVcf -maxDiff=1000 \ - -excludeFile=<(cat ../tooManyEpps.ids ../badBranchSeed.ids) \ + -excludeFile=exclude.ids \ -verbose=2 stdin stdout \ | vcfRenameAndPrune stdin $renaming stdout \ | vcfFilter -excludeVcf=mask.vcf stdin \ | gzip -c \ > new.masked.vcf.gz