a6d479158de520ee012585952f425eec7324bbad angie Thu Jan 13 11:48:43 2022 -0800 Exclude problematic sequences upstream of alignment -- avoid waste of time & CPU. diff --git src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh index 0f07cda..c85b2b0 100755 --- src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh +++ src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh @@ -161,59 +161,60 @@ # Add public sequences that have been mapped to GISAID sequences to prevGisaid. grep -Fwf prevGbAcc $epiToPublic | cut -f 1 >> prevGisaid grep -Fwf prevCogUk $epiToPublic | cut -f 1 >> prevGisaid wc -l prev* # Get new GenBank sequences with at least $minReal non-N bases. # Exclude seqs in the tree with EPI IDs that that have been mapped in the very latest $epiToPublic. set +o pipefail egrep $'\t''[A-Z][A-Z][0-9]{6}\.[0-9]+' $epiToPublic \ | grep -Fwf prevGisaid - \ | grep -vFwf prevGbAcc \ | cat \ >> prevGbAcc set -o pipefail xzcat $ncbiDir/genbank.fa.xz \ -| faSomeRecords -exclude stdin prevGbAcc newGenBank.fa +| faSomeRecords -exclude stdin <(cat prevGbAcc ../tooManyEpps.ids ../badBranchSeed.ids) newGenBank.fa faSize -veryDetailed newGenBank.fa \ | tawk '$4 < '$minReal' {print $1;}' \ > gbTooSmall # NCBI also includes NC_045512 in the download, but that's our reference, so... exclude that too. set +o pipefail fastaNames newGenBank.fa | grep NC_045512 >> gbTooSmall set -o pipefail faSomeRecords -exclude newGenBank.fa gbTooSmall newGenBank.filtered.fa faSize newGenBank.filtered.fa # Get new COG-UK sequences with at least $minReal non-N bases. # Also exclude cog_all.fasta sequences not found in cog_metadata.csv. comm -23 <(fastaNames $cogUkDir/cog_all.fasta.xz | sort) \ <(cut -d, -f1 $cogUkDir/cog_metadata.csv | sort) \ > cogFaNotMeta # Also exclude COG-UK sequences that have been added to GenBank (cogUkInGenBank, see above). xzcat $cogUkDir/cog_all.fasta.xz \ -| faSomeRecords -exclude stdin <(cat prevCogUk cogFaNotMeta cogUkInGenBank) newCogUk.fa +| faSomeRecords -exclude stdin \ + <(cat prevCogUk cogFaNotMeta cogUkInGenBank ../tooManyEpps.ids ../badBranchSeed.ids) newCogUk.fa faSize -veryDetailed newCogUk.fa \ | tawk '$4 < '$minReal' {print $1;}' \ > cogUkTooSmall faSomeRecords -exclude newCogUk.fa cogUkTooSmall newCogUk.filtered.fa faSize newCogUk.filtered.fa # Get new GISAID sequences with at least $minReal non-N bases. xzcat $gisaidDir/gisaid_fullNames_$today.fa.xz \ | sed -re 's/^>.*\|(EPI_ISL_[0-9]+)\|.*/>\1/' \ -| faSomeRecords -exclude stdin prevGisaid newGisaid.fa +| faSomeRecords -exclude stdin <(cat prevGisaid ../tooManyEpps.ids ../badBranchSeed.ids) newGisaid.fa faSize -veryDetailed newGisaid.fa \ | tawk '$4 < '$minReal' {print $1;}' \ > gisaidTooSmall faSomeRecords -exclude newGisaid.fa gisaidTooSmall newGisaid.filtered.fa faSize newGisaid.filtered.fa # Exclude public-mapped sequences from newGisaid: set +o pipefail fastaNames newGisaid.filtered.fa \ | grep -Fwf - $epiToPublic \ | cut -f 1 \ > newGisaid.public.names set -o pipefail if [ -s newGisaid.public.names ]; then faSomeRecords -exclude newGisaid.filtered.fa newGisaid.public.names tmp