fd84fe49fb0697a08c40b8cdbd1b4b003c06330b angie Mon Nov 29 12:59:31 2021 -0800
Now that Problematic Sites track has been updated, no need to patch in masking of site 21987 (S:142).

diff --git src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
index bc5c2a7..0f07cda 100755
--- src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
+++ src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
@@ -267,29 +267,24 @@
         | sed -re 's/ /_/g' \
         >> $renaming
     set -o pipefail
 fi
 if [ -s newGisaid.filtered.fa ]; then
     zcat $gisaidDir/metadata_batch_$today.tsv.gz \
     | grep -Fwf <(fastaNames newGisaid.filtered.fa) \
     | tawk '{print $3 "\t" $1 "|" $3 "|" $5;}' \
         >> $renaming
 fi
 wc -l $renaming
 
 # Make masked VCF
 tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \
     $problematicSitesVcf > mask.vcf
 
-# Add masked VCF to previous protobuf
-#*** With horrible hack for time being, to mask 21987 (because it messes up the Delta branch)...
-#*** TODO: make hgPhyloPlace handle protobufs that don't have all of the latest problematic sites
-#*** masked more gracefully, and update the Problematic Sites track.
 time cat <(twoBitToFa $ref2bit stdout) $alignedFa \
 | faToVcf -maxDiff=1000 \
     -excludeFile=<(cat ../tooManyEpps.ids ../badBranchSeed.ids) \
     -verbose=2 stdin stdout \
 | vcfRenameAndPrune stdin $renaming stdout \
 | vcfFilter -excludeVcf=mask.vcf stdin \
-| tawk '$2 != 21987' \
 | gzip -c \
     > new.masked.vcf.gz