1f64522294183ac3613b48b2d937fa8a666c9afa angie Thu Sep 2 17:55:45 2021 -0700 Move the --max-parsimony post-filtering into usher's new --max-parsimony-per-sample option. Add mechanism to exclude sequences that sneak past quality filters by faking reference alleles where they should have Ns. diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh index a2d68e2..fee47a9 100755 --- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh +++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh @@ -266,57 +266,59 @@ zcat $gisaidDir/metadata_batch_$today.tsv.gz \ | grep -Fwf <(fastaNames newGisaid.filtered.fa) \ | tawk '{print $3 "\t" $1 "|" $3 "|" $5;}' \ >> $renaming fi wc -l $renaming # Make masked VCF tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \ $problematicSitesVcf > mask.vcf # Add masked VCF to previous protobuf #*** With horrible hack for time being, to mask 21987 (because it messes up the Delta branch)... #*** TODO: make hgPhyloPlace handle protobufs that don't have all of the latest problematic sites #*** masked more gracefully, and update the Problematic Sites track. time cat <(twoBitToFa $ref2bit stdout) $alignedFa \ -| faToVcf -maxDiff=1000 -excludeFile=../tooManyEpps.ids -verbose=2 stdin stdout \ +| faToVcf -maxDiff=1000 \ + -excludeFile=<(cat ../tooManyEpps.ids ../badBranchSeed.ids) \ + -verbose=2 stdin stdout \ | vcfRenameAndPrune stdin $renaming stdout \ | vcfFilter -excludeVcf=mask.vcf stdin \ | tawk '$2 != 21987' \ | gzip -c \ > new.masked.vcf.gz fi # if [ ! -s new.masked.vcf.gz ] time $usher -u -T 80 \ -A \ -e 5 \ + --max-parsimony-per-sample 20 \ -v new.masked.vcf.gz \ -i prevRenamed.pb \ -o gisaidAndPublic.$today.masked.preTrim.pb \ >& usher.addNew.log mv uncondensed-final-tree.nh gisaidAndPublic.$today.preTrim.nwk # Exclude sequences with a very high number of EPPs from future runs grep ^Current usher.addNew.log \ | awk '$16 >= 10 {print $8;}' \ | awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \ >> ../tooManyEpps.ids # Prune samples with too many private mutations and internal branches that are too long. $matUtils extract -i gisaidAndPublic.$today.masked.preTrim.pb \ - -a 20 \ -b 30 \ -O -o gisaidAndPublic.$today.masked.pb # Metadata for hgPhyloPlace: # Header names same as nextmeta (with strain first) so hgPhyloPlace recognizes them: echo -e "strain\tgenbank_accession\tdate\tcountry\thost\tcompleteness\tlength\tNextstrain_clade\tpangolin_lineage" \ > gisaidAndPublic.$today.metadata.tsv # It's not always possible to recreate both old and new names correctly from metadata, # so make a file to translate accession or COG-UK to the name used in VCF, tree and protobufs. cut -f 2 $renaming \ | awk -F\| '{ if ($3 == "") { print $1 "\t" $0; } else { print $2 "\t" $0; } }' \ | sort \ > idToName # NCBI metadata for COG-UK: strip COG-UK/ & United Kingdom:, add country & year to name grep COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \