be4311c07e14feb728abc6425ee606ffaa611a58 markd Fri Jan 22 06:46:58 2021 -0800 merge with master diff --git src/hg/utils/otto/sarscov2phylo/processGisaid.sh src/hg/utils/otto/sarscov2phylo/processGisaid.sh new file mode 100755 index 0000000..cee80c7 --- /dev/null +++ src/hg/utils/otto/sarscov2phylo/processGisaid.sh @@ -0,0 +1,150 @@ +#!/bin/bash +set -beEu -x -o pipefail + +# Do not modify this script, modify the source tree copy: +# kent/src/hg/utils/otto/sarscov2phylo/processGisaid.sh + +usage() { + echo "usage: $0 treeDate treeDir msaFile nextmeta problematic_sites_sarsCov2.vcf" +} + +if [ $# != 5 ]; then + usage + exit 1 +fi + +releaseLabel=$1 +treeDir=$2 +msaFile=$3 +nextmeta=$4 +problematicSitesVcf=$5 + +echo "releaseLabel=$releaseLabel treeDir=$treeDir msaFile=$msaFile nextmeta=$nextmeta problematicSitesVcf=$problematicSitesVcf" + +scriptDir=$(dirname "${BASH_SOURCE[0]}") + +# UShER (and matToVcf) +usherDir=~angie/github/usher +usher=$usherDir/build/usher +matToVcf=$usherDir/build/matToVcf + +# strain_phylogenetics / find_parsimonious_assignments for parsimony scores +find_parsimonious_assignments=~angie/github/strain_phylogenetics/build/find_parsimonious_assignments + +metadata=$treeDir/metadata.csv +treeFile=$treeDir/global.tree + +sed -re 's/,/,\n/g' $treeFile | sed -re 's/:.*//; s/[()]//g;' | sort > tree.ids.sorted +grep ^\> $msaFile | sed -re 's/^>//' | sort > msa.ids.sorted + +treeNotMsa=$(comm -23 tree.ids.sorted msa.ids.sorted | wc -l) +if (( $treeNotMsa > 0 )); then + echo "" + echo "Error: $treeFile has $treeNotMsa sequences that are not in $msaFile." + echo "Try using an msa download from a later date." + exit +fi + +# Make renaming files for tree (EPI_ IDs) and VCF (isolate names): add strain name and +# date with shortened year. +tail -n+2 $metadata \ +| csvToTab \ +| awk -F"\t" '{print $1 "\t" $2 "|" $1 "|" $3;}' \ +| sed -re 's@hCo[Vv]-19/@@; s/\|20/\|/; s/ //g; s/,//g;' \ +| sort > epi.renaming +wc -l epi.renaming + +# Rename tree IDs from just EPI IDs to strain|epi|date ids: +phyloRenameAndPrune $treeFile epi.renaming global.renamed.nwk +sed -re 's/,/,\n/g' global.renamed.nwk | sed -re 's/:.*//; s/[()]//g;' \ +| sort > tree.renamed.ids.sorted +wc -l tree.renamed.ids.sorted + +# Set the root of the tree to the Wuhan/Hu-1 (NC_045512.2) reference not WIV04: +~angie/github/newick_utils/src/nw_reroot global.renamed.nwk \ + 'Wuhan/Hu-1/2019|EPI_ISL_402125|19-12-31' > global.reroot.nwk + +# Run faToVcf without any filtering, masking or conversion of ambiguous alleles: +time faToVcf $msaFile stdout -includeRef \ + -ref=EPI_ISL_402125 \ + -vcfChrom=NC_045512v2 -verbose=2 \ +| vcfRenameAndPrune stdin epi.renaming gisaid-$releaseLabel.unfiltered.vcf +ls -l gisaid-$releaseLabel.unfiltered.vcf +wc -l gisaid-$releaseLabel.unfiltered.vcf +gzip -f gisaid-$releaseLabel.unfiltered.vcf + +# Run usher and matToVcf to make ambig-resolved VCF for hgTracks +time $usher -c \ + --vcf gisaid-$releaseLabel.unfiltered.vcf.gz \ + --tree global.reroot.nwk \ + --save-mutation-annotated-tree sarscov2phylo-$releaseLabel.notMasked.pb \ + --write-uncondensed-final-tree +mv uncondensed-final-tree.nh global.reroot.collapsed.notMasked.nwk +$matToVcf -i sarscov2phylo-$releaseLabel.notMasked.pb \ + -v gisaid-$releaseLabel.vcf +bgzip -f gisaid-$releaseLabel.vcf +tabix -p vcf gisaid-$releaseLabel.vcf.gz + +# Remove problematic sites recommended for masking for usher/phyloPlace +tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \ + $problematicSitesVcf > mask.vcf +time vcfFilter -excludeVcf=mask.vcf gisaid-$releaseLabel.unfiltered.vcf.gz \ +| gzip -c \ + > gisaid-$releaseLabel.unfiltered.masked.vcf.gz + +# Full find_parsimonious_assignments output on uncollapsed tree for collaborators +time $find_parsimonious_assignments --tree global.reroot.nwk \ + --vcf gisaid-$releaseLabel.unfiltered.masked.vcf.gz \ +| gzip -c \ + > find_parsimonious_assignments.$releaseLabel.out.gz + +# Run usher to collapse tree and make masked protobuf for hgPhyloPlace +time $usher -c \ + --vcf gisaid-$releaseLabel.unfiltered.masked.vcf.gz \ + --tree global.reroot.nwk \ + --save-mutation-annotated-tree sarscov2phylo-$releaseLabel.masked.pb \ + --write-uncondensed-final-tree +mv uncondensed-final-tree.nh global.reroot.collapsed.nwk + +# Parsimony scores on collapsed tree +time $find_parsimonious_assignments --tree global.reroot.collapsed.nwk \ + --vcf <(gunzip -c gisaid-$releaseLabel.vcf.gz) \ +| tail -n+2 \ +| sed -re 's/^[A-Z]([0-9]+)[A-Z,]+.*parsimony_score=([0-9]+).*/\1\t\2/;' \ +| tawk '{print "NC_045512v2", $1-1, $1, $2;}' \ +| sort -k2n,2n \ + > gisaid-$releaseLabel.parsimony.bg +bedGraphToBigWig gisaid-$releaseLabel.parsimony.bg /hive/data/genomes/wuhCor1/chrom.sizes \ + gisaid-$releaseLabel.parsimony.bw + +# Full find_parsimonious_assignments output on collapsed tree for collaborators +time $find_parsimonious_assignments --tree global.reroot.collapsed.nwk \ + --vcf gisaid-$releaseLabel.unfiltered.masked.vcf.gz \ + > find_parsimonious_assignments.$releaseLabel.collapsed.out + +# Now filter by frequency for faster track display: alt al freq >= 0.001 +sampleCount=$(wc -l < tree.renamed.ids.sorted) +minAc001=$(( (($sampleCount + 999) / 1000) )) +time vcfFilter -minAc=$minAc001 -rename gisaid-$releaseLabel.vcf.gz \ + > gisaid-$releaseLabel.minAf.001.vcf +ls -l gisaid-$releaseLabel.minAf.001.vcf +wc -l gisaid-$releaseLabel.minAf.001.vcf +bgzip -f gisaid-$releaseLabel.minAf.001.vcf +tabix -p vcf gisaid-$releaseLabel.minAf.001.vcf.gz + +# Alt al freq >= 0.01 +minAc01=$(( (($sampleCount + 99) / 100) )) +time vcfFilter -minAc=$minAc01 -rename gisaid-$releaseLabel.minAf.001.vcf.gz \ + > gisaid-$releaseLabel.minAf.01.vcf +ls -l gisaid-$releaseLabel.minAf.01.vcf +wc -l gisaid-$releaseLabel.minAf.01.vcf +bgzip -f gisaid-$releaseLabel.minAf.01.vcf +tabix -p vcf gisaid-$releaseLabel.minAf.01.vcf.gz + +# Make sample color files +gunzip -c $nextmeta | cut -f3,18-20 \ +| sort > metadata.epiToLineageAndClades +# Join on EPI ID to associate tree sample names with lineages. +join -t$'\t' epi.renaming metadata.epiToLineageAndClades -o 1.2,2.2,2.3,2.4 > sampleToLineage +$scriptDir/cladeLineageColors.pl sampleToLineage +gzip -f *Colors