src/hg/utils/otto/sarscov2phylo/processRelease.sh 99599c7130790107ff0de9f043930da6aa7fddf1

99599c7130790107ff0de9f043930da6aa7fddf1
angie
  Mon Nov 16 16:35:58 2020 -0800
Scripts for automating SARS-CoV-2 Phylogeny tracks (refs #26530): fetching
sequences and metadata from several public sources, mapping GISAID IDs to
public seq IDs, downloading the latest release of the phylogenetic tree from
github.com/roblanf/sarscov2phylo/ , making VCFs from GISAID and public
sequences, and using github.com/yatisht/usher to resolve ambiguous alleles,
make protobuf files for hgPhyloPlace, and add public sequences that have not
been mapped to GISAID sequences to the sarscov2phylo tree for a comprehensive
public tree+VCF.

This is still not fully otto-mated because certain crucial inputs like
GISAID sequences still must be downloaded using a web browser, but the goal
is to automate as much as possible and maybe someday have it fully cron-driven.

There are two main top-level scripts which call other scripts, which may in turn
call scripts, in this hierarchy:

updateIdMapping.sh
getCogUk.sh
getNcbi.sh
searchAllSarsCov2BioSample.sh
bioSampleIdToText.sh
bioSampleTextToTab.pl
gbMetadataAddBioSample.pl
fixNcbiFastaNames.pl

updateSarsCov2Phylo.sh
getRelease.sh
processRelease.sh
cladeLineageColors.pl
mapPublic.sh
extractUnmappedPublic.sh
addUnmappedPublic.sh

many of the above:
util.sh

publicCredits.sh will hopefully be folded into updateSarsCov2Phylo.sh when I
figure out how to automate fetching of author/institution metadata from NCBI
and COG-UK.

diff --git src/hg/utils/otto/sarscov2phylo/processRelease.sh src/hg/utils/otto/sarscov2phylo/processRelease.sh
new file mode 100755
index 0000000..22b7fe6
--- /dev/null
+++ src/hg/utils/otto/sarscov2phylo/processRelease.sh
@@ -0,0 +1,123 @@
+#!/bin/bash
+set -beEu -x -o pipefail
+
+#	Do not modify this script, modify the source tree copy:
+#	kent/src/hg/utils/otto/sarscov2phylo/processRelease.sh
+
+usage() {
+    echo "usage: $0 releaseLabel problematic_sites_sarsCov2.vcf"
+}
+
+if [ $# != 2 ]; then
+  usage
+  exit 1
+fi
+
+releaseLabel=$1
+problematicSitesVcf=$2
+
+echo "releaseLabel=$releaseLabel problematicSitesVcf=$problematicSitesVcf"
+
+scriptDir=$(dirname "${BASH_SOURCE[0]}")
+
+# UShER (and matToVcf)
+usherDir=~angie/github/usher
+usher=$usherDir/build/usher
+matToVcf=$usherDir/build/matToVcf
+
+# Files generated by getRelease.sh
+metadata=gisaid_metadata.tsv
+fasta=gisaid_sequences.fasta
+alignedFasta=gisaid_aligned.fasta
+
+# We'll need to prune the tree to keep only those sequences that are in metadata and fasta:
+grep ^\> $fasta | sed -re 's/^>//' | sort > namesInFasta
+
+# Make renaming files for tree (EPI_ IDs) and VCF (isolate names): add strain name and
+# date with shortened year.
+awk -F"\t" '{print $3 "\t" $1 "|" $3 "|" $5;}' $metadata \
+| grep -Fwf namesInFasta \
+| sed -re 's/\|20/\|/;' \
+| sort > tree.renaming
+awk -F"\t" '{print $1 "\t" $1 "|" $3 "|" $5;}' $metadata \
+| grep -Fwf namesInFasta \
+| sed -re 's/\|20/\|/;' \
+| sort  > vcf.renaming
+wc -l tree.renaming
+wc -l vcf.renaming
+
+# Rename tree IDs from just EPI IDs to strain|epi|date ids and prune any sequences not in
+# metadata and fasta:
+phyloRenameAndPrune ft_SH.tree tree.renaming ft_SH.renamed.tree
+sed -re 's/\)[0-9.]+:/\):/g; s/:[0-9e.:-]+//g; s/[\(\);]//g; s/,/\n/g;'" s/'//g;" \
+  ft_SH.renamed.tree \
+| sort > tree.renamedPruned.ids.sorted
+wc -l tree.renamedPruned.ids.sorted
+
+# Set the root of the tree to the Wuhan/Hu-1 (NC_045512.2) reference not WIV04:
+~angie/github/newick_utils/src/nw_reroot ft_SH.renamed.tree \
+    'Wuhan/Hu-1/2019|EPI_ISL_402125|19-12-26' > ft_SH.reroot.nh
+
+# Run faToVcf without any filtering, masking or conversion of ambiguous alleles:
+time faToVcf $alignedFasta stdout -includeRef \
+  -ref=Wuhan/Hu-1/2019 \
+  -vcfChrom=NC_045512v2 -verbose=2 \
+| vcfRenameAndPrune stdin vcf.renaming gisaid-$releaseLabel.unfiltered.vcf
+ls -l gisaid-$releaseLabel.unfiltered.vcf
+wc -l gisaid-$releaseLabel.unfiltered.vcf
+gzip -f gisaid-$releaseLabel.unfiltered.vcf
+
+# Run usher and matToVcf to make ambig-resolved VCF for hgTracks
+time $usher -c \
+  --vcf gisaid-$releaseLabel.unfiltered.vcf.gz \
+  --tree ft_SH.reroot.nh \
+  --save-mutation-annotated-tree sarscov2phylo-$releaseLabel.notMasked.pb \
+  --write-uncondensed-final-tree
+mv uncondensed-final-tree.nh ft_SH.reroot.collapsed.notMasked.nh 
+$matToVcf -i sarscov2phylo-$releaseLabel.notMasked.pb \
+    -v gisaid-$releaseLabel.vcf
+bgzip -f gisaid-$releaseLabel.vcf
+tabix -p vcf gisaid-$releaseLabel.vcf.gz
+
+# Remove problematic sites recommended for masking for usher/phyloPlace
+tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \
+    $problematicSitesVcf > mask.vcf
+time vcfFilter -excludeVcf=mask.vcf gisaid-$releaseLabel.unfiltered.vcf.gz \
+| gzip -c \
+    > gisaid-$releaseLabel.unfiltered.masked.vcf.gz
+
+# Run usher to collapse tree and make masked protobuf for hgPhyloPlace
+time $usher -c \
+  --vcf gisaid-$releaseLabel.unfiltered.masked.vcf.gz \
+  --tree ft_SH.reroot.nh \
+  --save-mutation-annotated-tree sarscov2phylo-$releaseLabel.masked.pb \
+  --write-uncondensed-final-tree
+mv uncondensed-final-tree.nh ft_SH.reroot.collapsed.nh 
+
+# Now filter by frequency for faster track display: alt al freq >= 0.001
+sampleCount=$(wc -l < tree.renamedPruned.ids.sorted)
+minAc001=$(( (($sampleCount + 999) / 1000) ))
+time vcfFilter -minAc=$minAc001 -rename gisaid-$releaseLabel.vcf.gz \
+    > gisaid-$releaseLabel.minAf.001.vcf
+ls -l gisaid-$releaseLabel.minAf.001.vcf
+wc -l gisaid-$releaseLabel.minAf.001.vcf
+bgzip -f gisaid-$releaseLabel.minAf.001.vcf
+tabix -p vcf gisaid-$releaseLabel.minAf.001.vcf.gz
+
+# Alt al freq >= 0.01
+minAc01=$(( (($sampleCount + 99) / 100) ))
+time vcfFilter -minAc=$minAc01 -rename gisaid-$releaseLabel.minAf.001.vcf.gz \
+    > gisaid-$releaseLabel.minAf.01.vcf
+ls -l gisaid-$releaseLabel.minAf.01.vcf
+wc -l gisaid-$releaseLabel.minAf.01.vcf
+bgzip -f gisaid-$releaseLabel.minAf.01.vcf
+tabix -p vcf gisaid-$releaseLabel.minAf.01.vcf.gz
+
+# Make sample color files
+#*** TODO: Nextstrain Clade is now in field 18!  Add that as an option!
+cut -f3,18-20 $metadata \
+| sort > metadata.epiToLineageAndClades
+# Join on EPI ID to associate tree sample names with lineages.
+join -t$'\t' tree.renaming metadata.epiToLineageAndClades -o 1.2,2.2,2.3,2.4 > sampleToLineage
+$scriptDir/cladeLineageColors.pl sampleToLineage
+gzip -f *Colors