99599c7130790107ff0de9f043930da6aa7fddf1 angie Mon Nov 16 16:35:58 2020 -0800 Scripts for automating SARS-CoV-2 Phylogeny tracks (refs #26530): fetching sequences and metadata from several public sources, mapping GISAID IDs to public seq IDs, downloading the latest release of the phylogenetic tree from github.com/roblanf/sarscov2phylo/ , making VCFs from GISAID and public sequences, and using github.com/yatisht/usher to resolve ambiguous alleles, make protobuf files for hgPhyloPlace, and add public sequences that have not been mapped to GISAID sequences to the sarscov2phylo tree for a comprehensive public tree+VCF. This is still not fully otto-mated because certain crucial inputs like GISAID sequences still must be downloaded using a web browser, but the goal is to automate as much as possible and maybe someday have it fully cron-driven. There are two main top-level scripts which call other scripts, which may in turn call scripts, in this hierarchy: updateIdMapping.sh getCogUk.sh getNcbi.sh searchAllSarsCov2BioSample.sh bioSampleIdToText.sh bioSampleTextToTab.pl gbMetadataAddBioSample.pl fixNcbiFastaNames.pl updateSarsCov2Phylo.sh getRelease.sh processRelease.sh cladeLineageColors.pl mapPublic.sh extractUnmappedPublic.sh addUnmappedPublic.sh many of the above: util.sh publicCredits.sh will hopefully be folded into updateSarsCov2Phylo.sh when I figure out how to automate fetching of author/institution metadata from NCBI and COG-UK. diff --git src/hg/utils/otto/sarscov2phylo/processRelease.sh src/hg/utils/otto/sarscov2phylo/processRelease.sh new file mode 100755 index 0000000..22b7fe6 --- /dev/null +++ src/hg/utils/otto/sarscov2phylo/processRelease.sh @@ -0,0 +1,123 @@ +#!/bin/bash +set -beEu -x -o pipefail + +# Do not modify this script, modify the source tree copy: +# kent/src/hg/utils/otto/sarscov2phylo/processRelease.sh + +usage() { + echo "usage: $0 releaseLabel problematic_sites_sarsCov2.vcf" +} + +if [ $# != 2 ]; then + usage + exit 1 +fi + +releaseLabel=$1 +problematicSitesVcf=$2 + +echo "releaseLabel=$releaseLabel problematicSitesVcf=$problematicSitesVcf" + +scriptDir=$(dirname "${BASH_SOURCE[0]}") + +# UShER (and matToVcf) +usherDir=~angie/github/usher +usher=$usherDir/build/usher +matToVcf=$usherDir/build/matToVcf + +# Files generated by getRelease.sh +metadata=gisaid_metadata.tsv +fasta=gisaid_sequences.fasta +alignedFasta=gisaid_aligned.fasta + +# We'll need to prune the tree to keep only those sequences that are in metadata and fasta: +grep ^\> $fasta | sed -re 's/^>//' | sort > namesInFasta + +# Make renaming files for tree (EPI_ IDs) and VCF (isolate names): add strain name and +# date with shortened year. +awk -F"\t" '{print $3 "\t" $1 "|" $3 "|" $5;}' $metadata \ +| grep -Fwf namesInFasta \ +| sed -re 's/\|20/\|/;' \ +| sort > tree.renaming +awk -F"\t" '{print $1 "\t" $1 "|" $3 "|" $5;}' $metadata \ +| grep -Fwf namesInFasta \ +| sed -re 's/\|20/\|/;' \ +| sort > vcf.renaming +wc -l tree.renaming +wc -l vcf.renaming + +# Rename tree IDs from just EPI IDs to strain|epi|date ids and prune any sequences not in +# metadata and fasta: +phyloRenameAndPrune ft_SH.tree tree.renaming ft_SH.renamed.tree +sed -re 's/\)[0-9.]+:/\):/g; s/:[0-9e.:-]+//g; s/[\(\);]//g; s/,/\n/g;'" s/'//g;" \ + ft_SH.renamed.tree \ +| sort > tree.renamedPruned.ids.sorted +wc -l tree.renamedPruned.ids.sorted + +# Set the root of the tree to the Wuhan/Hu-1 (NC_045512.2) reference not WIV04: +~angie/github/newick_utils/src/nw_reroot ft_SH.renamed.tree \ + 'Wuhan/Hu-1/2019|EPI_ISL_402125|19-12-26' > ft_SH.reroot.nh + +# Run faToVcf without any filtering, masking or conversion of ambiguous alleles: +time faToVcf $alignedFasta stdout -includeRef \ + -ref=Wuhan/Hu-1/2019 \ + -vcfChrom=NC_045512v2 -verbose=2 \ +| vcfRenameAndPrune stdin vcf.renaming gisaid-$releaseLabel.unfiltered.vcf +ls -l gisaid-$releaseLabel.unfiltered.vcf +wc -l gisaid-$releaseLabel.unfiltered.vcf +gzip -f gisaid-$releaseLabel.unfiltered.vcf + +# Run usher and matToVcf to make ambig-resolved VCF for hgTracks +time $usher -c \ + --vcf gisaid-$releaseLabel.unfiltered.vcf.gz \ + --tree ft_SH.reroot.nh \ + --save-mutation-annotated-tree sarscov2phylo-$releaseLabel.notMasked.pb \ + --write-uncondensed-final-tree +mv uncondensed-final-tree.nh ft_SH.reroot.collapsed.notMasked.nh +$matToVcf -i sarscov2phylo-$releaseLabel.notMasked.pb \ + -v gisaid-$releaseLabel.vcf +bgzip -f gisaid-$releaseLabel.vcf +tabix -p vcf gisaid-$releaseLabel.vcf.gz + +# Remove problematic sites recommended for masking for usher/phyloPlace +tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \ + $problematicSitesVcf > mask.vcf +time vcfFilter -excludeVcf=mask.vcf gisaid-$releaseLabel.unfiltered.vcf.gz \ +| gzip -c \ + > gisaid-$releaseLabel.unfiltered.masked.vcf.gz + +# Run usher to collapse tree and make masked protobuf for hgPhyloPlace +time $usher -c \ + --vcf gisaid-$releaseLabel.unfiltered.masked.vcf.gz \ + --tree ft_SH.reroot.nh \ + --save-mutation-annotated-tree sarscov2phylo-$releaseLabel.masked.pb \ + --write-uncondensed-final-tree +mv uncondensed-final-tree.nh ft_SH.reroot.collapsed.nh + +# Now filter by frequency for faster track display: alt al freq >= 0.001 +sampleCount=$(wc -l < tree.renamedPruned.ids.sorted) +minAc001=$(( (($sampleCount + 999) / 1000) )) +time vcfFilter -minAc=$minAc001 -rename gisaid-$releaseLabel.vcf.gz \ + > gisaid-$releaseLabel.minAf.001.vcf +ls -l gisaid-$releaseLabel.minAf.001.vcf +wc -l gisaid-$releaseLabel.minAf.001.vcf +bgzip -f gisaid-$releaseLabel.minAf.001.vcf +tabix -p vcf gisaid-$releaseLabel.minAf.001.vcf.gz + +# Alt al freq >= 0.01 +minAc01=$(( (($sampleCount + 99) / 100) )) +time vcfFilter -minAc=$minAc01 -rename gisaid-$releaseLabel.minAf.001.vcf.gz \ + > gisaid-$releaseLabel.minAf.01.vcf +ls -l gisaid-$releaseLabel.minAf.01.vcf +wc -l gisaid-$releaseLabel.minAf.01.vcf +bgzip -f gisaid-$releaseLabel.minAf.01.vcf +tabix -p vcf gisaid-$releaseLabel.minAf.01.vcf.gz + +# Make sample color files +#*** TODO: Nextstrain Clade is now in field 18! Add that as an option! +cut -f3,18-20 $metadata \ +| sort > metadata.epiToLineageAndClades +# Join on EPI ID to associate tree sample names with lineages. +join -t$'\t' tree.renaming metadata.epiToLineageAndClades -o 1.2,2.2,2.3,2.4 > sampleToLineage +$scriptDir/cladeLineageColors.pl sampleToLineage +gzip -f *Colors