99599c7130790107ff0de9f043930da6aa7fddf1 angie Mon Nov 16 16:35:58 2020 -0800 Scripts for automating SARS-CoV-2 Phylogeny tracks (refs #26530): fetching sequences and metadata from several public sources, mapping GISAID IDs to public seq IDs, downloading the latest release of the phylogenetic tree from github.com/roblanf/sarscov2phylo/ , making VCFs from GISAID and public sequences, and using github.com/yatisht/usher to resolve ambiguous alleles, make protobuf files for hgPhyloPlace, and add public sequences that have not been mapped to GISAID sequences to the sarscov2phylo tree for a comprehensive public tree+VCF. This is still not fully otto-mated because certain crucial inputs like GISAID sequences still must be downloaded using a web browser, but the goal is to automate as much as possible and maybe someday have it fully cron-driven. There are two main top-level scripts which call other scripts, which may in turn call scripts, in this hierarchy: updateIdMapping.sh getCogUk.sh getNcbi.sh searchAllSarsCov2BioSample.sh bioSampleIdToText.sh bioSampleTextToTab.pl gbMetadataAddBioSample.pl fixNcbiFastaNames.pl updateSarsCov2Phylo.sh getRelease.sh processRelease.sh cladeLineageColors.pl mapPublic.sh extractUnmappedPublic.sh addUnmappedPublic.sh many of the above: util.sh publicCredits.sh will hopefully be folded into updateSarsCov2Phylo.sh when I figure out how to automate fetching of author/institution metadata from NCBI and COG-UK. diff --git src/hg/utils/otto/sarscov2phylo/getRelease.sh src/hg/utils/otto/sarscov2phylo/getRelease.sh new file mode 100755 index 0000000..402803c --- /dev/null +++ src/hg/utils/otto/sarscov2phylo/getRelease.sh @@ -0,0 +1,80 @@ +#!/bin/bash +set -beEu -x -o pipefail + +# Do not modify this script, modify the source tree copy: +# kent/src/hg/utils/otto/sarscov2phylo/getRelease.sh + +usage() { + echo "usage: $0 releaseLabel metadata_date.tsv.gz sequences_date.fasta.gz" +} + +if [ $# != 3 ]; then + usage + exit 1 +fi + +releaseLabel=$1 +nextmeta=$2 +nextfasta=$3 + +alignmentScript=~angie/github/sarscov2phylo/scripts/global_profile_alignment.sh + +# Outputs for next script (processRelease.sh): +metadata=gisaid_metadata.tsv +fasta=gisaid_sequences.fasta +alignedFasta=gisaid_aligned.fasta + +# Download tree from sarscov2phylo release +rm -f ft_SH.tree +wget --quiet https://github.com/roblanf/sarscov2phylo/releases/download/$releaseLabel/ft_SH.tree + +# Get IDs used in tree +sed -re 's/\)[0-9.]+:/\):/g; s/:[0-9e.:-]+//g; s/[\(\);]//g; s/,/\n/g;'" s/'//g;" ft_SH.tree \ +| sort > tree.ids.sorted + +# How many samples? +wc -l tree.ids.sorted + +scriptDir=$(dirname "${BASH_SOURCE[0]}") +source $scriptDir/util.sh + +expectedHeader="strain virus gisaid_epi_isl genbank_accession date region country division location region_exposure country_exposure division_exposure segment length host age sex Nextstrain_clade pangolin_lineage GISAID_clade originating_lab submitting_lab authors url title paper_url date_submitted" + +# Disregard pipefail just for this pipe because head prevents the cat from completing: +set +o pipefail +header=`xcat $nextmeta | head -1` +set -o pipefail + +if [ "$header" != "$expectedHeader" ]; then + echo "" + echo "*** ERROR: nextmeta header format changed ***" + echo "" + echo "Expected first line of $nextmeta to be this:" + echo "" + echo "$expectedHeader" + echo "" + echo "But got this:" + echo "$header" + echo "" + exit 1 +fi + +xcat $nextmeta | tail -n +2 > $metadata + +# Prune fasta to keep only the sequences that are in both metadata and tree. +# fasta names are strain names, tree IDs are EPI IDs; use metadata to join them up. +awk -F"\t" '{print $3 "\t" $1;}' $metadata | sort > metadata.epiToName +join -o 2.2 tree.ids.sorted metadata.epiToName > namesInTreeAndMetadata +xcat $nextfasta \ +| faSomeRecords stdin namesInTreeAndMetadata $fasta + +# Use Rob's script that aligns each sequence to NC_045512.2 and concatenates the results +# as a multifasta alignment (from which we can extract VCF with SNVs): +#conda install -c bioconda mafft +rm -f $alignedFasta +export TMPDIR=/dev/shm +time bash $alignmentScript \ + -i $fasta \ + -o $alignedFasta \ + -t 50 +grep ^\> $alignedFasta | wc -l