99599c7130790107ff0de9f043930da6aa7fddf1 angie Mon Nov 16 16:35:58 2020 -0800 Scripts for automating SARS-CoV-2 Phylogeny tracks (refs #26530): fetching sequences and metadata from several public sources, mapping GISAID IDs to public seq IDs, downloading the latest release of the phylogenetic tree from github.com/roblanf/sarscov2phylo/ , making VCFs from GISAID and public sequences, and using github.com/yatisht/usher to resolve ambiguous alleles, make protobuf files for hgPhyloPlace, and add public sequences that have not been mapped to GISAID sequences to the sarscov2phylo tree for a comprehensive public tree+VCF. This is still not fully otto-mated because certain crucial inputs like GISAID sequences still must be downloaded using a web browser, but the goal is to automate as much as possible and maybe someday have it fully cron-driven. There are two main top-level scripts which call other scripts, which may in turn call scripts, in this hierarchy: updateIdMapping.sh getCogUk.sh getNcbi.sh searchAllSarsCov2BioSample.sh bioSampleIdToText.sh bioSampleTextToTab.pl gbMetadataAddBioSample.pl fixNcbiFastaNames.pl updateSarsCov2Phylo.sh getRelease.sh processRelease.sh cladeLineageColors.pl mapPublic.sh extractUnmappedPublic.sh addUnmappedPublic.sh many of the above: util.sh publicCredits.sh will hopefully be folded into updateSarsCov2Phylo.sh when I figure out how to automate fetching of author/institution metadata from NCBI and COG-UK. 
#!/bin/bash
# New file src/hg/utils/otto/sarscov2phylo/extractUnmappedPublic.sh (mode 100755),
# added in commit 99599c7.
set -beEu -x -o pipefail

# Do not modify this script, modify the source tree copy:
# kent/src/hg/utils/otto/sarscov2phylo/extractUnmappedPublic.sh

# Extract public (GenBank, COG-UK, CNCB) SARS-CoV-2 sequences that have NOT been
# mapped to GISAID EPI IDs, align them against the reference, and write the files
# consumed by the next pipeline step, addUnmappedPublic.sh:
#   unmapped.aligned.fasta  -- multifasta alignment of unmapped sequences
#   unmapped.renaming       -- tab-sep: ID -> isolateName|ID|date display name

usage() {
    echo "usage: $0 epiToPublicNameAndDate genbank.fa cogUk.fa cncb.fa"
}

if [ $# != 4 ]; then
  usage
  exit 1
fi

epiToPublic=$1
genbankFa=$2
cogUkFa=$3
cncbFa=$4

# Sequences shorter than this are too incomplete to be worth placing in the tree.
minSize=29000

echo "epiToPublic=$epiToPublic"
echo "genbank=$genbankFa"
echo "cogUk=$cogUkFa"
echo "cncb=$cncbFa"

ottoDir=/hive/data/outside/otto/sarscov2phylo
gbToDate=$ottoDir/ncbi.latest/gbToDate
cogUkToDate=$ottoDir/cogUk.latest/cogUkToDate
cncbToDate=$ottoDir/cncb.latest/cncbToDate

# Outputs for next script (addUnmappedPublic.sh):
alignedFa=unmapped.aligned.fasta
renaming=unmapped.renaming

scriptDir=$(dirname "${BASH_SOURCE[0]}")

# util.sh provides helpers used below: fastaNames, fastaSeqCount, xcat, tawk,
# cleanGenbank, cleanCncb.
source "$scriptDir/util.sh"

# faFilter discards fasta header info after the first word, so before running it,
# make files that associate IDs and strain names.
fastaNames "$genbankFa" | cleanGenbank | sort > gbAccName
# COG-UK has only names, so no need to associate anything.
# CNCB puts name first, then ID, so swap the order:
fastaNames "$cncbFa" | cleanCncb \
| tawk '{print $2, $1;}' | sort > cncbAccName

# Filter minSize and exclude sequences that were mapped to GISAID IDs
join -t$'\t' tree.renaming "$epiToPublic" | cut -f 3 > mappedIds
xcat "$genbankFa" \
| faFilter -minSize=$minSize stdin stdout \
| faSomeRecords -exclude stdin mappedIds genbank.unmapped.fa
fastaSeqCount genbank.unmapped.fa
#*** TODO: also exclude COG-UK sequences that are in GenBank (some with incomplete names)
# NOTE: the original was missing the '|' before faFilter here, so faFilter and its
# arguments were passed as arguments to xcat instead of forming a pipeline stage
# (the GenBank and CNCB sections above/below show the intended shape).
xcat "$cogUkFa" \
| faFilter -minSize=$minSize stdin stdout \
| faSomeRecords -exclude stdin mappedIds cogUk.unmapped.fa
fastaSeqCount cogUk.unmapped.fa
#*** TODO: also exclude CNCB sequences that are in GenBank (some with incomplete names)
# Tweak CNCB's fasta headers around to use acc not name:
xcat "$cncbFa" \
| sed -re 's/^>.*\| */>/;' \
| faFilter -minSize=$minSize stdin stdout \
| faSomeRecords -exclude stdin mappedIds cncb.unmapped.fa
fastaSeqCount cncb.unmapped.fa

cat genbank.unmapped.fa cogUk.unmapped.fa cncb.unmapped.fa > unmapped.fa
fastaSeqCount unmapped.fa

# Make a file for renaming sequences from just the ID to isolateName|ID|date.
# Missing name or date fields become "?"; the reference NC_045512.2 is excluded;
# commas and spaces in names are collapsed to underscores.
join -t$'\t' <(fastaNames genbank.unmapped.fa | sort) gbAccName > gbAccNameUnmapped
join -t$'\t' <(fastaNames cncb.unmapped.fa | sort) cncbAccName > cncbAccNameUnmapped
join -t$'\t' <(sort cncbAccNameUnmapped gbAccNameUnmapped) <(sort "$cncbToDate" "$gbToDate") \
| grep -vF NC_045512.2 \
| tawk '{ if ($2 == "") { $2 = "?";} if ($3 == "") { $3 = "?"; } print $1, $2 "|" $1 "|" $3; }' \
| sed -re 's/, /_/g; s/[, ]/_/g;' \
    > "$renaming"
# COG-UK sequence names already are the isolate names, so just append the date:
join -t$'\t' <(fastaNames cogUk.unmapped.fa | sort) <(sort "$cogUkToDate") \
| tawk '{print $1, $1 "|" $2;}' \
    >> "$renaming"

#*** TODO: extract submitter credits from metadata, make acknowledgements file

# Use Rob's script that aligns each sequence to NC_045512.2 and concatenates the results
# as a multifasta alignment (from which we can extract VCF with SNVs):
#conda install -c bioconda mafft
rm -f "$alignedFa"
export TMPDIR=/dev/shm
time bash ~angie/github/sarscov2phylo/scripts/global_profile_alignment.sh \
    -i unmapped.fa \
    -o "$alignedFa" \
    -t 50
faSize "$alignedFa"