99599c7130790107ff0de9f043930da6aa7fddf1
angie
  Mon Nov 16 16:35:58 2020 -0800
Scripts for automating SARS-CoV-2 Phylogeny tracks (refs #26530): fetching
sequences and metadata from several public sources, mapping GISAID IDs to
public seq IDs, downloading the latest release of the phylogenetic tree from
github.com/roblanf/sarscov2phylo/ , making VCFs from GISAID and public
sequences, and using github.com/yatisht/usher to resolve ambiguous alleles,
make protobuf files for hgPhyloPlace, and add public sequences that have not
been mapped to GISAID sequences to the sarscov2phylo tree for a comprehensive
public tree+VCF.

This is still not fully otto-mated because certain crucial inputs like
GISAID sequences still must be downloaded using a web browser, but the goal
is to automate as much as possible and maybe someday have it fully cron-driven.

There are two main top-level scripts which call other scripts, which may in turn
call scripts, in this hierarchy:

updateIdMapping.sh
getCogUk.sh
getNcbi.sh
searchAllSarsCov2BioSample.sh
bioSampleIdToText.sh
bioSampleTextToTab.pl
gbMetadataAddBioSample.pl
fixNcbiFastaNames.pl

updateSarsCov2Phylo.sh
getRelease.sh
processRelease.sh
cladeLineageColors.pl
mapPublic.sh
extractUnmappedPublic.sh
addUnmappedPublic.sh

many of the above:
util.sh

publicCredits.sh will hopefully be folded into updateSarsCov2Phylo.sh when I
figure out how to automate fetching of author/institution metadata from NCBI
and COG-UK.

diff --git src/hg/utils/otto/sarscov2phylo/extractUnmappedPublic.sh src/hg/utils/otto/sarscov2phylo/extractUnmappedPublic.sh
new file mode 100755
index 0000000..e7a49fb
--- /dev/null
+++ src/hg/utils/otto/sarscov2phylo/extractUnmappedPublic.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+set -beEu -x -o pipefail
+
+#	Do not modify this script, modify the source tree copy:
+#	kent/src/hg/utils/otto/sarscov2phylo/extractUnmappedPublic.sh
+
+usage() {
+    echo "usage: $0 epiToPublicNameAndDate genbank.fa cogUk.fa cncb.fa"
+}
+
+if [ $# != 4 ]; then
+  usage
+  exit 1
+fi
+
+epiToPublic=$1
+genbankFa=$2
+cogUkFa=$3
+cncbFa=$4
+
+minSize=29000
+
+echo "epiToPublic=$epiToPublic"
+echo "genbank=$genbankFa"
+echo "cogUk=$cogUkFa"
+echo "cncb=$cncbFa"
+
+ottoDir=/hive/data/outside/otto/sarscov2phylo
+gbToDate=$ottoDir/ncbi.latest/gbToDate
+cogUkToDate=$ottoDir/cogUk.latest/cogUkToDate
+cncbToDate=$ottoDir/cncb.latest/cncbToDate
+
+# Outputs for next script (addUnmappedPublic.sh):
+alignedFa=unmapped.aligned.fasta
+renaming=unmapped.renaming
+
+scriptDir=$(dirname "${BASH_SOURCE[0]}")
+
+source $scriptDir/util.sh
+
+# faFilter discards fasta header info after the first word, so before running it,
+# make files that associate IDs and strain names.
+fastaNames $genbankFa | cleanGenbank | sort > gbAccName
+# COG-UK has only names, so no need to associate anything.
+# CNCB puts name first, then ID, so swap the order:
+fastaNames $cncbFa | cleanCncb \
+| tawk '{print $2, $1;}' | sort > cncbAccName
+
+# Filter minSize and exclude sequences that were mapped to GISAID IDs
+join -t$'\t' tree.renaming $epiToPublic | cut -f 3 > mappedIds
+xcat $genbankFa \
+| faFilter -minSize=$minSize stdin stdout \
+| faSomeRecords -exclude stdin mappedIds genbank.unmapped.fa
+fastaSeqCount genbank.unmapped.fa
+#*** TODO: also exclude COG-UK sequences that are in GenBank (some with incomplete names)
+xcat $cogUkFa \
+| faFilter -minSize=$minSize stdin stdout \
+| faSomeRecords -exclude stdin mappedIds cogUk.unmapped.fa
+fastaSeqCount cogUk.unmapped.fa
+#*** TODO: also exclude CNCB sequences that are in GenBank (some with incomplete names)
+# Tweak CNCB's fasta headers around to use acc not name:
+xcat $cncbFa \
+| sed -re 's/^>.*\| */>/;' \
+| faFilter -minSize=$minSize stdin stdout \
+| faSomeRecords -exclude stdin mappedIds cncb.unmapped.fa
+fastaSeqCount cncb.unmapped.fa
+
+cat genbank.unmapped.fa cogUk.unmapped.fa cncb.unmapped.fa > unmapped.fa
+fastaSeqCount unmapped.fa
+
+# Make a file for renaming sequences from just the ID to isolateName|ID|date
+join -t$'\t' <(fastaNames genbank.unmapped.fa | sort) gbAccName > gbAccNameUnmapped
+join -t$'\t' <(fastaNames cncb.unmapped.fa | sort) cncbAccName > cncbAccNameUnmapped
+join -t$'\t' <(sort cncbAccNameUnmapped gbAccNameUnmapped) <(sort $cncbToDate $gbToDate) \
+| grep -vF NC_045512.2 \
+| tawk '{ if ($2 == "") { $2 = "?";} if ($3 == "") { $3 = "?"; } print $1, $2 "|" $1 "|" $3; }' \
+| sed -re 's/, /_/g; s/[, ]/_/g;' \
+    > $renaming
+join -t$'\t' <(fastaNames cogUk.unmapped.fa | sort) <(sort $cogUkToDate) \
+| tawk '{print $1, $1 "|" $2;}' \
+    >> $renaming
+
+#*** TODO: extract submitter credits from metadata, make acknowledgements file
+
+# Use Rob's script that aligns each sequence to NC_045512.2 and concatenates the results
+# as a multifasta alignment (from which we can extract VCF with SNVs):
+#conda install -c bioconda mafft
+rm -f $alignedFa
+export TMPDIR=/dev/shm
+time bash ~angie/github/sarscov2phylo/scripts/global_profile_alignment.sh \
+  -i unmapped.fa \
+  -o $alignedFa \
+  -t 50
+faSize $alignedFa
+