#!/bin/bash
# Provenance: added as src/hg/utils/otto/sarscov2phylo/runNextclade.sh in commit
# 1d99af58ff1f767eb3497d666eeacc1ab8ac166a (angie, Mon Feb 14 12:12:37 2022 -0800):
#   Script for running nextclade on compressed or uncompressed fasta, keeping full TSV &
#   aligned fasta output, suitable for running on cluster.
#
# Usage: runNextclade.sh faIn outBase
#   faIn     fasta input: xz-compressed (*.xz), gzip-compressed (*.gz) or uncompressed
#   outBase  prefix for the two output files:
#              $outBase.nextclade.full.tsv.gz   nextclade TSV output, gzipped
#              $outBase.nextalign.fasta.xz      aligned fasta, xz-compressed

# nextclade is installed in conda (base environment), so we need ~/.bashrc setup.
# The rc file is sourced before strict mode is switched on, so it does not run under -e/-u.
source ~/.bashrc
set -beEu -o pipefail

# Fail with a readable message instead of an "unbound variable" error from set -u.
# Extra arguments are ignored, as before.
if (( $# < 2 )); then
    echo "usage: ${0##*/} faIn outBase" >&2
    exit 1
fi

faIn=$1
outBase=$2

# The input is read inside a process substitution below, where a failing decompressor
# would not abort this script; so verify the file up front.
if [[ ! -r "$faIn" ]]; then
    echo "${0##*/}: cannot read input file '$faIn'" >&2
    exit 1
fi

# Write each file argument to stdout, picking the decompressor by file extension:
# xz -> xzcat, gz -> zcat, anything else -> cat.
xcat () {
    local f
    for f in "$@"; do
        case "${f##*.}" in
            xz) xzcat -- "$f" ;;
            gz) zcat -- "$f" ;;
            *)  cat -- "$f" ;;
        esac
    done
}
export -f xcat

logfile=""
outDir=""

# EXIT handler: on failure replay the captured nextclade output to stderr (it would
# otherwise be lost with the temp file), then remove the temp log and temp directory.
cleanup () {
    local status=$?
    if (( status != 0 )) && [[ -n "$logfile" && -s "$logfile" ]]; then
        echo "${0##*/}: failed with status $status; nextclade log follows:" >&2
        cat -- "$logfile" >&2 || true
    fi
    if [[ -n "$logfile" ]]; then
        rm -f -- "$logfile" || true
    fi
    if [[ -n "$outDir" ]]; then
        rm -rf -- "$outDir" || true
    fi
}
trap cleanup EXIT

logfile=$(mktemp)
outDir=$(mktemp -d)

nDataDir=~angie/github/nextclade/data/sars-cov-2

# The sed filter rewrites a leading ">hCoV-19/" (or ">hCov-19/", any number of slashes)
# to ">" and deletes every space, single quote, comma and parenthesis from the stream.
# NOTE(review): --jobs 1 presumably keeps each cluster job on a single core -- confirm.
nextclade run \
    -i <(xcat "$faIn" | sed -re "s@^>hCo[Vv]-19/+@>@; s/[ ',()]//g;") \
    --input-dataset "$nDataDir" \
    --output-dir "$outDir" \
    --output-tsv "$outBase.nextclade.full.tsv" \
    --output-basename out \
    --jobs 1 \
    > "$logfile" 2>&1

# Compress the TSV in place, and the aligned fasta in the temp dir before copying it out.
gzip -f -- "$outBase.nextclade.full.tsv"
xz -f -- "$outDir/out.aligned.fasta"
cp -p -- "$outDir/out.aligned.fasta.xz" "$outBase.nextalign.fasta.xz"

# Temp log and temp directory are removed by the EXIT trap.