e4eb9b464422dc81af82a711fd6470174ed3fb92 angie Fri Aug 26 11:56:30 2022 -0700 Add retry for all the URLs. Update to latest nextclade version. diff --git src/hg/utils/otto/sarscov2phylo/getCogUk.sh src/hg/utils/otto/sarscov2phylo/getCogUk.sh index 6cac2c6..9aab143 100755 --- src/hg/utils/otto/sarscov2phylo/getCogUk.sh +++ src/hg/utils/otto/sarscov2phylo/getCogUk.sh @@ -1,82 +1,78 @@ #!/bin/bash set -beEux -o pipefail # Download latest COG-UK fasta and metadata; update $ottoDir/cogUk.latest link. scriptDir=$(dirname "${BASH_SOURCE[0]}") source $scriptDir/util.sh today=$(date +%F) ottoDir=/hive/data/outside/otto/sarscov2phylo cogUrlBase=https://cog-uk.s3.climb.ac.uk/phylogenetics/latest mkdir -p $ottoDir/cogUk.$today cd $ottoDir/cogUk.$today # Sometimes the curl fails with a DNS error, regardless of whether my previous cron job with -# curl -I succeeded. Do multiple retries for the first URL; once it's working, it should -# continue to work for the other URLs (she said hopefully). +# curl -I succeeded. Do multiple retries per URL -- sometimes even after one succeeds, the +# next can still fail e.g. with "curl: (6) Could not resolve host: cog-uk.s3.climb.ac.uk". # On some days, the fetch started but failed partway through, even after 5 tries, so # keep the partial result around and try again with '-C -'. -attempt=0 -maxAttempts=5 -retryDelay=60 +function curlRetry { + local url=$* + local attempt=0 + local maxAttempts=5 + local retryDelay=60 while [[ $((++attempt)) -le $maxAttempts ]]; do echo "curl attempt $attempt" - if curl -S -s -C - -O $cogUrlBase/cog_all.fasta.gz; then + if curl -S -s -C - -O $url; then break else echo "FAILED; will try again after $retryDelay seconds" sleep $retryDelay fi done if [[ $attempt -gt $maxAttempts ]]; then echo "curl failed $maxAttempts times; quitting." exit 1 fi -curl -S -s -C - -O $cogUrlBase/cog_metadata.csv.gz +} + +curlRetry $cogUrlBase/cog_all.fasta.gz +curlRetry $cogUrlBase/cog_metadata.csv.gz gunzip cog_metadata.csv.gz -curl -S -s -C - -O $cogUrlBase/cog_global_tree.newick +curlRetry $cogUrlBase/cog_global_tree.newick zcat cog_all.fasta.gz | xz -T 20 > cog_all.fasta.xz rm cog_all.fasta.gz tail -n +2 cog_metadata.csv \ | awk -F, '{print $1 "\t" $5;}' | sort > cogUkToDate # Reuse nextclade assignments for older sequences; compute nextclade assignments for new seqs. zcat $ottoDir/cogUk.latest/nextclade.full.tsv.gz > nextclade.full.tsv cp $ottoDir/cogUk.latest/nextalign.fa.xz . comm -13 <(cut -f 1 nextclade.full.tsv | sort) <(fastaNames cog_all.fasta.xz | sort) \ > seqsForNextclade if [ -s seqsForNextclade ]; then - splitDir=splitForNextclade - rm -rf $splitDir - mkdir $splitDir - faSomeRecords <(xzcat cog_all.fasta.xz) seqsForNextclade stdout \ - | faSplit about stdin 300000000 $splitDir/chunk nDataDir=~angie/github/nextclade/data/sars-cov-2 - outDir=$(mktemp -d) outTsv=$(mktemp) - for chunkFa in $splitDir/chunk*.fa; do - nextclade -j 30 -i $chunkFa \ + outFa=$(mktemp) + faSomeRecords <(xzcat cog_all.fasta.xz) seqsForNextclade stdout \ + | nextclade run -j 30 \ --input-dataset $nDataDir \ - --output-dir $outDir \ - --output-basename out \ + --output-fasta $outFa \ --output-tsv $outTsv >& nextclade.log tail -n+2 $outTsv | sed -re 's/"//g;' >> nextclade.full.tsv - xz -T 20 < $outDir/out.aligned.fasta >> nextalign.fa.xz - rm -f $outTsv $outDir/* - done - rm -rf $outDir - rm -rf $splitDir + xz -T 20 < $outFa >> nextalign.fa.xz + rm -f $outTsv $outFa fi pigz -f -p 8 nextclade.full.tsv rm -f $ottoDir/cogUk.latest ln -s cogUk.$today $ottoDir/cogUk.latest rm -f ~angie/public_html/sarscov2phylo/cogUk.$today ln -s $ottoDir/cogUk.$today ~angie/public_html/sarscov2phylo/cogUk.$today