src/hg/utils/otto/sarscov2phylo/getNcbi.sh 3d8092305bf17c5c01713c6ed9b6ba3c289dd8ef

3d8092305bf17c5c01713c6ed9b6ba3c289dd8ef
angie
  Thu Aug 19 11:40:27 2021 -0700
Add retries for NCBI and COG-UK fetches to get around sporadic network errors.

diff --git src/hg/utils/otto/sarscov2phylo/getNcbi.sh src/hg/utils/otto/sarscov2phylo/getNcbi.sh
index 75c4c8e..8e0e040 100755
--- src/hg/utils/otto/sarscov2phylo/getNcbi.sh
+++ src/hg/utils/otto/sarscov2phylo/getNcbi.sh
@@ -5,38 +5,56 @@
 # Download SARS-CoV-2 GenBank FASTA and metadata using NCBI Datasets API.
 # Use E-Utils to get SARS-CoV-2 metadata from BioSample.
 # Use BioSample metadata to fill in gaps in GenBank metadata and report conflicting dates.
 # Use enhanced metadata to rewrite FASTA headers for matching up with sequences in other databases.
 
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 source $scriptDir/util.sh
 
 today=$(date +%F)
 
 ottoDir=/hive/data/outside/otto/sarscov2phylo
 
 mkdir -p $ottoDir/ncbi.$today
 cd $ottoDir/ncbi.$today
 
-datasets download virus genome taxon 2697049 \
+attempt=0
+maxAttempts=5
+retryDelay=300
+while [[ $((++attempt)) -le $maxAttempts ]]; do
+    echo "datasets attempt $attempt"
+    if datasets download virus genome taxon 2697049 \
             --exclude-cds \
             --exclude-protein \
             --exclude-gpff \
             --exclude-pdb \
             --filename ncbi_dataset.zip \
         |& tail -50 \
-    > datasets.log
+            > datasets.log.$attempt; then
+        break;
+    else
+        echo "FAILED; will try again after $retryDelay seconds"
+        rm -f ncbi_dataset.zip
+        sleep $retryDelay
+        # Double the delay to give NCBI progressively more time
+        retryDelay=$(($retryDelay * 2))
+    fi
+done
+if [[ ! -f ncbi_dataset.zip ]]; then
+    echo "datasets command failed $maxAttempts times; quitting."
+    exit 1
+fi
 rm -rf ncbi_dataset
 unzip -o ncbi_dataset.zip
 # Creates ./ncbi_dataset/
 
 # This makes something just like ncbi.datasets.tsv from the /table/ API query:
 jq -c -r '[.accession, .biosample, .isolate.collectionDate, .location.geographicLocation, .host.sciName, .isolate.name, .completeness, (.length|tostring)] | join("\t")' \
     ncbi_dataset/data/data_report.jsonl \
 | sed -e 's/COMPLETE/complete/; s/PARTIAL/partial/;' \
 | sort \
     > ncbi_dataset.tsv
 
 # Use EUtils (esearch) to get all SARS-CoV-2 BioSample GI# IDs:
 $scriptDir/searchAllSarsCov2BioSample.sh
 sort all.biosample.gids.txt > all.biosample.gids.sorted.txt