d80e5aa50badf8d4b1e0fbadfb5cd346b812d18e angie Fri Dec 2 11:09:52 2022 -0800 Remove outdated options from datasets command; don't waste time sleeping after final failure. diff --git src/hg/utils/otto/sarscov2phylo/getNcbi.sh src/hg/utils/otto/sarscov2phylo/getNcbi.sh index a7312b1..89be6d8 100755 --- src/hg/utils/otto/sarscov2phylo/getNcbi.sh +++ src/hg/utils/otto/sarscov2phylo/getNcbi.sh @@ -15,41 +15,41 @@ mkdir -p $ottoDir/ncbi.$today cd $ottoDir/ncbi.$today attempt=0 maxAttempts=5 retryDelay=300 #*** From Eric Cox 1/25/22 when download failed and they were debugging firewall issues: # --proxy https://www.st-va.ncbi.nlm.nih.gov/datasets/v1 \ #*** From Mirian Tsuchiya 6/3/22: add --debug; if there's a problem send Ncbi-Phid. while [[ $((++attempt)) -le $maxAttempts ]]; do echo "datasets attempt $attempt" if datasets download virus genome taxon 2697049 \ --exclude-cds \ --exclude-protein \ - --exclude-gpff \ - --exclude-pdb \ --filename ncbi_dataset.zip \ --no-progressbar \ --debug \ >& datasets.log.$attempt; then break; else echo "FAILED; will try again after $retryDelay seconds" - rm -f ncbi_dataset.zip + mv ncbi_dataset.zip{,.fail.$attempt} + if [[ $attempt -lt $maxAttempts ]]; then sleep $retryDelay + fi # Double the delay to give NCBI progressively more time retryDelay=$(($retryDelay * 2)) fi done if [[ ! -f ncbi_dataset.zip ]]; then echo "datasets command failed $maxAttempts times; quitting." exit 1 fi rm -rf ncbi_dataset unzip -o ncbi_dataset.zip # Creates ./ncbi_dataset/ # This makes something just like ncbi.datasets.tsv from the /table/ API query: time jq -c -r '[.accession, .biosample, .isolate.collectionDate, .location.geographicLocation, .host.sciName, .isolate.name, .completeness, (.length|tostring)] | join("\t")' \ ncbi_dataset/data/data_report.jsonl \