22d7aeffdf9fcd45a7ec9acf96c57eafdd55b218 angie Tue Jun 3 11:16:28 2025 -0700 tweak CGI-encoding of metadataUrl... to no avail, they seem to have shut down access. diff --git src/hg/utils/otto/sarscov2phylo/getCncb.sh src/hg/utils/otto/sarscov2phylo/getCncb.sh index cd3cd7f08c4..71b7c6ce9fc 100755 --- src/hg/utils/otto/sarscov2phylo/getCncb.sh +++ src/hg/utils/otto/sarscov2phylo/getCncb.sh @@ -1,50 +1,50 @@ #!/bin/bash set -beEux -o pipefail # Download latest CNCB/NGDC fasta and metadata; update $ottoDir/cncb.latest link. scriptDir=$(dirname "${BASH_SOURCE[0]}") source $scriptDir/util.sh today=$(date +%F) ottoDir=/hive/data/outside/otto/sarscov2phylo -metadataUrl='https://ngdc.cncb.ac.cn/ncov/api/es/genome/download?q=&accession=&country=&province=&city=&host=&minCollectDate=&maxCollectDate=&source=CNGBdb,GenBase,Genome+Warehouse,NMDC&qc=&complete=&minLength=15000&maxLength=31000&ns=&ds=&lineage=&whoLabel=&latest=0&md5=0&md5value=&minSubDate=&maxSubDate=' +metadataUrl='https://ngdc.cncb.ac.cn/ncov/api/es/genome/download?q=&accession=&country=&province=&city=&host=&minCollectDate=&maxCollectDate=&source=CNGBdb,GenBase,Genome%20Warehouse,NMDC&qc=&complete=&minLength=15000&maxLength=31000&ns=&ds=&lineage=&whoLabel=&latest=0&md5=0&md5value=&minSubDate=&maxSubDate=' genBaseFileApiBase='https://ngdc.cncb.ac.cn/genbase/api/file/fasta?acc=' mkdir -p $ottoDir/cncb.$today cd $ottoDir/cncb.$today function curlRetry { local url=$* local attempt=0 - local maxAttempts=100 + local maxAttempts=20 local retryDelay=300 while [[ $((++attempt)) -le $maxAttempts ]]; do >&2 echo "curl attempt $attempt" if curl -Ss $url; then break else >&2 echo "FAILED; will try again after $retryDelay seconds" sleep $retryDelay fi done if [[ $attempt -gt $maxAttempts ]]; then - $>2 echo "curl failed $maxAttempts times; quitting." + >&2 echo "curl failed $maxAttempts times; quitting." exit 1 fi } # Discard Pango lineage column so we can keep the same column order as before & won't have to # update scripts. Also watch out for non-ASCII characters (e.g. NMDC60013002-06's "2019?" # was changed to "2019\302\240" -- change it back). curlRetry $metadataUrl \ | cut -f 1-4,6- \ | perl -wpe 's/[^[:print:]^\t^\n]+/?/g;' \ > cncb.metadata.tsv colCount=$(head -1 cncb.metadata.tsv | tawk '{print NF;}') if [[ $colCount != 16 ]]; then echo "Metadata format error: expected 16 columns, got $colCount"