265430be8faea69b3c41caa789e9a58a0ac4ebb8 angie Fri Dec 20 15:25:04 2024 -0800 Suppress progressbar when downloading, clean up when done. diff --git src/hg/utils/otto/dengue/getNcbiDengue.sh src/hg/utils/otto/dengue/getNcbiDengue.sh index f591004..ba7c5c2 100755 --- src/hg/utils/otto/dengue/getNcbiDengue.sh +++ src/hg/utils/otto/dengue/getNcbiDengue.sh @@ -1,81 +1,83 @@ #!/bin/bash source ~/.bashrc set -beEu -x -o pipefail # Download all subtypes (1-4) of dengue virus (taxid 12637) from NCBI. # assembly Organism name NC_ taxid GenBank # GCF_000862125.1 Dengue virus 1 NC_001477.1 11053 U88536.1 (clone 45AZ5) # GCF_000871845.1 Dengue virus 2 NC_001474.2 11060 U87411.1 (Thailand/16681/84) # GCF_000866625.1 Dengue virus 3 NC_001475.2 11069 AY099336.1 (D3/H/IMTSSA-SRI/2000/1266) # GCF_000865065.1 Dengue virus 4 NC_002640.1 11070 NC_002640.1 (recombinant clone rDEN4) today=$(date +%F) dengueDir=/hive/data/outside/otto/dengue minSize=9000 mkdir -p $dengueDir/ncbi/ncbi.$today cd $dengueDir/ncbi/ncbi.$today # Download all dengue sequences, sort out subtypes later. taxId=12637 # Thank you Nextstrain (monkeypox/ingest/bin/genbank-url) for query format: metadataUrl='https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/?fq=%7B%21tag%3DSeqType_s%7DSeqType_s%3A%28%22Nucleotide%22%29&fq=VirusLineageId_ss%3A%28'$taxId'%29&q=%2A%3A%2A&cmd=download&dlfmt=csv&fl=genbank_accession_rev%3AAccVer_s%2Cisolate%3AIsolate_s%2Cregion%3ARegion_s%2Clocation%3ACountryFull_s%2Ccollected%3ACollectionDate_s%2Csubmitted%3ACreateDate_dt%2Clength%3ASLen_i%2Chost%3AHost_s%2Cbioproject_accession%3ABioProject_s%2Cbiosample_accession%3ABioSample_s%2Csra_accession%3ASRALink_csv%2Ctitle%3ADefinition_s%2Cauthors%3AAuthors_csv%2Cpublications%3APubMed_csv%2Cstrain%3AStrain_s%2Cserotype%3ASerotype_s&sort=id+asc&email='$USER'@soe.ucsc.edu' attempt=0 maxAttempts=5 retryDelay=300 while [[ $((++attempt)) -le $maxAttempts ]]; do echo "metadata attempt $attempt" if curl -fSs $metadataUrl | csvToTab | tawk '$7 >= '$minSize > metadata.tsv; then break; else echo "FAILED metadata; will try again after $retryDelay seconds" rm -f metadata.tsv sleep $retryDelay # Double the delay to give NCBI progressively more time retryDelay=$(($retryDelay * 2)) fi done if [[ ! -f metadata.tsv ]]; then echo "datasets command failed $maxAttempts times; quitting." exit 1 fi wc -l metadata.tsv attempt=0 maxAttempts=5 retryDelay=300 while [[ $((++attempt)) -le $maxAttempts ]]; do echo "fasta attempt $attempt" - if datasets download virus genome taxon $taxId --include genome,biosample; then + if datasets download virus --no-progressbar genome taxon $taxId --include genome,biosample; then break; else echo "FAILED fasta; will try again after $retryDelay seconds" rm -f ncbi_dataset.zip sleep $retryDelay # Double the delay to give NCBI progressively more time retryDelay=$(($retryDelay * 2)) fi done if [[ ! -s ncbi_dataset.zip ]]; then echo "fasta query failed $maxAttempts times; quitting." exit 1 fi unzip ncbi_dataset.zip faFilter -minSize=$minSize ncbi_dataset/data/genomic.fna stdout \ | xz -T 20 > genbank.fa.xz faSize <(xzcat genbank.fa.xz) # Make sure the download wasn't truncated without reporting an error: count=$(wc -l < metadata.tsv) minSamples=12000 if (( $count < $minSamples )); then echo "*** Too few samples ($count)! Expected at least $minSamples. Halting. ***" exit 1 fi +rm -rf ncbi_dataset ncbi_dataset.zip + rm -f $dengueDir/ncbi/ncbi.latest ln -s ncbi.$today $dengueDir/ncbi/ncbi.latest