07d759a94ed39950bd7634e0f3e0cc717d7a7b3c angie Tue May 14 16:26:03 2024 -0700 Remove unused annotation option from datasets download command; split into larger chunks when rerunning pangolin on all sequences. diff --git src/hg/utils/otto/sarscov2phylo/getNcbi.sh src/hg/utils/otto/sarscov2phylo/getNcbi.sh index 9cbdbea..ef56ce8 100755 --- src/hg/utils/otto/sarscov2phylo/getNcbi.sh +++ src/hg/utils/otto/sarscov2phylo/getNcbi.sh @@ -13,35 +13,36 @@ ottoDir=/hive/data/outside/otto/sarscov2phylo mkdir -p $ottoDir/ncbi.$today cd $ottoDir/ncbi.$today attempt=0 maxAttempts=5 retryDelay=300 #*** From Eric Cox 1/25/22 when download failed and they were debugging firewall issues: # --proxy https://www.st-va.ncbi.nlm.nih.gov/datasets/v1 \ #*** From Mirian Tsuchiya 6/3/22: add --debug; if there's a problem send Ncbi-Phid. while [[ $((++attempt)) -le $maxAttempts ]]; do echo "datasets attempt $attempt" if datasets download virus genome taxon 2697049 \ - --include genome,annotation,biosample \ + --include genome,biosample \ --filename ncbi_dataset.zip \ --no-progressbar \ --debug \ >& datasets.log.$attempt; then + head -c 10000 datasets.log.$attempt > tmp && mv tmp datasets.log.$attempt break; else echo "FAILED; will try again after $retryDelay seconds" if [[ -f ncbi_dataset.zip ]]; then mv ncbi_dataset.zip{,.fail.$attempt} fi if [[ $attempt -lt $maxAttempts ]]; then sleep $retryDelay fi # Double the delay to give NCBI progressively more time retryDelay=$(($retryDelay * 2)) fi done if [[ ! -f ncbi_dataset.zip ]]; then echo "datasets command failed $maxAttempts times; quitting." @@ -126,31 +127,31 @@ cp ../ncbi.latest/lineage_report.csv linRepYesterday tail -n+2 linRepYesterday | sed -re 's/^([A-Z]+[0-9]+\.[0-9]+).*/\1/' | sort \ > pangolin.prev.names comm -23 gb.names pangolin.prev.names > pangolin.names faSomeRecords <(xzcat genbank.fa.xz) pangolin.names stdout \ | sed -re '/^>/ s/^>([A-Z]{2}[0-9]{6}\.[0-9]+) \|[^,]+/>\1/;' \ > pangolin.fa pangolin -t 20 --skip-scorpio pangolin.fa >& pangolin.log tail -n+2 lineage_report.csv >> linRepYesterday mv linRepYesterday lineage_report.csv rm -f pangolin.fa else splitDir=splitForPangolin rm -rf $splitDir mkdir $splitDir - faSplit about <(xzcat genbank.fa.xz | sed -re '/^>/ s/ .*//;') 30000000 $splitDir/chunk + faSplit about <(xzcat genbank.fa.xz | sed -re '/^>/ s/ .*//;') 300000000 $splitDir/chunk find $splitDir -name chunk\*.fa \ | parallel -j 10 "runPangolin {}" head -1 $(ls -1 $splitDir/chunk*.csv | head -1) > lineage_report.csv for f in $splitDir/chunk*.csv; do tail -n+2 $f >> lineage_report.csv done rm -rf $splitDir fi wc -l lineage_report.csv # It turns out that sometimes new sequences sneak into ncbi_dataset.tsv before they're included in # genomic.fna. Filter those out so we don't have missing pangolin and nextclade for just-added # COG-UK sequences. (https://github.com/theosanderson/taxodium/issues/10#issuecomment-876800176) grep -Fwf gb.names ncbi_dataset.plusBioSample.tsv > tmp wc -l tmp ncbi_dataset.plusBioSample.tsv