src/hg/utils/otto/sarscov2phylo/getNcbi.sh 07d759a94ed39950bd7634e0f3e0cc717d7a7b3c

07d759a94ed39950bd7634e0f3e0cc717d7a7b3c
angie
  Tue May 14 16:26:03 2024 -0700
Remove unused annotation option from datasets download command; split into larger chunks when rerunning pangolin on all sequences.

diff --git src/hg/utils/otto/sarscov2phylo/getNcbi.sh src/hg/utils/otto/sarscov2phylo/getNcbi.sh
index 9cbdbea..ef56ce8 100755
--- src/hg/utils/otto/sarscov2phylo/getNcbi.sh
+++ src/hg/utils/otto/sarscov2phylo/getNcbi.sh
@@ -13,35 +13,36 @@
 
 ottoDir=/hive/data/outside/otto/sarscov2phylo
 
 mkdir -p $ottoDir/ncbi.$today
 cd $ottoDir/ncbi.$today
 
 attempt=0
 maxAttempts=5
 retryDelay=300
 #*** From Eric Cox 1/25/22 when download failed and they were debugging firewall issues:
 #             --proxy https://www.st-va.ncbi.nlm.nih.gov/datasets/v1 \
 #*** From Mirian Tsuchiya 6/3/22: add --debug; if there's a problem send Ncbi-Phid.
 while [[ $((++attempt)) -le $maxAttempts ]]; do
     echo "datasets attempt $attempt"
     if datasets download virus genome taxon 2697049 \
-            --include genome,annotation,biosample \
+            --include genome,biosample \
             --filename ncbi_dataset.zip \
             --no-progressbar \
             --debug \
             >& datasets.log.$attempt; then
+        head -c 10000 datasets.log.$attempt > tmp && mv tmp datasets.log.$attempt
         break;
     else
         echo "FAILED; will try again after $retryDelay seconds"
         if [[ -f ncbi_dataset.zip ]]; then
             mv ncbi_dataset.zip{,.fail.$attempt}
         fi
         if [[ $attempt -lt $maxAttempts ]]; then
             sleep $retryDelay
         fi
         # Double the delay to give NCBI progressively more time
         retryDelay=$(($retryDelay * 2))
     fi
 done
 if [[ ! -f ncbi_dataset.zip ]]; then
     echo "datasets command failed $maxAttempts times; quitting."
@@ -126,31 +127,31 @@
     cp ../ncbi.latest/lineage_report.csv linRepYesterday
     tail -n+2 linRepYesterday | sed -re 's/^([A-Z]+[0-9]+\.[0-9]+).*/\1/' | sort \
         > pangolin.prev.names
     comm -23 gb.names pangolin.prev.names > pangolin.names
     faSomeRecords <(xzcat genbank.fa.xz) pangolin.names stdout \
     | sed -re '/^>/  s/^>([A-Z]{2}[0-9]{6}\.[0-9]+) \|[^,]+/>\1/;' \
         > pangolin.fa
     pangolin -t 20 --skip-scorpio pangolin.fa >& pangolin.log
     tail -n+2 lineage_report.csv >> linRepYesterday
     mv linRepYesterday lineage_report.csv
     rm -f pangolin.fa
 else
     splitDir=splitForPangolin
     rm -rf $splitDir
     mkdir $splitDir
-    faSplit about <(xzcat genbank.fa.xz | sed -re '/^>/ s/ .*//;') 30000000 $splitDir/chunk
+    faSplit about <(xzcat genbank.fa.xz | sed -re '/^>/ s/ .*//;') 300000000 $splitDir/chunk
     find $splitDir -name chunk\*.fa \
     | parallel -j 10 "runPangolin {}"
     head -1 $(ls -1 $splitDir/chunk*.csv | head -1) > lineage_report.csv
     for f in $splitDir/chunk*.csv; do
         tail -n+2 $f >> lineage_report.csv
     done
     rm -rf $splitDir
 fi
 wc -l lineage_report.csv
 
 # It turns out that sometimes new sequences sneak into ncbi_dataset.tsv before they're included in
 # genomic.fna.  Filter those out so we don't have missing pangolin and nextclade for just-added
 # COG-UK sequences.  (https://github.com/theosanderson/taxodium/issues/10#issuecomment-876800176)
 grep -Fwf gb.names ncbi_dataset.plusBioSample.tsv > tmp
 wc -l tmp ncbi_dataset.plusBioSample.tsv