e2df9883de70a2ab2d08b96aaec1a62f916a426a angie Thu Jun 22 10:17:09 2023 -0700

Update datasets command line options and biosample filename for latest version.
pangoLEARN has been discontinued, so use default usher mode in pangolin and skip
scorpio for speed.

diff --git src/hg/utils/otto/sarscov2phylo/getNcbi.sh src/hg/utils/otto/sarscov2phylo/getNcbi.sh
index b3abc2a..bf672dd 100755
--- src/hg/utils/otto/sarscov2phylo/getNcbi.sh
+++ src/hg/utils/otto/sarscov2phylo/getNcbi.sh
@@ -13,32 +13,31 @@
 ottoDir=/hive/data/outside/otto/sarscov2phylo
 
 mkdir -p $ottoDir/ncbi.$today
 cd $ottoDir/ncbi.$today
 
 attempt=0
 maxAttempts=5
 retryDelay=300
 
 #*** From Eric Cox 1/25/22 when download failed and they were debugging firewall issues:
 #    --proxy https://www.st-va.ncbi.nlm.nih.gov/datasets/v1 \
 #*** From Mirian Tsuchiya 6/3/22: add --debug; if there's a problem send Ncbi-Phid.
 while [[ $((++attempt)) -le $maxAttempts ]]; do
     echo "datasets attempt $attempt"
     if datasets download virus genome taxon 2697049 \
-            --exclude-cds \
-            --exclude-protein \
+            --include genome,annotation,biosample \
             --filename ncbi_dataset.zip \
             --no-progressbar \
             --debug \
             >& datasets.log.$attempt; then
         break;
     else
         echo "FAILED; will try again after $retryDelay seconds"
         if [[ -f ncbi_dataset.zip ]]; then
             mv ncbi_dataset.zip{,.fail.$attempt}
         fi
         if [[ $attempt -lt $maxAttempts ]]; then
             sleep $retryDelay
         fi
         # Double the delay to give NCBI progressively more time
         retryDelay=$(($retryDelay * 2))
@@ -55,97 +54,90 @@
 # This makes something just like ncbi.datasets.tsv from the /table/ API query:
 time jq -c -r '[.accession, .biosample, .isolate.collectionDate, .location.geographicLocation, .host.sciName, .isolate.name, .completeness, (.length|tostring)] | join("\t")' \
     ncbi_dataset/data/data_report.jsonl \
     | sed -e 's/COMPLETE/complete/; s/PARTIAL/partial/;' \
     | sort -u \
     > ncbi_dataset.tsv
 
 # Make sure we didn't get a drastically truncated download despite apparently successful commands
 minSeqCount=6800000
 metadataLC=$(wc -l < ncbi_dataset.tsv)
 if (($metadataLC < $minSeqCount)); then
     echo "TOO FEW SEQUENCES: $metadataLC lines of ncbi_dataset.tsv, expected at least $minSeqCount"
     exit 1
 fi
 
-time $scriptDir/bioSampleJsonToTab.py ncbi_dataset/data/biosample.jsonl | uniq > gb.bioSample.tab
+time $scriptDir/bioSampleJsonToTab.py ncbi_dataset/data/biosample_report.jsonl \
+    | uniq > gb.bioSample.tab
 
 # Use BioSample metadata to fill in missing pieces of GenBank metadata and report conflicting
 # sample collection dates:
 $scriptDir/gbMetadataAddBioSample.pl gb.bioSample.tab ncbi_dataset.tsv \
     > ncbi_dataset.plusBioSample.tsv 2>gbMetadataAddBioSample.log
 
 # Manually patch some GB-to-BioSample associations that somehow got mangled at ENA, until
 # they fix them and the fix percolates through to NCBI...
 grep -vFwf <(cut -f 1 $ottoDir/ncbi.2022-06-25/gbToBioSample.changes.tsv) \
     ncbi_dataset.plusBioSample.tsv > tmp
 sort tmp $ottoDir/ncbi.2022-06-25/gbToBioSample.patch.tsv > ncbi_dataset.plusBioSample.tsv
 rm tmp
 
 # Make a file for joining collection date with ID:
 tawk '$3 != "" {print $1, $3;}' ncbi_dataset.plusBioSample.tsv \
     | sort > gbToDate
 
 # Replace FASTA headers with reconstructed names from enhanced metadata.
 time cleanGenbank < ncbi_dataset/data/genomic.fna \
     | $scriptDir/fixNcbiFastaNames.pl ncbi_dataset.plusBioSample.tsv \
     > genbank.maybeDups.fa
 time fastaNames genbank.maybeDups.fa | awk '{print $1 "\t" $0;}' > gb.rename
 time faUniqify genbank.maybeDups.fa stdout \
     | faRenameRecords stdin gb.rename stdout \
     | xz -T 20 \
     > genbank.fa.xz
 
 # Run pangolin and nextclade on sequences that are new since yesterday
-# TODO: remove the split, nextclade can handle big inputs now -- but truncate fasta names to
-# just the accession in a pipe to nextclade.
 export TMPDIR=/dev/shm
 fastaNames genbank.fa.xz | awk '{print $1;}' | sed -re 's/\|.*//' | grep -vx pdb | sort -u > gb.names
-splitDir=splitForNextclade
-rm -rf $splitDir
-mkdir $splitDir
 zcat ../ncbi.latest/nextclade.full.tsv.gz > nextclade.full.tsv
 cp ../ncbi.latest/nextalign.fa.xz .
 comm -23 gb.names <(cut -f 1 nextclade.full.tsv | sort -u) > nextclade.names
 if [ -s nextclade.names ]; then
-    faSomeRecords <(xzcat genbank.fa.xz) nextclade.names stdout \
-        | faSplit about stdin 300000000 $splitDir/chunk
     nDataDir=~angie/github/nextclade/data/sars-cov-2
     outTsv=$(mktemp)
     outFa=$(mktemp)
-    for chunkFa in $splitDir/chunk*.fa; do
-        nextclade run -j 50 $chunkFa \
+    faSomeRecords <(xzcat genbank.fa.xz) nextclade.names stdout \
+        | sed -re 's/^>([A-Z]{2}[0-9]{6}\.[0-9])[ |].*/>\1/;' \
+        | nextclade run -j 50 \
             --input-dataset $nDataDir \
             --output-fasta $outFa \
             --output-tsv $outTsv >& nextclade.log
     tail -n+2 $outTsv | sed -re 's/"//g;' >> nextclade.full.tsv
     xz -T 20 < $outFa >> nextalign.fa.xz
     rm $outTsv $outFa
-    done
 fi
 wc -l nextclade.full.tsv
 pigz -f -p 8 nextclade.full.tsv
-rm -rf $splitDir
 
 set +x
 conda activate pangolin
 set -x
 
 runPangolin() {
     fa=$1
     out=$fa.pangolin.csv
     logfile=$(mktemp)
-    pangolin $fa --analysis-mode pangolearn --outfile $out > $logfile 2>&1
+    pangolin -t 4 $fa --skip-scorpio --outfile $out > $logfile 2>&1
     rm $logfile
 }
 export -f runPangolin
 
 if [ -e ../ncbi.latest/lineage_report.csv ]; then
     cp ../ncbi.latest/lineage_report.csv linRepYesterday
     tail -n+2 linRepYesterday | sed -re 's/^([A-Z]+[0-9]+\.[0-9]+).*/\1/' | sort \
         > pangolin.prev.names
     comm -23 gb.names pangolin.prev.names > pangolin.names
     faSomeRecords <(xzcat genbank.fa.xz) pangolin.names pangolin.fa
     pangolin --analysis-mode pangolearn pangolin.fa >& pangolin.log
     tail -n+2 lineage_report.csv >> linRepYesterday
     mv linRepYesterday lineage_report.csv
     rm -f pangolin.fa
 else
     splitDir=splitForPangolin
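
The download loop in the first hunk retries datasets up to maxAttempts times, doubling
retryDelay after each failure (300s, 600s, 1200s, ...). The same backoff pattern, factored
into a reusable function as a minimal sketch; withRetries and its internals are illustrative
names, not part of getNcbi.sh:

withRetries() {
    local maxAttempts=5 delay=300 attempt=0
    while [[ $((++attempt)) -le $maxAttempts ]]; do
        # Run the wrapped command; stop retrying as soon as it succeeds.
        "$@" && return 0
        echo "attempt $attempt failed"
        if [[ $attempt -lt $maxAttempts ]]; then
            sleep $delay
        fi
        # Double the delay to give the server progressively more time.
        delay=$(($delay * 2))
    done
    return 1
}
# Usage, e.g.: withRetries datasets download virus genome taxon 2697049 --include genome,annotation,biosample ...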
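The jq one-liner flattens each JSON Lines record of ncbi_dataset/data/data_report.jsonl into
one tab-separated metadata row. To see what it emits, feed it a single made-up record that
carries only the fields the filter reads (real records have many more):

echo '{"accession":"OQ000001.1","biosample":"SAMN00000001","isolate":{"collectionDate":"2023-01-15","name":"SARS-CoV-2/human/USA/EX-1/2023"},"location":{"geographicLocation":"USA: California"},"host":{"sciName":"Homo sapiens"},"completeness":"COMPLETE","length":29903}' \
| jq -c -r '[.accession, .biosample, .isolate.collectionDate, .location.geographicLocation, .host.sciName, .isolate.name, .completeness, (.length|tostring)] | join("\t")'
# -> OQ000001.1  SAMN00000001  2023-01-15  USA: California  Homo sapiens  SARS-CoV-2/human/USA/EX-1/2023  COMPLETE  29903

Records missing a field still produce a row: jq's join treats null elements as empty strings,
so the corresponding column is simply left empty.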
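The sed stage newly piped into nextclade run implements the removed TODO: it truncates each
FASTA header to the bare versioned accession so nextclade's output joins cleanly with the
metadata. A quick check of the regex on a fabricated header:

echo '>OQ123456.1 |Severe acute respiratory syndrome coronavirus 2, complete genome' \
| sed -re 's/^>([A-Z]{2}[0-9]{6}\.[0-9])[ |].*/>\1/;'
# -> >OQ123456.1

Note the pattern only rewrites two-letter, six-digit versioned accessions followed by a space
or '|'; a header that is already just '>OQ123456.1' falls through unchanged, which is the
desired form anyway.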