72afe80e5fc9ade5df012cfb159e39d5ee953d92
angie
  Fri Aug 26 11:59:25 2022 -0700
Add --debug to datasets command to assist in debugging failures.  Patch some corrupted associations between nucleotide IDs and BioSample IDs until the fixes percolate through all the databases.  Update to latest versions of nextclade and pangolin.

diff --git src/hg/utils/otto/sarscov2phylo/getNcbi.sh src/hg/utils/otto/sarscov2phylo/getNcbi.sh
index d997b6d..a7312b1 100755
--- src/hg/utils/otto/sarscov2phylo/getNcbi.sh
+++ src/hg/utils/otto/sarscov2phylo/getNcbi.sh
@@ -1,158 +1,169 @@
 #!/bin/bash
 source ~/.bashrc
 set -beEu -x -o pipefail
 
 # Download SARS-CoV-2 GenBank FASTA and metadata using NCBI Datasets API.
 # Use BioSample metadata to fill in gaps in GenBank metadata and report conflicting dates.
 # Use enhanced metadata to rewrite FASTA headers for matching up with sequences in other databases.
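+# Main outputs: genbank.fa.xz (renamed sequences), ncbi_dataset.plusBioSample.tsv (metadata),
+# nextclade.full.tsv.gz, nextalign.fa.xz and lineage_report.csv (nextclade & pangolin results).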
 
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 source $scriptDir/util.sh
 
 today=$(date +%F)
 
 ottoDir=/hive/data/outside/otto/sarscov2phylo
 
 mkdir -p $ottoDir/ncbi.$today
 cd $ottoDir/ncbi.$today
 
 attempt=0
 maxAttempts=5
 retryDelay=300
 #*** From Eric Cox 1/25/22 when download failed and they were debugging firewall issues:
 #             --proxy https://www.st-va.ncbi.nlm.nih.gov/datasets/v1 \
+#*** From Mirian Tsuchiya 6/3/22: add --debug; if there's a problem, send the Ncbi-Phid.
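+# Retry with exponential backoff: up to $maxAttempts attempts, doubling $retryDelay
+# (300s, 600s, 1200s, ...) after each failure before giving up.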
 while [[ $((++attempt)) -le $maxAttempts ]]; do
     echo "datasets attempt $attempt"
     if datasets download virus genome taxon 2697049 \
             --exclude-cds \
             --exclude-protein \
             --exclude-gpff \
             --exclude-pdb \
             --filename ncbi_dataset.zip \
-        |& tail -50 \
-            > datasets.log.$attempt; then
+            --no-progressbar \
+            --debug \
+            >& datasets.log.$attempt; then
         break;
     else
         echo "FAILED; will try again after $retryDelay seconds"
         rm -f ncbi_dataset.zip
         sleep $retryDelay
         # Double the delay to give NCBI progressively more time
         retryDelay=$(($retryDelay * 2))
     fi
 done
 if [[ ! -f ncbi_dataset.zip ]]; then
     echo "datasets command failed $maxAttempts times; quitting."
     exit 1
 fi
 rm -rf ncbi_dataset
 unzip -o ncbi_dataset.zip
 # Creates ./ncbi_dataset/
 
 # This makes something just like ncbi.datasets.tsv from the /table/ API query:
-jq -c -r '[.accession, .biosample, .isolate.collectionDate, .location.geographicLocation, .host.sciName, .isolate.name, .completeness, (.length|tostring)] | join("\t")' \
+time jq -c -r '[.accession, .biosample, .isolate.collectionDate, .location.geographicLocation, .host.sciName, .isolate.name, .completeness, (.length|tostring)] | join("\t")' \
     ncbi_dataset/data/data_report.jsonl \
 | sed -e 's/COMPLETE/complete/; s/PARTIAL/partial/;' \
 | sort -u \
     > ncbi_dataset.tsv
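+# Columns of ncbi_dataset.tsv: accession, BioSample, collection date, geographic location,
+# host scientific name, isolate name, completeness (complete/partial), sequence length.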
 
 time $scriptDir/bioSampleJsonToTab.py ncbi_dataset/data/biosample.jsonl | uniq > gb.bioSample.tab
 
 # Use BioSample metadata to fill in missing pieces of GenBank metadata and report conflicting
 # sample collection dates:
 $scriptDir/gbMetadataAddBioSample.pl gb.bioSample.tab ncbi_dataset.tsv \
     > ncbi_dataset.plusBioSample.tsv 2>gbMetadataAddBioSample.log
 
+# Manually patch some GB-to-BioSample associations that somehow got mangled at ENA, until
+# they fix them and the fix percolates through to NCBI...
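+# (Column 1 of gbToBioSample.changes.tsv lists the affected accessions to drop;
+# gbToBioSample.patch.tsv supplies the corrected rows in ncbi_dataset.plusBioSample.tsv format.)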
+grep -vFwf <(cut -f 1 $ottoDir/ncbi.2022-06-25/gbToBioSample.changes.tsv) \
+    ncbi_dataset.plusBioSample.tsv > tmp
+sort tmp $ottoDir/ncbi.2022-06-25/gbToBioSample.patch.tsv > ncbi_dataset.plusBioSample.tsv
+rm tmp
+
 # Make a file for joining collection date with ID:
 tawk '$3 != "" {print $1, $3;}' ncbi_dataset.plusBioSample.tsv \
 | sort > gbToDate
 
 # Replace FASTA headers with reconstructed names from enhanced metadata.
 time cleanGenbank < ncbi_dataset/data/genomic.fna \
 | $scriptDir/fixNcbiFastaNames.pl ncbi_dataset.plusBioSample.tsv \
     > genbank.maybeDups.fa
 time fastaNames genbank.maybeDups.fa | awk '{print $1 "\t" $0;}' > gb.rename
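+# gb.rename maps the first word of each name to the full reconstructed name, for faRenameRecords.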
 time faUniqify genbank.maybeDups.fa stdout \
 | faRenameRecords stdin gb.rename stdout \
 | xz -T 20 \
     > genbank.fa.xz
 
 # Run pangolin and nextclade on sequences that are new since yesterday
+# TODO: remove the split; nextclade can handle big inputs now, but truncate fasta names to
+# just the accession in a pipe to nextclade (see the sketch below).
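+# Untested sketch of that TODO (output file names here are just placeholders), reusing the
+# same dataset dir and options as the chunked run below:
+#   xzcat genbank.fa.xz | sed -re '/^>/ s/[ |].*//' > gb.accOnly.fa
+#   nextclade run -j 50 gb.accOnly.fa --input-dataset $nDataDir \
+#       --output-fasta nextalign.new.fa --output-tsv nextclade.new.tsv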
 export TMPDIR=/dev/shm
 fastaNames genbank.fa.xz | awk '{print $1;}' | sed -re 's/\|.*//' | grep -vx pdb | sort -u > gb.names
 splitDir=splitForNextclade
 rm -rf $splitDir
 mkdir $splitDir
 zcat ../ncbi.latest/nextclade.full.tsv.gz > nextclade.full.tsv
 cp ../ncbi.latest/nextalign.fa.xz .
 comm -23 gb.names <(cut -f 1 nextclade.full.tsv | sort -u) > nextclade.names
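+# nextclade.names = names in gb.names that have no row yet in yesterday's nextclade.full.tsv.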
 if [ -s nextclade.names ]; then
     faSomeRecords <(xzcat genbank.fa.xz) nextclade.names stdout \
-    | faSplit about stdin 30000000 $splitDir/chunk
+    | faSplit about stdin 300000000 $splitDir/chunk
     nDataDir=~angie/github/nextclade/data/sars-cov-2
-    outDir=$(mktemp -d)
     outTsv=$(mktemp)
+    outFa=$(mktemp)
     for chunkFa in $splitDir/chunk*.fa; do
-        nextclade -j 50 -i $chunkFa \
+        nextclade run -j 50 $chunkFa \
             --input-dataset $nDataDir \
-            --output-dir $outDir \
-            --output-basename out \
+            --output-fasta $outFa \
             --output-tsv $outTsv >& nextclade.log
         tail -n+2 $outTsv | sed -re 's/"//g;' >> nextclade.full.tsv
-        xz -T 20 < $outDir/out.aligned.fasta >> nextalign.fa.xz
-        rm $outTsv
+        xz -T 20 < $outFa >> nextalign.fa.xz
+        rm $outTsv $outFa
     done
-    rm -rf $outDir
 fi
 wc -l nextclade.full.tsv
 pigz -f -p 8 nextclade.full.tsv
 rm -rf $splitDir
 
+set +x
 conda activate pangolin
+set -x
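+# Run pangolin (pangoLEARN mode) on one fasta chunk, writing <chunk>.pangolin.csv; invoked
+# via GNU parallel below when there is no previous lineage_report.csv to build on.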
 runPangolin() {
     fa=$1
     out=$fa.pangolin.csv
     logfile=$(mktemp)
-    pangolin $fa --outfile $out > $logfile 2>&1
+    pangolin $fa --analysis-mode pangolearn --outfile $out > $logfile 2>&1
     rm $logfile
 }
 export -f runPangolin
 if [ -e ../ncbi.latest/lineage_report.csv ]; then
     cp ../ncbi.latest/lineage_report.csv linRepYesterday
     tail -n+2 linRepYesterday | sed -re 's/^([A-Z]+[0-9]+\.[0-9]+).*/\1/' | sort \
         > pangolin.prev.names
     comm -23 gb.names pangolin.prev.names > pangolin.names
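+    # pangolin.names = names in gb.names not already in yesterday's lineage_report.csv,
+    # so pangolin runs only on new sequences.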
     faSomeRecords <(xzcat genbank.fa.xz) pangolin.names pangolin.fa
-    pangolin pangolin.fa >& pangolin.log
+    pangolin --analysis-mode pangolearn pangolin.fa >& pangolin.log
     tail -n+2 lineage_report.csv >> linRepYesterday
     mv linRepYesterday lineage_report.csv
     rm -f pangolin.fa
 else
     splitDir=splitForPangolin
     rm -rf $splitDir
     mkdir $splitDir
     faSplit about <(xzcat genbank.fa.xz) 30000000 $splitDir/chunk
     find $splitDir -name chunk\*.fa \
     | parallel -j 10 "runPangolin {}"
     head -1 $(ls -1 $splitDir/chunk*.csv | head -1) > lineage_report.csv
     for f in $splitDir/chunk*.csv; do
         tail -n+2 $f >> lineage_report.csv
     done
     rm -rf $splitDir
 fi
 wc -l lineage_report.csv
 
 # It turns out that sometimes new sequences sneak into ncbi_dataset.tsv before they're included in
 # genomic.fna.  Filter those out so we don't have missing pangolin and nextclade results for
 # COG-UK sequences.  (https://github.com/theosanderson/taxodium/issues/10#issuecomment-876800176)
 grep -Fwf gb.names ncbi_dataset.plusBioSample.tsv > tmp
 wc -l tmp ncbi_dataset.plusBioSample.tsv
 mv tmp ncbi_dataset.plusBioSample.tsv
 
 rm -f $ottoDir/ncbi.latest
 ln -s ncbi.$today $ottoDir/ncbi.latest
 
 rm -f ~angie/public_html/sarscov2phylo/ncbi.$today
 ln -s $ottoDir/ncbi.$today ~angie/public_html/sarscov2phylo/ncbi.$today
 
 # Clean up
-rm -r ncbi_dataset
+rm -r ncbi_dataset genbank.maybeDups.fa