f2aee59866d6bdd224a7183323f72390a861b328
angie
  Sat Apr 22 18:17:02 2023 -0700
Auto-download new GenBase sequences from China National Center for Bioinformation.

diff --git src/hg/utils/otto/sarscov2phylo/getCncb.sh src/hg/utils/otto/sarscov2phylo/getCncb.sh
new file mode 100755
index 0000000..49dba58
--- /dev/null
+++ src/hg/utils/otto/sarscov2phylo/getCncb.sh
@@ -0,0 +1,131 @@
+#!/bin/bash
+
+set -beEux -o pipefail
+
+# Download latest CNCB/NGDC fasta and metadata; update $ottoDir/cncb.latest link.
+
+scriptDir=$(dirname "${BASH_SOURCE[0]}")
+source $scriptDir/util.sh
+
+today=$(date +%F)
+
+ottoDir=/hive/data/outside/otto/sarscov2phylo
+
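+# metadataUrl restricts the query to sequences from CNGBdb, GenBase, Genome Warehouse and NMDC
+# that are between 15000 and 31000 bases long; genBaseFileApiBase returns the fasta record for
+# the single GenBase accession appended after "acc=".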
+metadataUrl='https://ngdc.cncb.ac.cn/ncov/api/es/genome/download?q=&accession=&country=&province=&city=&host=&minCollectDate=&maxCollectDate=&source=CNGBdb,GenBase,Genome+Warehouse,NMDC&qc=&complete=&minLength=15000&maxLength=31000&ns=&ds=&lineage=&whoLabel=&latest=0&md5=0&md5value=&minSubDate=&maxSubDate='
+genBaseFileApiBase='https://ngdc.cncb.ac.cn/genbase/api/file/fasta?acc='
+
+mkdir -p $ottoDir/cncb.$today
+cd $ottoDir/cncb.$today
+
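+# curlRetry <url>: stream the response body to stdout, retrying up to $maxAttempts times with
+# a $retryDelay-second pause after each failure; exit the script if every attempt fails.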
+function curlRetry {
+    local url=$*
+    local attempt=0
+    local maxAttempts=100
+    local retryDelay=300
+    while [[ $((++attempt)) -le $maxAttempts ]]; do
+        >&2 echo "curl attempt $attempt"
+        if curl -Ss $url; then
+            break
+        else
+            >&2 echo "FAILED; will try again after $retryDelay seconds"
+            sleep $retryDelay
+        fi
+    done
+    if [[ $attempt -gt $maxAttempts ]]; then
+        >&2 echo "curl failed $maxAttempts times; quitting."
+        exit 1
+    fi
+}
+
+# Discard Pango lineage column so we can keep the same column order as before & won't have to
+# update scripts.  Also watch out for non-ASCII characters (e.g. NMDC60013002-06's "2019?"
+# was changed to "2019\302\240" -- change it back).
+curlRetry $metadataUrl \
+| cut -f 1-4,6- \
+| perl -wpe 's/[^[:print:]^\t^\n]+/?/g;' \
+    > cncb.metadata.tsv
+
+# Make a cncbToDate file for ID mapping.
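+# The result is expected to be one "<cncbId><TAB><collectionDate>" line per sequence
+# (cleanCncb is assumed to be defined in the sourced util.sh).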
+tail -n+2 cncb.metadata.tsv \
+| cut -f 1,10 \
+| cleanCncb \
+| sed -re 's/ //;' \
+    > cncbToDate
+
+# Make a renaming file to translate from accessions to the 'name | acc' headers expected by
+# Chris's ID-matching pipeline.  Exclude sequences that are also in GenBank.  If for some reason
+# the EPI_ ID is listed as primary and CNCB as secondary, swap them.
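+# Each output line is "<acc><TAB><name> | <acc>" (assuming tawk prints tab-separated fields).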
+tail -n+2 cncb.metadata.tsv \
+| grep -vE '[,'$'\t''][A-Z]{2}[0-9]{6}\.[0-9]+['$'\t'',]' \
+| tawk '{ if ($4 == "null") { $4 = ""; } if ($2 ~ /^EPI_ISL/) { tmp = $2; $2 = $4; $4 = tmp; } print $1, $2; }' \
+| cleanCncb \
+| sed -re 's/ //;' \
+| tawk '{print $2, $1 " | " $2;}' \
+| sort \
+    > accToNameBarAcc.tsv
+
+# Find the sequences that are in the metadata but have not yet been downloaded.
+fastaNames ../cncb.latest/cncb.nonGenBank.acc.fasta.xz | sort > prevAccs
+comm -23 <(cut -f 1 accToNameBarAcc.tsv) prevAccs | sort > missingIDs
+
+# Use the GenBase API to download the missing GenBase sequences; unfortunately it serves only
+# one sequence at a time.
+tail -n+2 cncb.metadata.tsv \
+| grep -vE '[,'$'\t''][A-Z]{2}[0-9]{6}\.[0-9]+['$'\t'',]' \
+| tawk '{ if ($4 == "null") { $4 = ""; } if ($2 ~ /^EPI_ISL/) { tmp = $2; $2 = $4; $4 = tmp; } print $2; }' \
+    > nonGenBankIds
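+# grep exits non-zero when it finds no matches; relax pipefail (and end the pipeline with cat)
+# so an empty result does not kill the script under set -e.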
+set +o pipefail
+grep ^C_ missingIDs \
+| grep -Fwf - nonGenBankIds \
+| cat \
+    > missingGenBaseIds
+set -o pipefail
+for acc in $(cat missingGenBaseIds); do
+    curlRetry "$genBaseFileApiBase$acc" \
+    | sed -re '/^>/  s/ .*//;'
+    sleep 10
+done > new.accs.fa
+
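+# Append the newly downloaded sequences to the cumulative non-GenBank fasta and recompress.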
+cat <(xzcat ../cncb.latest/cncb.nonGenBank.acc.fasta.xz) new.accs.fa \
+| xz -T 20 > cncb.nonGenBank.acc.fasta.new.xz
+mv cncb.nonGenBank.acc.fasta.new.xz cncb.nonGenBank.acc.fasta.xz
+
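+# Subset to the accessions in accToNameBarAcc.tsv and rename the fasta headers to "name | acc".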
+xzcat cncb.nonGenBank.acc.fasta.xz \
+| faSomeRecords stdin <(cut -f 1 accToNameBarAcc.tsv) stdout \
+| faRenameRecords stdin accToNameBarAcc.tsv cncb.nonGenBank.fasta
+
+# Run nextclade
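+# nextclade.tsv accumulates the first two columns of the full output across runs;
+# nextclade.full.tsv.gz accumulates the complete per-sequence results.  Only new sequences are run.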
+cp ../cncb.latest/nextclade.full.tsv.gz .
+cp ../cncb.latest/nextclade.tsv .
+if [ -s new.accs.fa ]; then
+    nDataDir=~angie/github/nextclade/data/sars-cov-2
+    time nextclade run -j 20 new.accs.fa \
+        --input-dataset $nDataDir \
+        --output-fasta nextalign.fa.xz \
+        --output-tsv nextclade.new.full.tsv.gz  >& nextclade.log
+    zcat nextclade.new.full.tsv.gz | cut -f 1,2 | tail -n+2 >> nextclade.tsv
+    sort -u nextclade.tsv > tmp
+    mv tmp nextclade.tsv
+    cat nextclade.new.full.tsv.gz >> nextclade.full.tsv.gz
+fi
+
+# Run pangolin
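+# pangolin.tsv accumulates the taxon and lineage columns of lineages.csv across runs;
+# again, only the newly downloaded sequences are classified.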
+cp ../cncb.latest/pangolin.tsv .
+if [ -s new.accs.fa ]; then
+    set +x
+    . ~/.bashrc
+    conda activate pangolin
+    set -x
+    set -e
+    time pangolin -t 20 new.accs.fa --analysis-mode pangolearn --outfile lineages.csv \
+        >& pangolin.log
+    awk -F, '{print $1 "\t" $2;}' lineages.csv | tail -n+2 >> pangolin.tsv
+    sort -u pangolin.tsv > tmp
+    mv tmp pangolin.tsv
+    conda deactivate
+fi
+
+rm new.accs.fa
+
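+# Point the cncb.latest link at today's download.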
+rm -f $ottoDir/cncb.latest
+ln -s cncb.$today $ottoDir/cncb.latest