src/hg/utils/otto/mpxv/getNcbiMpxv.sh f9ee0c7ea6540636411b7ff4a6ed54ab574701e2

f9ee0c7ea6540636411b7ff4a6ed54ab574701e2
angie
  Tue Jun 3 11:04:31 2025 -0700
Add Clade I.

diff --git src/hg/utils/otto/mpxv/getNcbiMpxv.sh src/hg/utils/otto/mpxv/getNcbiMpxv.sh
index 73708693cec..410d6768e93 100755
--- src/hg/utils/otto/mpxv/getNcbiMpxv.sh
+++ src/hg/utils/otto/mpxv/getNcbiMpxv.sh
@@ -49,31 +49,31 @@
     echo "metadata query failed $maxAttempts times; quitting."
     exit 1
 fi
 wc -l metadata.tsv
 
 if [[ ! -s metadata.tsv ]]; then
     echo "metadata query appeared to succeed but gave 0-length output"
     exit 1
 fi
 
 attempt=0
 maxAttempts=5
 retryDelay=300
 while [[ $((++attempt)) -le $maxAttempts ]]; do
     echo "fasta attempt $attempt"
-    if datasets download virus genome taxon $taxId --include genome,biosample; then
+    if datasets download virus genome taxon $taxId --include genome,biosample --no-progressbar; then
         break;
     else
         echo "FAILED fasta; will try again after $retryDelay seconds"
         rm -f ncbi_dataset.zip
         sleep $retryDelay
         # Double the delay to give NCBI progressively more time
         retryDelay=$(($retryDelay * 2))
     fi
 done
 if [[ ! -s ncbi_dataset.zip ]]; then
     echo "fasta query failed $maxAttempts times; quitting."
     exit 1
 fi
 unzip ncbi_dataset.zip
 faFilter -minSize=$minSize ncbi_dataset/data/genomic.fna stdout \
@@ -90,26 +90,56 @@
     else if ($1 ~ /^MT903342/) { $5 = "2019-04-30"; }
     else if ($1 ~ /^MT903342/) { $5 = "2019-05"; }
     else if ($1 ~ /^MT903343/) { $5 = "2018-09"; }
     else if ($1 ~ /^MT903344/) { $5 = "2018-09"; }
     else if ($1 ~ /^MT903345/) { $5 = "2018-09"; }
     print;
 }' metadata.tsv \
 | tawk '$5 ~ /^20(1[7-9]|2)/ &&
         $4 != "Central African Republic" && $4 != "'"Cote d'Ivoire"'" &&
         ($8 == "Homo sapiens" || $8 == "")' \
     > metadata.2017outbreak.tsv
 wc -l metadata.2017outbreak.tsv
 
 # Run nextclade on the lot (takes only a few seconds as of 2022-07-25) to get alignments,
 # QC, clade assignments.
+# Clade II 2017 outbreak:
 nextclade dataset get --name hMPXV --output-zip hMPXV.zip
 time nextclade run \
     -D hMPXV.zip \
     -j 30 \
-    --output-tsv nextclade.tsv \
-    --output-fasta nextalign.fa.xz \
+    --output-tsv nextclade.II.tsv.gz \
+    --output-fasta nextalign.II.fa.xz \
     --retry-reverse-complement true \
     genbank.fa.xz
 
+# Clade I: Nextstrain uses a different reference (DQ011155.1 Zaire_1979-005 instead of
+# RefSeq NC_003310.1 AF380138.1 Zaire-96-I-16); use the Nextstrain dataset for clade assignments,
+# but run nextclade separately with the RefSeq sequence for sequence alignments.
+nextclade dataset get --name nextstrain/mpox/clade-i --output-zip mpox_clade-i.zip
+nextclade run \
+    -D mpox_clade-i.zip \
+    -j 30 \
+    --output-tsv nextclade.I.tsv.gz \
+    --retry-reverse-complement true \
+    genbank.fa.xz
+
+# Use alignment parameters from pathogen.json in mpox_clade-i.zip
+nextclade run \
+    --input-ref $mpxvDir/GCF_000857045.1.NC_003310.1.fa \
+    --excess-bandwidth 100 \
+    --terminal-bandwidth 300 \
+    --allowed-mismatches 8 \
+    --window-size 40 \
+    --min-seed-cover 0.1 \
+    --output-fasta nextalign.I.fa.xz \
+    genbank.fa.xz
+
+# Extract metadata for only clade I sequences
+zcat nextclade.I.tsv.gz \
+| tawk '$3 ~ /^I[^I]/ {print $2;}' \
+| grep -Fwf - metadata.tsv \
+    > metadata.cladeI.tsv
+
+
 rm -f $mpxvDir/ncbi/ncbi.latest
 ln -s ncbi.$today $mpxvDir/ncbi/ncbi.latest