src/hg/utils/otto/sarscov2phylo/getNcbi.sh 179775232ff5fe31323f984619b75cbc77204c00

179775232ff5fe31323f984619b75cbc77204c00
angie
  Sun Jun 27 20:22:29 2021 -0700
Update nextclade to 1.0 (new required command line args).

diff --git src/hg/utils/otto/sarscov2phylo/getNcbi.sh src/hg/utils/otto/sarscov2phylo/getNcbi.sh
index 2ee7749..385deb7 100755
--- src/hg/utils/otto/sarscov2phylo/getNcbi.sh
+++ src/hg/utils/otto/sarscov2phylo/getNcbi.sh
@@ -96,34 +96,45 @@
 # Run pangolin and nextclade on sequences that are new since yesterday
 fastaNames genbank.fa.xz | awk '{print $1;}' | sed -re 's/\|.*//' | sort > gb.names
 splitDir=splitForNextclade
 rm -rf $splitDir
 mkdir $splitDir
 if [ -e ../ncbi.latest/nextclade.tsv ]; then
     cp ../ncbi.latest/nextclade.tsv .
     cut -f 1 ../ncbi.latest/nextclade.tsv | sort > nextclade.prev.names
     comm -23 gb.names nextclade.prev.names > nextclade.names
     faSomeRecords <(xzcat genbank.fa.xz) nextclade.names nextclade.fa
     faSplit about nextclade.fa 30000000 $splitDir/chunk
 else
     cp /dev/null nextclade.tsv
     faSplit about <(xzcat genbank.fa.xz) 30000000 $splitDir/chunk
 fi
+nDataDir=~angie/github/nextclade/data/sars-cov-2
+outDir=$(mktemp -d)
+outTsv=$(mktemp)
 for chunkFa in $splitDir/chunk*.fa; do
-    nextclade -j 50 -i $chunkFa -t >(cut -f 1,2 | tail -n+2 >> nextclade.tsv) >& nextclade.log
+    nextclade -j 50 -i $chunkFa \
+        --input-root-seq $nDataDir/reference.fasta \
+        --input-tree $nDataDir/tree.json \
+        --input-qc-config $nDataDir/qc.json \
+        --output-dir $outDir \
+        --output-tsv $outTsv >& nextclade.log
+    cut -f 1,2 $outTsv | tail -n+2 >> nextclade.tsv
+    rm $outTsv
 done
 wc -l nextclade.tsv
+rm -rf $outDir
 rm -rf $splitDir nextclade.fa
 
 conda activate pangolin
 runPangolin() {
     fa=$1
     out=$fa.pangolin.csv
     logfile=$(mktemp)
     pangolin $fa --outfile $out > $logfile 2>&1
     rm $logfile
 }
 export -f runPangolin
 if [ -e ../ncbi.latest/lineage_report.csv ]; then
     cp ../ncbi.latest/lineage_report.csv linRepYesterday
     tail -n+2 linRepYesterday | sed -re 's/^([A-Z]+[0-9]+\.[0-9]+).*/\1/' | sort \
         > pangolin.prev.names