src/hg/utils/otto/sarscov2phylo/getNcbi.sh b038619ca5e7c1bf6ab5810bfb004dc4c702266b

b038619ca5e7c1bf6ab5810bfb004dc4c702266b
angie
  Tue Feb 22 11:02:56 2022 -0800
Use latest version of nextclade and keep full output.

diff --git src/hg/utils/otto/sarscov2phylo/getNcbi.sh src/hg/utils/otto/sarscov2phylo/getNcbi.sh
index 79e9964..542f85f 100755
--- src/hg/utils/otto/sarscov2phylo/getNcbi.sh
+++ src/hg/utils/otto/sarscov2phylo/getNcbi.sh
@@ -7,30 +7,32 @@
 # Use enhanced metadata to rewrite FASTA headers for matching up with sequences in other databases.
 
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 source $scriptDir/util.sh
 
 today=$(date +%F)
 
 ottoDir=/hive/data/outside/otto/sarscov2phylo
 
 mkdir -p $ottoDir/ncbi.$today
 cd $ottoDir/ncbi.$today
 
 attempt=0
 maxAttempts=5
 retryDelay=300
+#*** From Eric Cox 1/25/22 when download failed and they were debugging firewall issues:
+#             --proxy https://www.st-va.ncbi.nlm.nih.gov/datasets/v1 \
 while [[ $((++attempt)) -le $maxAttempts ]]; do
     echo "datasets attempt $attempt"
     if datasets download virus genome taxon 2697049 \
             --exclude-cds \
             --exclude-protein \
             --exclude-gpff \
             --exclude-pdb \
             --filename ncbi_dataset.zip \
         |& tail -50 \
             > datasets.log.$attempt; then
         break;
     else
         echo "FAILED; will try again after $retryDelay seconds"
         rm -f ncbi_dataset.zip
         sleep $retryDelay
@@ -55,67 +57,63 @@
 
 time $scriptDir/bioSampleJsonToTab.py ncbi_dataset/data/biosample.jsonl > gb.bioSample.tab
 
 # Use BioSample metadata to fill in missing pieces of GenBank metadata and report conflicting
 # sample collection dates:
 $scriptDir/gbMetadataAddBioSample.pl gb.bioSample.tab ncbi_dataset.tsv \
     > ncbi_dataset.plusBioSample.tsv 2>gbMetadataAddBioSample.log
 
 # Make a file for joining collection date with ID:
 tawk '$3 != "" {print $1, $3;}' ncbi_dataset.plusBioSample.tsv \
 | sort > gbToDate
 
 # Replace FASTA headers with reconstructed names from enhanced metadata.
 time cleanGenbank < ncbi_dataset/data/genomic.fna \
 | $scriptDir/fixNcbiFastaNames.pl ncbi_dataset.plusBioSample.tsv \
-| xz -T 8 \
+| xz -T 20 \
     > genbank.fa.xz
 
 # Run pangolin and nextclade on sequences that are new since yesterday
 export TMPDIR=/dev/shm
 fastaNames genbank.fa.xz | awk '{print $1;}' | sed -re 's/\|.*//' | grep -vx pdb | sort -u > gb.names
 splitDir=splitForNextclade
 rm -rf $splitDir
 mkdir $splitDir
-if [ -e ../ncbi.latest/nextclade.tsv ]; then
-    cp ../ncbi.latest/nextclade.tsv .
-    cut -f 1 nextclade.tsv | sort -u > nextclade.prev.names
-    comm -23 gb.names nextclade.prev.names > nextclade.names
-    faSomeRecords <(xzcat genbank.fa.xz) nextclade.names nextclade.fa
-    faSplit about nextclade.fa 30000000 $splitDir/chunk
-else
-    cp /dev/null nextclade.tsv
-    faSplit about <(xzcat genbank.fa.xz) 300000000 $splitDir/chunk
-fi
-if (( $(ls -1 splitForNextclade | wc -l) > 0 )); then
+zcat ../ncbi.latest/nextclade.full.tsv.gz > nextclade.full.tsv
+cp ../ncbi.latest/nextalign.fa.xz .
+comm -23 gb.names <(cut -f 1 nextclade.full.tsv | sort -u) > nextclade.names
+if [ -s nextclade.names ]; then
+    faSomeRecords <(xzcat genbank.fa.xz) nextclade.names stdout \
+    | faSplit about stdin 30000000 $splitDir/chunk
     nDataDir=~angie/github/nextclade/data/sars-cov-2
     outDir=$(mktemp -d)
     outTsv=$(mktemp)
     for chunkFa in $splitDir/chunk*.fa; do
         nextclade -j 50 -i $chunkFa \
-            --input-root-seq $nDataDir/reference.fasta \
-            --input-tree $nDataDir/tree.json \
-            --input-qc-config $nDataDir/qc.json \
+            --input-dataset $nDataDir \
             --output-dir $outDir \
+            --output-basename out \
             --output-tsv $outTsv >& nextclade.log
-        cut -f 1,2 $outTsv | tail -n+2 | sed -re 's/"//g;' >> nextclade.tsv
+        tail -n+2 $outTsv | sed -re 's/"//g;' >> nextclade.full.tsv
+        xz -T 20 < $outDir/out.aligned.fasta >> nextalign.fa.xz
         rm $outTsv
     done
     rm -rf $outDir
 fi
-wc -l nextclade.tsv
-rm -rf $splitDir nextclade.fa
+wc -l nextclade.full.tsv
+pigz -f -p 8 nextclade.full.tsv
+rm -rf $splitDir
 
 conda activate pangolin
 runPangolin() {
     fa=$1
     out=$fa.pangolin.csv
     logfile=$(mktemp)
     pangolin $fa --outfile $out > $logfile 2>&1
     rm $logfile
 }
 export -f runPangolin
 if [ -e ../ncbi.latest/lineage_report.csv ]; then
     cp ../ncbi.latest/lineage_report.csv linRepYesterday
     tail -n+2 linRepYesterday | sed -re 's/^([A-Z]+[0-9]+\.[0-9]+).*/\1/' | sort \
         > pangolin.prev.names
     comm -23 gb.names pangolin.prev.names > pangolin.names