179775232ff5fe31323f984619b75cbc77204c00 angie Sun Jun 27 20:22:29 2021 -0700 Update nextclade to 1.0 (new required command line args). diff --git src/hg/utils/otto/sarscov2phylo/getNcbi.sh src/hg/utils/otto/sarscov2phylo/getNcbi.sh index 2ee7749..385deb7 100755 --- src/hg/utils/otto/sarscov2phylo/getNcbi.sh +++ src/hg/utils/otto/sarscov2phylo/getNcbi.sh @@ -96,34 +96,45 @@ # Run pangolin and nextclade on sequences that are new since yesterday fastaNames genbank.fa.xz | awk '{print $1;}' | sed -re 's/\|.*//' | sort > gb.names splitDir=splitForNextclade rm -rf $splitDir mkdir $splitDir if [ -e ../ncbi.latest/nextclade.tsv ]; then cp ../ncbi.latest/nextclade.tsv . cut -f 1 ../ncbi.latest/nextclade.tsv | sort > nextclade.prev.names comm -23 gb.names nextclade.prev.names > nextclade.names faSomeRecords <(xzcat genbank.fa.xz) nextclade.names nextclade.fa faSplit about nextclade.fa 30000000 $splitDir/chunk else cp /dev/null nextclade.tsv faSplit about <(xzcat genbank.fa.xz) 30000000 $splitDir/chunk fi +nDataDir=~angie/github/nextclade/data/sars-cov-2 +outDir=$(mktemp -d) +outTsv=$(mktemp) for chunkFa in $splitDir/chunk*.fa; do - nextclade -j 50 -i $chunkFa -t >(cut -f 1,2 | tail -n+2 >> nextclade.tsv) >& nextclade.log + nextclade -j 50 -i $chunkFa \ + --input-root-seq $nDataDir/reference.fasta \ + --input-tree $nDataDir/tree.json \ + --input-qc-config $nDataDir/qc.json \ + --output-dir $outDir \ + --output-tsv $outTsv >& nextclade.log + cut -f 1,2 $outTsv | tail -n+2 >> nextclade.tsv + rm $outTsv done wc -l nextclade.tsv +rm -rf $outDir rm -rf $splitDir nextclade.fa conda activate pangolin runPangolin() { fa=$1 out=$fa.pangolin.csv logfile=$(mktemp) pangolin $fa --outfile $out > $logfile 2>&1 rm $logfile } export -f runPangolin if [ -e ../ncbi.latest/lineage_report.csv ]; then cp ../ncbi.latest/lineage_report.csv linRepYesterday tail -n+2 linRepYesterday | sed -re 's/^([A-Z]+[0-9]+\.[0-9]+).*/\1/' | sort \ > pangolin.prev.names