179775232ff5fe31323f984619b75cbc77204c00
angie
Sun Jun 27 20:22:29 2021 -0700
Update nextclade to 1.0 (new required command line args).

diff --git src/hg/utils/otto/sarscov2phylo/getCogUk.sh src/hg/utils/otto/sarscov2phylo/getCogUk.sh
index f652693..92d1645 100755
--- src/hg/utils/otto/sarscov2phylo/getCogUk.sh
+++ src/hg/utils/otto/sarscov2phylo/getCogUk.sh
@@ -20,23 +20,34 @@
 curl -S -s $cogUrlBase/cog_global_tree.newick > cog_global_tree.newick
 
 tail -n +2 cog_metadata.csv \
     | awk -F, '{print $1 "\t" $5;}' | sort > cogUkToDate
 
 # Reuse nextclade assignments for older sequences; compute nextclade assignments for new seqs.
 cp $ottoDir/cogUk.latest/nextclade.tsv .
 comm -13 <(cut -f 1 nextclade.tsv | sort) <(fastaNames cog_all.fasta.xz | sort) \
     > seqsForNextclade
 if [ -s seqsForNextclade ]; then
     faSomeRecords <(xzcat cog_all.fasta.xz) seqsForNextclade seqsForNextclade.fa
     splitDir=splitForNextclade
     rm -rf $splitDir
     mkdir $splitDir
     faSplit about seqsForNextclade.fa 30000000 $splitDir/chunk
+    nDataDir=~angie/github/nextclade/data/sars-cov-2
+    outDir=$(mktemp -d)
+    outTsv=$(mktemp)
     for chunkFa in $splitDir/chunk*.fa; do
-        nextclade -j 50 -i $chunkFa -t >(cut -f 1,2 | tail -n+2 >> nextclade.tsv) >& nextclade.log
+        nextclade -j 50 -i $chunkFa \
+            --input-root-seq $nDataDir/reference.fasta \
+            --input-tree $nDataDir/tree.json \
+            --input-qc-config $nDataDir/qc.json \
+            --output-dir $outDir \
+            --output-tsv $outTsv >& nextclade.log
+        cut -f 1,2 $outTsv | tail -n+2 >> nextclade.tsv
+        rm $outTsv
     done
+    rm -rf $outDir
     rm -rf $splitDir
 fi
 
 rm -f $ottoDir/cogUk.latest
 ln -s cogUk.$today $ottoDir/cogUk.latest