d898714060e14c8a185e3fde3d84dafe32f93baa angie Tue Feb 22 11:05:07 2022 -0800 Speed up compression by using more threads. diff --git src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh index 1f22096..11bca43 100755 --- src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh +++ src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh @@ -52,34 +52,34 @@ # Add pangolin lineage annotations to public protobuf grep -v EPI_ISL lineageToName > lineageToPublicName time $matUtils annotate -T 50 \ -i public-$today.all.masked.nextclade.pb \ -M $scriptDir/pango.clade-mutations.tsv \ -c lineageToPublicName \ -f 0.95 \ -D details.pango.public \ -o public-$today.all.masked.nextclade.pangolin.pb \ >& annotate.pango.public # Extract Newick and VCF from public-only tree time $matUtils extract -i public-$today.all.masked.pb \ -t public-$today.all.nwk \ -v public-$today.all.masked.vcf -time gzip -f public-$today.all.masked.vcf +time pigz -p 8 -f public-$today.all.masked.vcf zcat gisaidAndPublic.$today.metadata.tsv.gz \ | grep -v EPI_ISL_ \ -| gzip -c \ +| pigz -p 8 \ > public-$today.metadata.tsv.gz rm public-$today.all.masked.pb ln -f public-$today.all.masked.nextclade.pangolin.pb public-$today.all.masked.pb cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/') echo "sarscov2phylo release 13-11-20; NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \ > version.txt $matUtils extract -i public-$today.all.masked.pb -u samples.public.$today sampleCountComma=$(echo $(wc -l < samples.public.$today) \ | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;') echo "$sampleCountComma genomes from GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \ > hgPhyloPlace.description.txt