092adf5dfa56fee387d3836b3493ac46ee2c0281 angie Fri Feb 26 14:48:50 2021 -0800 Add 'matUtils annotate' calls to add Nextclade and Pangolin assignments to protobuf for hgPhyloPlace. diff --git src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh index 7e26bbe..8adb374 100755 --- src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh +++ src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh @@ -292,30 +292,56 @@ cogUkDate=$(ls -l $cogUkDir | sed -re 's/.*cogUk\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/') cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/') if [ $ncbiDate == $cogUkDate ]; then echo "sarscov2phylo release 13-11-20; NCBI and COG-UK sequences downloaded $ncbiDate; CNCB sequences downloaded $cncbDate" \ > version.txt else echo "sarscov2phylo release 13-11-20; NCBI sequences downloaded $ncbiDate; COG-UK sequences downloaded $cogUkDate; CNCB sequences downloaded $cncbDate" \ > version.txt fi sampleCountComma=$(echo $sampleCount \ | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;') echo "$sampleCountComma genomes from GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \ > hgPhyloPlace.description.txt +cp -p public-$today.all.masked.pb{,.bak} + +# Add nextclade annotations to protobuf +zcat public-$today.metadata.tsv.gz \ +| tail -n+2 | tawk '$8 != "" {print $8, $1;}' \ +| sed -re 's/^20E \(EU1\)/20E.EU1/;' \ + > cladeToPublicName +time ~/github/usher/build/matUtils annotate -T 50 \ + -i public-$today.all.masked.pb \ + -c cladeToPublicName \ + -o public-$today.all.masked.nextclade.pb \ + >& annotate.nextclade.out + +# Add pangolin lineage annotations to protobuf +zcat public-$today.metadata.tsv.gz \ +| tail -n+2 | tawk '$9 != "" {print $9, $1;}' \ + > lineageToPublicName +time ~/github/usher/build/matUtils annotate -T 50 \ + -i public-$today.all.masked.nextclade.pb \ + -c lineageToPublicName \ + -o public-$today.all.masked.nextclade.pangolin.pb \ + >& annotate.pangolin.out + +# Not all the Pangolin lineages can be assigned nodes so for now just use nextclade +cp -p public-$today.all.masked.nextclade.pb public-$today.all.masked.pb + # Update gbdb links -- not every day, too much churn for getting releases out and the # tracks are getting unmanageably large for VCF. if false; then for f in public-$today.all{,.minAf*}.vcf.gz ; do t=$(echo $f | sed -re "s/-$today//;") ln -sf `pwd`/$f /gbdb/wuhCor1/sarsCov2PhyloPub/$t ln -sf `pwd`/$f.tbi /gbdb/wuhCor1/sarsCov2PhyloPub/$t.tbi done ln -sf `pwd`/public-$today.all.nwk /gbdb/wuhCor1/sarsCov2PhyloPub/public.all.nwk ln -sf `pwd`/public-$today.all.parsimony.bw \ /gbdb/wuhCor1/sarsCov2PhyloPub/public.all.parsimony.bw ln -sf `pwd`/public-$today.lineageColors.gz \ /gbdb/wuhCor1/sarsCov2PhyloPub/public.all.lineageColors.gz ln -sf `pwd`/public-$today.nextstrainColors.gz \ /gbdb/wuhCor1/sarsCov2PhyloPub/public.all.nextstrainColors.gz