95c99e66b78b8b8c4e26bb1ea91a0113d64caf04 angie Thu Aug 19 11:54:39 2021 -0700 matUtils annotate -P fails too often because of little changes in the tree that affect which mutations are collapsed into the same code (or not). More sophisticated path-finding logic in matUtils could help with that, but until then, use the previous day's clade assignments as input for matUtils annotate -c for the big tree, and use the big tree's clade assignments as input for -c for the public-only tree. diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh index fb3411d..2d900ab 100755 --- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh +++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh @@ -21,30 +21,31 @@ ncbiDir=$ottoDir/ncbi.latest cogUkDir=$ottoDir/cogUk.latest cncbDir=$ottoDir/cncb.latest gisaidDir=/hive/users/angie/gisaid minReal=20000 ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit epiToPublic=$gisaidDir/epiToPublicAndDate.latest scriptDir=$(dirname "${BASH_SOURCE[0]}") source $scriptDir/util.sh today=$(date +%F) cd $ottoDir/$today prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.pb +prevMeta=$ottoDir/$prevDate/public-$prevDate.metadata.tsv.gz usherDir=~angie/github/usher usher=$usherDir/build/usher matUtils=$usherDir/build/matUtils renaming=oldAndNewNames if [ ! -s new.masked.vcf.gz ]; then # Make lists of sequences already in the tree. $matUtils extract -i $prevProtobufMasked -u prevNames # Before updating the tree with new sequences, update the names used in the tree: # * Sequences that are already in the tree with EPI_ IDs, but that have been mapped to public IDs # * COG-UK sequences that are in GenBank. Per Sam Nicholls the isolate alone is enough to identify @@ -261,34 +262,38 @@ >> $renaming set -o pipefail fi if [ -s newGisaid.filtered.fa ]; then zcat $gisaidDir/metadata_batch_$today.tsv.gz \ | grep -Fwf <(fastaNames newGisaid.filtered.fa) \ | tawk '{print $3 "\t" $1 "|" $3 "|" $5;}' \ >> $renaming fi wc -l $renaming # Make masked VCF tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \ $problematicSitesVcf > mask.vcf # Add masked VCF to previous protobuf +#*** With horrible hack for time being, to mask 21987 (because it messes up the Delta branch)... +#*** TODO: make hgPhyloPlace handle protobufs that don't have all of the latest problematic sites +#*** masked more gracefully, and update the Problematic Sites track. time cat <(twoBitToFa $ref2bit stdout) $alignedFa \ | faToVcf -maxDiff=1000 -excludeFile=../tooManyEpps.ids -verbose=2 stdin stdout \ | vcfRenameAndPrune stdin $renaming stdout \ | vcfFilter -excludeVcf=mask.vcf stdin \ +| tawk '$2 != 21987' \ | gzip -c \ > new.masked.vcf.gz fi # if [ ! -s new.masked.vcf.gz ] time $usher -u -T 80 \ -A \ -e 5 \ -v new.masked.vcf.gz \ -i prevRenamed.pb \ -o gisaidAndPublic.$today.masked.preTrim.pb \ >& usher.addNew.log mv uncondensed-final-tree.nh gisaidAndPublic.$today.preTrim.nwk # Exclude sequences with a very high number of EPPs from future runs @@ -385,44 +390,76 @@ >> gisaidAndPublic.$today.metadata.tsv wc -l gisaidAndPublic.$today.metadata.tsv gzip -f gisaidAndPublic.$today.metadata.tsv # version/description files cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/') echo "sarscov2phylo release 13-11-20; GISAID, NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \ > version.plusGisaid.txt $matUtils extract -i gisaidAndPublic.$today.masked.pb -u samples.$today sampleCountComma=$(echo $(wc -l < samples.$today) \ | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;') echo "$sampleCountComma genomes from GISAID, GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \ > hgPhyloPlace.plusGisaid.description.txt # Add nextclade annotations to protobuf +if [ -s $ottoDir/$prevDate/cladeToName ]; then + # Use yesterday's clade assignments to annotate clades on today's tree + time $matUtils annotate -T 50 \ + -l \ + -i gisaidAndPublic.$today.masked.pb \ + -c $ottoDir/$prevDate/cladeToName \ + -f 0.95 \ + -D details.nextclade \ + -o gisaidAndPublic.$today.masked.nextclade.pb \ + >& annnotate.nextclade +else time $matUtils annotate -T 50 \ -l \ -i gisaidAndPublic.$today.masked.pb \ -P ../nextstrain.clade-paths.tsv \ -o gisaidAndPublic.$today.masked.nextclade.pb +fi # Add pangolin lineage annotations to protobuf. +if [ -s $ottoDir/$prevDate/lineageToName ]; then + time $matUtils annotate -T 50 \ + -i gisaidAndPublic.$today.masked.nextclade.pb \ + -c $ottoDir/$prevDate/lineageToName \ + -f 0.95 \ + -D details.pango \ + -o gisaidAndPublic.$today.masked.nextclade.pangolin.pb \ + >& annotate.pango +else time $matUtils annotate -T 50 \ -i gisaidAndPublic.$today.masked.nextclade.pb \ -P ../pango.clade-paths.tsv \ -o gisaidAndPublic.$today.masked.nextclade.pangolin.pb +fi +# Replace protobuf with annotated protobuf. mv gisaidAndPublic.$today.masked{,.unannotated}.pb ln -f gisaidAndPublic.$today.masked.nextclade.pangolin.pb gisaidAndPublic.$today.masked.pb +# Save clade & lineage annotations for use tomorrow. +$matUtils summary -i gisaidAndPublic.$today.masked.pb -C sample-clades +tail -n+2 sample-clades \ +| tawk '{print $2, $1;}' \ +| sort > cladeToName +tail -n+2 sample-clades \ +| tawk '{print $3, $1;}' \ +| sort > lineageToName + # EPI_ISL_ ID to public sequence name mapping, so if users upload EPI_ISL IDs for which we have # public names & IDs, we can match them. cut -f 1,3 $epiToPublic > epiToPublic.latest # Update links to latest public+GISAID protobuf and metadata in hgwdev cgi-bin directories for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do ln -sf `pwd`/gisaidAndPublic.$today.masked.pb $dir/public.plusGisaid.latest.masked.pb ln -sf `pwd`/gisaidAndPublic.$today.metadata.tsv.gz \ $dir/public.plusGisaid.latest.metadata.tsv.gz ln -sf `pwd`/hgPhyloPlace.plusGisaid.description.txt $dir/public.plusGisaid.latest.version.txt ln -sf `pwd`/epiToPublic.latest $dir/ done $scriptDir/extractPublicTree.sh $today