7f46acdf8f879de28c3ba40534e58a881941e168 angie Sun Feb 4 09:24:53 2024 -0800 CNCB is updated daily so make version.txt more concise. diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh index 286b5a4..8932313 100755 --- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh +++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh @@ -72,78 +72,77 @@ time $matOptimize \ -T 80 -m 0.00000001 -M 2 -S move_log.filtered \ -i merged.deltaMasked.filtered.pb \ -o gisaidAndPublic.$today.masked.preTrim.pb \ >& matOptimize.filtered.log # Fix grandparent-reversion nodes that cause some lineages to be incorrectly placed as # sublineages of siblings. $matUtils fix -i gisaidAndPublic.$today.masked.preTrim.pb -c 10 \ -o gisaidAndPublic.$today.masked.preTrim.fix.pb # Again prune samples with too many private mutations and internal branches that are too long. $matUtils extract -i gisaidAndPublic.$today.masked.preTrim.fix.pb \ --max-parsimony 20 \ --max-branch-length 60 \ - --max-path-length 175 \ + --max-path-length 225 \ -O -o gisaidAndPublic.$today.masked.pb fi # Exclude sequences with a very high number of EPPs from future runs grep ^Current usher.addNew.log \ | awk '$16 >= 10 {print $8;}' \ | awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \ > tooManyEpps.ids cat tooManyEpps.ids >> ../tooManyEpps.ids $matUtils extract -i gisaidAndPublic.$today.masked.pb -u samples.$today $scriptDir/combineMetadata.sh $prevDate $today # version/description files cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/') -echo "sarscov2phylo release 13-11-20; GISAID, NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \ +echo "sarscov2phylo release 13-11-20; GISAID, NCBI, COG-UK and CNCB sequences downloaded $today" \ > version.plusGisaid.txt sampleCountComma=$(echo $(wc -l < samples.$today) \ | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;') echo "$sampleCountComma genomes from GISAID, GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \ > hgPhyloPlace.plusGisaid.description.txt # Add nextclade annotations to protobuf (completely specified by nextstrain.clade-mutations.tsv) time $matUtils annotate -T 50 \ -l \ -i gisaidAndPublic.$today.masked.pb \ -P $ottoDir/$prevDate/cladeToPath \ -M $scriptDir/nextstrain.clade-mutations.tsv \ -D details.nextclade \ -o gisaidAndPublic.$today.masked.nextclade.pb \ >& annotate.nextclade # Add pangolin lineage annotations to protobuf. time $matUtils annotate -T 50 \ -i gisaidAndPublic.$today.masked.nextclade.pb \ -P $ottoDir/$prevDate/lineageToPath \ -M $scriptDir/pango.clade-mutations.tsv \ -c $ottoDir/$prevDate/lineageToName \ -f 0.95 \ -D details.pango \ -o gisaidAndPublic.$today.masked.nextclade.pangolin.pb \ >& annotate.pango # Replace protobuf with annotated protobuf. -mv gisaidAndPublic.$today.masked{,.unannotated}.pb -ln -f gisaidAndPublic.$today.masked.nextclade.pangolin.pb gisaidAndPublic.$today.masked.pb +mv gisaidAndPublic.$today.masked.nextclade.pangolin.pb gisaidAndPublic.$today.masked.pb # Save paths and annotations for use tomorrow. $matUtils extract -i gisaidAndPublic.$today.masked.pb -C clade-paths tail -n+2 clade-paths \ | grep -E '^[12]' \ | cut -f 1,3 > cladeToPath tail -n+2 clade-paths \ | grep -E '^[A-Za-z]' \ | cut -f 1,3 > lineageToPath $matUtils summary -i gisaidAndPublic.$today.masked.pb -C sample-clades tail -n+2 sample-clades \ | tawk '{print $2, $1;}' \ | sort > cladeToName tail -n+2 sample-clades \ | tawk '{print $3, $1;}' \