7cef6a7083abdbdfb242575ccb0dcb7a73c42197 angie Thu Sep 30 09:58:01 2021 -0700 New script usherClusterRun.sh for running usher on ku, merging & optimizing results instead of single hgwdev job. diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh index 71798f2..8602463 100755 --- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh +++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh @@ -24,50 +24,44 @@ epiToPublic=$gisaidDir/epiToPublicAndDate.latest scriptDir=$(dirname "${BASH_SOURCE[0]}") source $scriptDir/util.sh mkdir -p $ottoDir/$today cd $ottoDir/$today usherDir=~angie/github/usher usher=$usherDir/build/usher matUtils=$usherDir/build/matUtils if [ ! -s new.masked.vcf.gz ]; then $scriptDir/makeNewMaskedVcf.sh $prevDate $today $problematicSitesVcf fi -time $usher -u -T 80 \ - -A \ - -e 5 \ - --max-parsimony-per-sample 20 \ - -v new.masked.vcf.gz \ - -i prevRenamed.pb \ - -o gisaidAndPublic.$today.masked.preTrim.pb \ - >& usher.addNew.log -mv uncondensed-final-tree.nh gisaidAndPublic.$today.preTrim.nwk +if [ ! -s gisaidAndPublic.$today.masked.pb ]; then + $scriptDir/usherClusterRun.sh $today + # Prune samples with too many private mutations and internal branches that are too long. + $matUtils extract -i gisaidAndPublic.$today.masked.preTrim.pb \ + -b 30 \ + -O -o gisaidAndPublic.$today.masked.pb +fi # Exclude sequences with a very high number of EPPs from future runs grep ^Current usher.addNew.log \ | awk '$16 >= 10 {print $8;}' \ | awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \ - >> ../tooManyEpps.ids - -# Prune samples with too many private mutations and internal branches that are too long. -$matUtils extract -i gisaidAndPublic.$today.masked.preTrim.pb \ - -b 30 \ - -O -o gisaidAndPublic.$today.masked.pb + > tooManyEpps.ids +cat tooManyEpps.ids >> ../tooManyEpps.ids $matUtils extract -i gisaidAndPublic.$today.masked.pb -u samples.$today $scriptDir/combineMetadata.sh $prevDate $today # version/description files cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/') echo "sarscov2phylo release 13-11-20; GISAID, NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \ > version.plusGisaid.txt sampleCountComma=$(echo $(wc -l < samples.$today) \ | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;') echo "$sampleCountComma genomes from GISAID, GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \ > hgPhyloPlace.plusGisaid.description.txt # Add nextclade annotations to protobuf