88c5c33485279c9687d7be15a85da2665c16e91d angie Wed Jun 16 17:52:44 2021 -0700 Lots of work from the past month. De-dup COG-UK & GenBank; generate public tree from combined gisaidAndPublic tree. diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh index 64b644c..acd9fa0 100755 --- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh +++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh @@ -17,69 +17,176 @@ prevDate=$1 problematicSitesVcf=$2 ottoDir=/hive/data/outside/otto/sarscov2phylo ncbiDir=$ottoDir/ncbi.latest cogUkDir=$ottoDir/cogUk.latest cncbDir=$ottoDir/cncb.latest gisaidDir=/hive/users/angie/gisaid minReal=20000 ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit epiToPublic=$gisaidDir/epiToPublicAndDate.latest scriptDir=$(dirname "${BASH_SOURCE[0]}") source $scriptDir/util.sh today=$(date +%F) +y=$(date +%Y) +m=$(date +%m) +d=$(date +%d) + cd $ottoDir/$today prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.pb usherDir=~angie/github/usher usher=$usherDir/build/usher matUtils=$usherDir/build/matUtils # Make lists of sequences already in the tree. -# matUtils creates full paths for output files, defaulting to current directory, so can't -# output to named pipe because it becomes $cwd//dev/fd/63 ... -$matUtils summary -i $prevProtobufMasked -s prevSamples -tail -n+2 prevSamples | cut -f 1 > prevNames +$matUtils extract -i $prevProtobufMasked -u prevNames + +# Before updating the tree with new sequences, update the names used in the tree: +# * Sequences that are already in the tree with EPI_ IDs, but that have been mapped to public IDs +# * COG-UK sequences that are in GenBank. Per Sam Nicholls the isolate alone is enough to identify +# them -- no need to match country & year which are sometimes incorrect at first. +grep COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \ +| tawk '{print $6, $4 "/" $6 "/" $3 "|" $1 "|" $3;}' \ +| sed -re 's@COG-UK/@@g; s/United Kingdom://; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s/ //g;' \ +| sort \ + > cogUkInGenBankIsolateToNewName +fastaNames $cogUkDir/cog_all.fasta.xz > cogUk.names +grep -Fwf <(cut -f 1 cogUkInGenBankIsolateToNewName) cogUk.names \ + > cogUkInGenBank +set +o pipefail +# From 2021-05-04 on there should be no more unrenamed COG-UK/ in prevNames, but doesn't hurt +# to check. +grep COG-UK/ prevNames \ +| awk -F\| '{print $1 "\t" $0;}' \ +| sed -re 's@COG-UK/@@;' \ +| sort \ + > cogUkInGenBankIsolateToPrevName +set -o pipefail +join -t$'\t' cogUkInGenBankIsolateToPrevName cogUkInGenBankIsolateToNewName \ +| cut -f 2,3 \ + > prevTree.renaming +# Look for COG-UK isolates in prevNames that have just been added to GenBank and need to be renamed. +# Unfortunately for now we are not getting those from $epiToPublic. +grep -Fwf <(cut -f 1 cogUkInGenBankIsolateToNewName) prevNames \ +| awk -F\| '$3 == ""' \ +| awk -F/ '{print $2 "\t" $0;}' \ +| sort \ + > cogUkInGenBankIsolateToPrevName +join -t$'\t' cogUkInGenBankIsolateToPrevName cogUkInGenBankIsolateToNewName \ +| cut -f 2,3 \ + >> prevTree.renaming +# Look for names with EPI_IDs that have just today been mapped to public sequences. +grep EPI_ISL_ prevNames \ +| awk -F\| '{print $2 "\t" $0;}' \ +| sort \ + > epiToPrevName +set +o pipefail +grep -Fwf <(cut -f 1 epiToPrevName) $epiToPublic \ +| grep -v COG-UK/ \ +| tawk '{if ($4 != "" && $3 == $2) { print $1, $2 "|" $4; } else if ($4 != "") { print $1, $3 "|" $2 "|" $4; }}' \ +| sort \ + > epiToNewName +set -o pipefail +# Argh, missing sequences in COG-UK metadata can mean that a sequence may have been added to the +# tree both with and without EPI ID... so renaming makes a name conflict. +# If there are any of those then prune the sequences with EPI_ID or longer dates, so renaming doesn't +# cause conflict. +comm -12 <(cut -f 2 epiToNewName | sort) <(sort prevNames) > epiToNewNameAlreadyInTree +join -t$'\t' epiToPrevName <(grep -vFwf epiToNewNameAlreadyInTree epiToNewName) \ +| cut -f 2,3 \ + >> prevTree.renaming +cp /dev/null dupsToRemove +if [ -s epiToNewNameAlreadyInTree ]; then + grep -Fwf <(cut -d\| -f 1 epiToNewNameAlreadyInTree) prevNames \ + | grep EPI_ISL \ + >> dupsToRemove +fi +# And finally, sometimes there are duplicates due to country or date being corrected in COG-UK +# metadata. +set +o pipefail +grep -vFwf dupsToRemove prevTree.renaming \ +| cut -f 2 | sort | uniq -c | awk '$1 > 1 {print $2;}' \ +| cut -d/ -f 2 \ + > cogUkIsolateStillDup +grep -Fwf cogUkIsolateStillDup $cogUkDir/cog_metadata.csv \ +| awk -F, '{print $1 "|" $5;}' \ + > cogDupsLatestMetadata +grep -Fwf cogUkIsolateStillDup prevNames \ +| grep -vFwf dupsToRemove \ +| grep -vFwf cogDupsLatestMetadata \ +| cat \ + >> dupsToRemove +set -o pipefail +startingProtobuf=$prevProtobufMasked +if [ -s dupsToRemove ]; then + startingProtobuf=prevDupTrimmed.pb + $matUtils extract -i $prevProtobufMasked -p -s dupsToRemove -o $startingProtobuf +fi + +if [ -s prevTree.renaming ]; then + $matUtils mask -r prevTree.renaming -i $startingProtobuf -o prevRenamed.pb >& rename.out + $matUtils extract -i prevRenamed.pb -u prevNames +else + ln -sf $startingProtobuf prevRenamed.pb +fi + +# OK, now that the tree names are updated, figure out which seqs are already in there and +# which need to be added. awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \ | grep -E '^[A-Z]{2}[0-9]{6}\.[0-9]' > prevGbAcc -awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \ -| grep -E '^(England|Northern|Scotland|Wales)' > prevCogUk +grep -E '^(England|Northern|Scotland|Wales)' prevNames \ +| cut -d\| -f1 > prevCogUk awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \ | grep -E '^EPI_ISL_' > prevGisaid # Add public sequences that have been mapped to GISAID sequences to prevGisaid. grep -Fwf prevGbAcc $epiToPublic | cut -f 1 >> prevGisaid grep -Fwf prevCogUk $epiToPublic | cut -f 1 >> prevGisaid wc -l prev* # Get new GenBank sequences with at least $minReal non-N bases. -#*** TODO: exclude seqs already in tree in their GISAID forms +# Exclude seqs in the tree with EPI IDs that that have been mapped in the very latest $epiToPublic. +set +o pipefail +egrep $'\t''[A-Z][A-Z][0-9]{6}\.[0-9]+' $epiToPublic \ +| grep -Fwf prevGisaid - \ +| grep -vFwf prevGbAcc \ +| cat \ + >> prevGbAcc +set -o pipefail xzcat $ncbiDir/genbank.fa.xz \ | faSomeRecords -exclude stdin prevGbAcc newGenBank.fa faSize -veryDetailed newGenBank.fa \ | tawk '$4 < '$minReal' {print $1;}' \ > gbTooSmall # NCBI also includes NC_045512 in the download, but that's our reference, so... exclude that too. +set +o pipefail fastaNames newGenBank.fa | grep NC_045512 >> gbTooSmall +set -o pipefail faSomeRecords -exclude newGenBank.fa gbTooSmall newGenBank.filtered.fa faSize newGenBank.filtered.fa # Get new COG-UK sequences with at least $minReal non-N bases. +# Also exclude cog_all.fasta sequences not found in cog_metadata.csv. +comm -23 <(fastaNames $cogUkDir/cog_all.fasta.xz | sort) \ + <(cut -d, -f1 $cogUkDir/cog_metadata.csv | sort) \ + > cogFaNotMeta +# Also exclude COG-UK sequences that have been added to GenBank (cogUkInGenBank, see above). xzcat $cogUkDir/cog_all.fasta.xz \ -| faSomeRecords -exclude stdin prevCogUk newCogUk.fa +| faSomeRecords -exclude stdin <(cat prevCogUk cogFaNotMeta cogUkInGenBank) newCogUk.fa faSize -veryDetailed newCogUk.fa \ | tawk '$4 < '$minReal' {print $1;}' \ > cogUkTooSmall faSomeRecords -exclude newCogUk.fa cogUkTooSmall newCogUk.filtered.fa faSize newCogUk.filtered.fa # Get new GISAID sequences with at least $minReal non-N bases. xzcat $gisaidDir/gisaid_fullNames_$today.fa.xz \ | sed -re 's/^>.*\|(EPI_ISL_[0-9]+)\|.*/>\1/' \ | faSomeRecords -exclude stdin prevGisaid newGisaid.fa faSize -veryDetailed newGisaid.fa \ | tawk '$4 < '$minReal' {print $1;}' \ > gisaidTooSmall faSomeRecords -exclude newGisaid.fa gisaidTooSmall newGisaid.filtered.fa faSize newGisaid.filtered.fa @@ -99,166 +206,328 @@ cat new*.filtered.fa > new.fa faSize new.fa # Use Rob's script that aligns each sequence to NC_045512.2 and concatenates the results # as a multifasta alignment (from which we can extract VCF with SNVs): #conda install -c bioconda mafft alignedFa=new.aligned.fa rm -f $alignedFa export TMPDIR=/dev/shm time bash ~angie/github/sarscov2phylo/scripts/global_profile_alignment.sh \ -i new.fa \ -o $alignedFa \ -t 50 faSize $alignedFa -#*** TODO: if EPIs have recently been mapped to public sequences, we should convert their -#*** tree names from EPI to public. - -# Now make a renaming that keeps all the prevNames and adds full names (with shortened dates) -# for the new seqs. +# Now make a renaming that keeps all the prevNames and adds full names for the new seqs. renaming=oldAndNewNames tawk '{print $1, $1;}' prevNames > $renaming if [ -s newCogUk.filtered.fa ]; then # Sometimes all of the new COG-UK sequences are missing from cog_metadata.csv -- complained. set +o pipefail fastaNames newCogUk.filtered.fa \ | grep -Fwf - $cogUkDir/cog_metadata.csv \ | awk -F, '{print $1 "\t" $1 "|" $5;}' \ - | sed -re 's/20([0-9][0-9])(-[0-9-]+)?$/\1\2/;' \ >> $renaming set -o pipefail fi if [ -s newGenBank.filtered.fa ]; then + # Special renaming for COG-UK sequences: strip COG-UK/, add back country and year + set +o pipefail + fastaNames newGenBank.filtered.fa \ + | grep COG-UK/ \ + | sed -re 's/[ |].*//' \ + | grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \ + | tawk '{print $1, $4 "/" $6 "/" $3 "|" $1 "|" $3;}' \ + | sed -re 's@COG-UK/@@g; s/United Kingdom://; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s/ //g;' \ + >> $renaming fastaNames newGenBank.filtered.fa \ + | grep -v COG-UK/ \ | sed -re 's/[ |].*//' \ | grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \ | tawk '{ if ($3 == "") { $3 = "?"; } if ($6 != "") { print $1 "\t" $6 "|" $1 "|" $3; } else { print $1 "\t" $1 "|" $3; } }' \ | cleanGenbank \ | sed -re 's/ /_/g' \ - | sed -re 's/20([0-9][0-9])(-[0-9-]+)?$/\1\2/;' \ >> $renaming + set -o pipefail fi if [ -s newGisaid.filtered.fa ]; then zcat $gisaidDir/metadata_batch_$today.tsv.gz \ | grep -Fwf <(fastaNames newGisaid.filtered.fa) \ | tawk '{print $3 "\t" $1 "|" $3 "|" $5;}' \ >> $renaming fi wc -l $renaming -# Remove old tree's GISAID items that have been mapped to public sequences so we don't have dups -set +o pipefail -cut -f 1 $epiToPublic \ -| grep -Fwf - prevNames \ -| cat \ - > mappedGisaidToRemove -set -o pipefail -wc -l mappedGisaidToRemove - -if [ -s mappedGisaidToRemove ]; then - $matUtils extract -i $prevProtobufMasked \ - -s mappedGisaidToRemove \ - -p \ - -o prevTrimmed.pb -else - cp -p $prevProtobufMasked prevTrimmed.pb -fi - # Make masked VCF tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \ $problematicSitesVcf > mask.vcf # Add masked VCF to previous protobuf time cat <(twoBitToFa $ref2bit stdout) $alignedFa \ | faToVcf stdin stdout \ | vcfRenameAndPrune stdin $renaming stdout \ | vcfFilter -excludeVcf=mask.vcf stdin \ | gzip -c \ > new.masked.vcf.gz time $usher -u -T 50 \ + -A \ + -e 5 \ -v new.masked.vcf.gz \ - -i prevTrimmed.pb \ - -o gisaidAndPublic.$today.masked.pb \ + -i prevRenamed.pb \ + -o gisaidAndPublic.$today.masked.preTrim.pb \ >& usher.addNew.log -mv uncondensed-final-tree.nh gisaidAndPublic.$today.nwk +mv uncondensed-final-tree.nh gisaidAndPublic.$today.preTrim.nwk + +# Prune samples with too many private mutations and internal branches that are too long. +$matUtils extract -i gisaidAndPublic.$today.masked.preTrim.pb \ + -a 20 \ + -b 30 \ + -O -o gisaidAndPublic.$today.masked.pb -# Metadata for hgPhyloPlace: for now, start with already-built public metadata. -zcat public-$today.metadata.tsv.gz > gisaidAndPublic.$today.metadata.tsv +# Metadata for hgPhyloPlace: +# Header names same as nextmeta (with strain first) so hgPhyloPlace recognizes them: +echo -e "strain\tgenbank_accession\tdate\tcountry\thost\tcompleteness\tlength\tNextstrain_clade\tpangolin_lineage" \ + > gisaidAndPublic.$today.metadata.tsv +# It's not always possible to recreate both old and new names correctly from metadata, +# so make a file to translate accession or COG-UK to the name used in VCF, tree and protobufs. +cut -f 2 $renaming \ +| awk -F\| '{ if ($3 == "") { print $1 "\t" $0; } else { print $2 "\t" $0; } }' \ +| sort \ + > idToName +# NCBI metadata for COG-UK: strip COG-UK/ & United Kingdom:, add country & year to name +grep COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \ +| tawk '$8 >= '$minReal' {print $1, $3, $4, $5, $4 "/" $6 "/" $3 "|" $1 "|" $3, $8;}' \ +| sed -re 's@COG-UK/@@g; s/United Kingdom://g; s/(\/[0-9]{4})(-[0-9]+)*/\1/; + s@Northern Ireland/@NorthernIreland/@;' \ + > tmp +# NCBI metadata for non-COG-UK (strip colon-separated location after country if present): +grep -v COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \ +| tawk '$8 >= '$minReal' { print $1, $3, $4, $5, $6, $8; }' \ +| sed -re 's/\t([A-Za-z -]+):[A-Za-z0-9 ,()_-]+\t/\t\1\t/;' \ +| perl -wpe '@w = split("\t"); $w[4] =~ s/ /_/g; $_ = join("\t", @w);' \ +| cleanGenbank \ +| sort tmp - > gb.metadata +if [ -e $ncbiDir/lineage_report.csv ]; then + echo Getting GenBank Pangolin lineages from $ncbiDir/lineage_report.csv + tail -n+2 $ncbiDir/lineage_report.csv \ + | sed -re 's/^([A-Z][A-Z][0-9]{6}\.[0-9]+)[^,]*/\1/;' \ + | awk -F, '$2 != "" && $2 != "None" {print $1 "\t" $2;}' \ + | sort \ + > gbToLineage +else + echo Getting GenBank Pangolin lineages from $prevMeta + zcat $prevMeta \ + | tail -n+2 \ + | tawk '$2 != "" && $8 != "" { print $2, $8; }' \ + | sort \ + > gbToLineage +fi +wc -l gbToLineage +if [ -e $ncbiDir/nextclade.tsv ]; then + sort $ncbiDir/nextclade.tsv > gbToNextclade +else + touch gbToNextclade +fi +wc -l gbToNextclade +join -t$'\t' -a 1 gb.metadata gbToNextclade \ +| join -t$'\t' -a 1 - gbToLineage \ +| tawk '{ if ($2 == "") { $2 = "?"; } + print $1, $1, $2, $3, $4, "", $6, $7, $8; }' \ +| join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \ + >> gisaidAndPublic.$today.metadata.tsv +# COG-UK metadata: +if [ -e $cogUkDir/nextclade.tsv ]; then + sort $cogUkDir/nextclade.tsv > cogUkToNextclade +else + touch cogUkToNextclade +fi +#*** Could also add sequence length to metadata from faSizes output... +tail -n+2 $cogUkDir/cog_metadata.csv \ +| awk -F, -v 'OFS=\t' '{print $1, "", $5, $2, "", "", "", $7; }' \ +| sort \ +| join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2,1.8 - cogUkToNextclade \ +| join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \ + >> gisaidAndPublic.$today.metadata.tsv +# CNCB metadata: +tail -n+2 $cncbDir/cncb.metadata.tsv \ +| tawk '{ if ($3 != "GISAID" && $3 != "GenBank" && $3 != "Genbank") { + print $2, "", $10, $11, $9, $5, $6} }' \ +| sed -re 's@\t([A-Za-z -]+)( / [A-Za-z -'"'"']+)+\t@\t\1\t@; s/Sapiens/sapiens/;' \ +| sort \ +| join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 - $cncbDir/nextclade.tsv \ +| join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2.2 - $cncbDir/pangolin.tsv \ +| join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \ + >> gisaidAndPublic.$today.metadata.tsv +wc -l gisaidAndPublic.$today.metadata.tsv zcat $gisaidDir/metadata_batch_$today.tsv.gz \ | grep -Fwf <(cut -f 2 $renaming | grep EPI_ISL | cut -d\| -f 2) \ | tawk '{print $1 "|" $3 "|" $5, "", $5, $7, $15, $13, $14, $18, $19;}' \ >> gisaidAndPublic.$today.metadata.tsv wc -l gisaidAndPublic.$today.metadata.tsv gzip gisaidAndPublic.$today.metadata.tsv # version/description files cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/') echo "sarscov2phylo release 13-11-20; GISAID, NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \ > version.plusGisaid.txt $matUtils extract -i gisaidAndPublic.$today.masked.pb -u samples.$today sampleCountComma=$(echo $(wc -l < samples.$today) \ | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;') echo "$sampleCountComma genomes from GISAID, GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \ > hgPhyloPlace.plusGisaid.description.txt # Add nextclade annotations to protobuf zcat gisaidAndPublic.$today.metadata.tsv.gz \ | tail -n+2 | tawk '$8 != "" {print $8, $1;}' \ | sed -re 's/^20E \(EU1\)/20E.EU1/;' \ > cladeToName time $matUtils annotate -T 50 \ -l \ -i gisaidAndPublic.$today.masked.pb \ -c cladeToName \ - -m mutations.nextclade \ + -u mutations.nextclade \ -D details.nextclade \ -o gisaidAndPublic.$today.masked.nextclade.pb \ >& annotate.nextclade.out # Add pangolin lineage annotations to protobuf. Use pangoLEARN training samples; # convert EPI IDs to public to match tree IDs. -tail -n+2 ~angie/github/pangoLEARN/pangoLEARN/data/lineages.metadata.csv \ -| awk -F, '{print $7 "\t" $2;}' \ +tail -n+2 ~angie/github/pango-designation/lineages.metadata.csv \ +| grep -vFwf $ottoDir/clades.blackList \ +| awk -F, '{print $9 "\t" $2;}' \ | sort > epiExemplarToLineage subColumn -miss=/dev/null 1 epiExemplarToLineage \ - <(cut -f 1,2 $gisaidDir/epiToPublicAndDate.$today) stdout \ + <(cut -f 1,2 $epiToPublic) stdout \ | sort > idExemplarToLineage -cut -f 2 $renaming | grep -Fwf <(cut -f 1 idExemplarToLineage) \ +grep -Fwf <(cut -f 1 idExemplarToLineage) samples.$today \ | awk -F\| '{ if ($3 == "") { print $1 "\t" $0 } else { print $2 "\t" $0; } }' \ | sort > idExemplarToName join -t$'\t' idExemplarToName idExemplarToLineage \ | tawk '{print $3, $2;}' \ | sort > lineageToName -linFile=lineageToName - -# Until pangoLEARN training is done, use pango-designation file for now. -#linFile=/hive/users/angie/matLineages/linToPublicPlusGisaidName.2021-03-16 +# $epiToPublic maps some cogUkInGenBank EPI IDs to their COG-UK names not GenBank IDs unfortunately +# so some of those don't match between idExemplarToLineage and idExemplarToName. Find missing +# sequences and try a different means of adding those. +comm -13 <( cut -f 1 idExemplarToName | sort) <(cut -f 1 idExemplarToLineage| sort) \ +| grep -Fwf - ~angie/github/pango-designation/lineages.metadata.csv \ +| sed -re 's/Northern_/Northern/;' \ +| awk -F, '{print $1 "\t" $2;}' \ +| sort > exemplarNameNotFoundToLineage +grep -Fwf <(cut -f 1 exemplarNameNotFoundToLineage) samples.$today \ +| awk -F\| '{print $1 "\t" $0;}' \ +| sort > exemplarNameNotFoundToFullName +join -t$'\t' exemplarNameNotFoundToLineage exemplarNameNotFoundToFullName \ +| cut -f 2,3 \ +| sort -u lineageToName - ../lineageToName.newLineages > tmp +mv tmp lineageToName time $matUtils annotate -T 50 \ -i gisaidAndPublic.$today.masked.nextclade.pb \ - -c $linFile \ - -m mutations.pangolin \ + -c lineageToName \ + -u mutations.pangolin \ -D details.pangolin \ -o gisaidAndPublic.$today.masked.nextclade.pangolin.pb \ >& annotate.pangolin.out mv gisaidAndPublic.$today.masked{,.unannotated}.pb ln gisaidAndPublic.$today.masked.nextclade.pangolin.pb gisaidAndPublic.$today.masked.pb -# Extract public samples from tree -$matUtils summary -i gisaidAndPublic.$today.masked.pb -s newSamples -tail -n+2 newSamples | cut -f 1 > newNames -grep -v EPI_ISL_ newNames > newPublicNames -$matUtils extract -i gisaidAndPublic.$today.masked.pb \ - -s newPublicNames \ - -O -o public.$today.masked.pb +# EPI_ISL_ ID to public sequence name mapping, so if users upload EPI_ISL IDs for which we have +# public names & IDs, we can match them. +cut -f 1,3 $epiToPublic > epiToPublic.latest +# Update links to latest public+GISAID protobuf and metadata in hgwdev cgi-bin directories for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do ln -sf `pwd`/gisaidAndPublic.$today.masked.pb $dir/public.plusGisaid.latest.masked.pb ln -sf `pwd`/gisaidAndPublic.$today.metadata.tsv.gz \ $dir/public.plusGisaid.latest.metadata.tsv.gz ln -sf `pwd`/hgPhyloPlace.plusGisaid.description.txt $dir/public.plusGisaid.latest.version.txt + ln -sf `pwd`/epiToPublic.latest $dir/ done +# Extract public samples from tree +$matUtils extract -i gisaidAndPublic.$today.masked.pb -u newNames +grep -v EPI_ISL_ newNames > newPublicNames +$matUtils extract -i gisaidAndPublic.$today.masked.pb \ + -s newPublicNames \ + -O -o public-$today.all.masked.pb +# Extract Newick and VCF from public-only tree +$matUtils extract -i public-$today.all.masked.pb \ + -t public-$today.all.nwk \ + -v public-$today.all.masked.vcf +gzip -f public-$today.all.masked.vcf +zcat gisaidAndPublic.$today.metadata.tsv.gz \ +| grep -v EPI_ISL_ \ +| gzip -c \ + > public-$today.metadata.tsv.gz + +grep -v EPI_ISL_ cladeToName > cladeToPublicName +grep -v EPI_ISL_ lineageToName > lineageToPublicName + +# Add nextclade annotations to public protobuf +time $matUtils annotate -T 50 \ + -l \ + -i public-$today.all.masked.pb \ + -c cladeToPublicName \ + -u mutations.nextclade.public \ + -D details.nextclade.public \ + -o public-$today.all.masked.nextclade.pb \ + >& annotate.nextclade.public.out + +# Add pangolin lineage annotations to public protobuf +time $matUtils annotate -T 50 \ + -i public-$today.all.masked.nextclade.pb \ + -c lineageToPublicName \ + -u mutations.pangolin.public \ + -D details.pangolin.public \ + -o public-$today.all.masked.nextclade.pangolin.pb \ + >& annotate.pangolin.public.out + +# Not all the Pangolin lineages can be assigned nodes so for now just use nextclade +rm public-$today.all.masked.pb +ln public-$today.all.masked.nextclade.pb public-$today.all.masked.pb + +cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/') +echo "sarscov2phylo release 13-11-20; NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \ + > version.txt + +$matUtils extract -i public-$today.all.masked.pb -u samples.public.$today +sampleCountComma=$(echo $(wc -l < samples.public.$today) \ + | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;') +echo "$sampleCountComma genomes from GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \ + > hgPhyloPlace.description.txt + + +# Link to public trees download directory hierarchy +archiveRoot=/hive/users/angie/publicTrees +archive=$archiveRoot/$y/$m/$d +mkdir -p $archive +gzip -c public-$today.all.nwk > $archive/public-$today.all.nwk.gz +ln `pwd`/public-$today.all.masked.{pb,vcf.gz} $archive/ +gzip -c public-$today.all.masked.pb > $archive/public-$today.all.masked.pb.gz +ln `pwd`/public-$today.metadata.tsv.gz $archive/ +gzip -c public-$today.all.masked.nextclade.pangolin.pb \ + > $archive/public-$today.all.masked.nextclade.pangolin.pb.gz +gzip -c cladeToPublicName > $archive/cladeToPublicName.gz +gzip -c lineageToPublicName > $archive/lineageToPublicName.gz +ln `pwd`/hgPhyloPlace.description.txt $archive/public-$today.version.txt +# Update 'latest' in $archiveRoot +ln -f $archive/public-$today.all.nwk.gz $archiveRoot/public-latest.all.nwk.gz +ln -f $archive/public-$today.all.masked.pb $archiveRoot/public-latest.all.masked.pb +ln -f $archive/public-$today.all.masked.pb.gz $archiveRoot/public-latest.all.masked.pb.gz +ln -f $archive/public-$today.all.masked.vcf.gz $archiveRoot/public-latest.all.masked.vcf.gz +ln -f $archive/public-$today.metadata.tsv.gz $archiveRoot/public-latest.metadata.tsv.gz +ln -f $archive/public-$today.version.txt $archiveRoot/public-latest.version.txt + +# Update hgdownload-test link for archive +mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/UShER_SARS-CoV-2/$y/$m +ln -sf $archive /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/UShER_SARS-CoV-2/$y/$m + +# Update links to latest public protobuf and metadata in hgwdev cgi-bin directories +for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do + ln -sf `pwd`/public-$today.all.masked.pb $dir/public-latest.all.masked.pb + ln -sf `pwd`/public-$today.metadata.tsv.gz $dir/public-latest.metadata.tsv.gz + ln -sf `pwd`/hgPhyloPlace.description.txt $dir/public-latest.version.txt +done