3d604489e76b3d834ebed86f8e9b610f2c9c20d8
angie
  Fri Dec 20 15:57:02 2024 -0800
Call makeNewMaskedMaple.sh instead of makeNewMaskedVcf.sh (no more new*fa to clean up at end); use .pb.gz instead of .pb; install hgPhyloPlace files in /gbdb instead of cgi-bin.  Make omicron-only subtree; not sure if we'll keep that though, it's still quite large at half the tree.

diff --git src/hg/utils/otto/sarscov2phylo/makeNewMaskedMaple.sh src/hg/utils/otto/sarscov2phylo/makeNewMaskedMaple.sh
new file mode 100755
index 0000000..e5452e2
--- /dev/null
+++ src/hg/utils/otto/sarscov2phylo/makeNewMaskedMaple.sh
@@ -0,0 +1,337 @@
+#!/bin/bash
+set -beEu -x -o pipefail
+
+#	Do not modify this script, modify the source tree copy:
+#	kent/src/hg/utils/otto/sarscov2phylo/makeNewMaskedMaple.sh
+
+usage() {
+    echo "usage: $0 prevDate today problematicSitesVcf [baseProtobuf]"
+    echo "This assumes that ncbi.latest and cogUk.latest links/directories have been updated."
+}
+
+if (( $# != 3 && $# != 4 )); then
+  usage
+  exit 1
+fi
+
+prevDate=$1
+today=$2
+problematicSitesVcf=$3
+if [ $# == 4 ]; then
+    baseProtobuf=$4
+else
+    baseProtobuf=
+fi
+
+ottoDir=/hive/data/outside/otto/sarscov2phylo
+ncbiDir=$ottoDir/ncbi.latest
+cogUkDir=$ottoDir/cogUk.latest
+cncbDir=$ottoDir/cncb.latest
+gisaidDir=/hive/users/angie/gisaid
+minReal=20000
+ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit
+epiToPublic=$gisaidDir/epiToPublicAndDate.latest
+
+lineageProposalsRecombinants=https://raw.githubusercontent.com/sars-cov-2-variants/lineage-proposals/main/recombinants.tsv
+
+scriptDir=$(dirname "${BASH_SOURCE[0]}")
+source $scriptDir/util.sh
+
+mkdir -p $ottoDir/$today
+cd $ottoDir/$today
+
+# If there's a version that I didn't want to push out to the main site, but wanted to be used
+# as the basis for the next day's build (for example with some extra pruning), use that:
+if [ -e $ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.useMe.pb.gz ]; then
+    prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.useMe.pb.gz
+else
+    prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.pb.gz
+fi
+
+usherDir=~angie/github/usher
+usher=$usherDir/build/usher
+matUtils=$usherDir/build/matUtils
+
+renaming=oldAndNewNames
+
+if [ "$baseProtobuf" == "" ]; then
+    baseProtobuf=$prevProtobufMasked
+fi
+
+# Make lists of sequences already in the tree.
+$matUtils extract -i $baseProtobuf -u prevNames
+
+# First, in order to catch & remove duplicates and update names that have changed, extract the
+# accession from every name in the tree.
+awk -F\| '{ if ($3 == "") { print $1 "\t" $0; } else { print $2 "\t" $0; } }' prevNames \
+| subColumn 1 -miss=/dev/null stdin <(cut -f 1,2 $epiToPublic) stdout \
+| sort \
+    > prevIdToName
+# Arbitrarily pick one of each duplicated ID to keep in the tree
+sort -k1,1 -u prevIdToName > prevIdToNameDeDup
+# Remove whichever duplicated items were not arbitrarily chosen to remain
+comm -23 prevIdToName prevIdToNameDeDup | cut -f 2 > prevNameToRemove
+# Also remove items for which we have no metadata (i.e. they were removed from repository,
+# or .1 was replaced by .2 etc.)
+cut -f 1 prevIdToNameDeDup | grep -E '^[A-Z]{2}[0-9]{6}\.[0-9]+' > gb.acc
+comm -13 <(cut -f 1 $ncbiDir/ncbi_dataset.plusBioSample.tsv) gb.acc > gb.removed.acc
+if [ -s gb.removed.acc ]; then
+    grep -Fwf gb.removed.acc prevIdToName | cut -f 2 >> prevNameToRemove
+fi
+cut -f 1 prevIdToNameDeDup | grep -E '^EPI_ISL_' > epi.acc
+comm -13 <(zcat $gisaidDir/metadata_batch_$today.tsv.gz | cut -f 3 | sort) epi.acc > epi.removed.acc
+if [ -s epi.removed.acc ]; then
+    grep -Fwf epi.removed.acc prevIdToName | cut -f 2 >> prevNameToRemove
+fi
+cut -f 1 prevIdToNameDeDup | grep -E '^(England|Northern|Scotland|Wales)' > cog.acc
+comm -13 <(zcat $cogUkDir/cog_metadata.csv.gz | cut -d, -f 1 | sort) cog.acc > cog.removed.acc
+if [ -s cog.removed.acc ]; then
+    grep -Fwf cog.removed.acc prevIdToName \
+    | grep -vE '^([A-Z]{2}[0-9]{6}\.[0-9]+|EPI_ISL_)' \
+    | cut -f 2 >> prevNameToRemove
+fi
+cut -f 1 prevIdToNameDeDup \
+| grep -vE '^([A-Z]{2}[0-9]{6}\.[0-9]+|EPI_ISL_|England|Northern|Scotland|Wales)' \
+    > cncb.acc
+comm -13 <(cut -f 2 $cncbDir/cncb.metadata.tsv | sort) cncb.acc > cncb.removed.acc
+if [ -s cncb.removed.acc ]; then
+    grep -Fwf cncb.removed.acc prevIdToName | cut -f 2 >> prevNameToRemove
+fi
+
+# If a sequence is in both INSDC and COG-UK, keep the INSDC and remove the COG-UK as dup.
+set +o pipefail
+cut -f 6 $ncbiDir/ncbi_dataset.plusBioSample.tsv \
+| grep ^COG-UK/ \
+| sed -re 's@COG-UK/@@' \
+| grep -Fwf - prevNames \
+| grep -vE '\|[A-Z]{2}[0-9]{6}\.[0-9]+\|' \
+| grep -vE '\|EPI_ISL_[0-9]+\|' \
+| cat \
+    >> prevNameToRemove
+set -o pipefail
+
+# Remove duplicates and withdrawn sequences
+if [ -s prevNameToRemove ]; then
+    rm -f prevDedup.pb.gz
+    $matUtils extract -i $baseProtobuf \
+        -p -s <(sort -u prevNameToRemove) \
+        -u prevDedupNames \
+        -o prevDedup.pb.gz
+else
+    ln -sf $baseProtobuf prevDedup.pb.gz
+    $matUtils extract -i prevDedup.pb.gz -u prevDedupNames
+fi
+
+function gbAccCogRenaming {
+    # pipeline: one INSDC accession per line of stdin, acc to full name if COG-UK on stdout
+    grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \
+    | grep COG-UK/ \
+    | tawk '{ if ($4 != "") { print $1, $4 "/" $6 "/" $3 "|" $1 "|" $3; } else { if ($3 != "") { print $1, $6  "/" $3 "|" $1 "|" $3; } else { print $1, $6 "|" $1 "|?"; } } }' \
+    | sed -re 's@COG-UK/@@g; s/United Kingdom://; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s/ //g;'
+}
+
+function gbAccNonCogRenaming {
+    # pipeline: one INSDC accession per line of stdin, acc to full name if non-COG-UK on stdout
+    grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \
+    | grep -v COG-UK/ \
+    | cleanGenbank \
+    | tawk '{ if ($3 == "") { $3 = "?"; }
+              if ($6 != "") { print $1 "\t" $6 "|" $1 "|" $3; }
+              else { print $1 "\t" $1 "|" $3; } }' \
+    | sed -re 's/ /_/g'
+}
+
+# To update names that have changed and simplify detection of new sequences to add, relate to acc.
+# Strip country and year from COG-UK names to get COG acc.
+awk -F\| '{ if ($3 == "") { print $1 "\t" $0; } else { print $2 "\t" $0; } }' prevDedupNames \
+| subColumn 1 -miss=/dev/null stdin <(cut -f 1,2 $epiToPublic) stdout \
+| sed -re 's@^(England|Northern_?Ireland|Scotland|Wales)/([A-Z]+[_-]?[A-Za-z0-9]+)/[0-9]+@COG:\2@;' \
+| sort \
+    > accToPrevDedupName
+
+# Break down accs by source -- we will need those both for renaming acc to full name and for
+# figuring out which seqs the tree already has vs. which are new.
+cut -f 1 accToPrevDedupName | grep -E '^[A-Z]{2}[0-9]{6}\.[0-9]+' > prevGbAcc
+cut -f 1 accToPrevDedupName | grep -E '^COG:' | sed -re 's/^COG://;' > prevCogUk
+cut -f 1 accToPrevDedupName | grep -E '^EPI_ISL_' > prevGisaid
+cut -f 1 accToPrevDedupName | grep -vE '^([A-Z]{2}[0-9]{6}\.[0-9]+|COG:|EPI_ISL_)' > prevCncb
+
+# GenBank renaming has both COG-UK and non-COG-UK versions:
+gbAccCogRenaming < prevGbAcc > accToNewName
+gbAccNonCogRenaming < prevGbAcc >> accToNewName
+# Restore the COG:isolate format for non-GenBank COG-UK sequences:
+zcat $cogUkDir/cog_metadata.csv.gz \
+| grep -Fwf prevCogUk \
+| awk -F, '{print $1 "\t" $1 "|" $5;}' \
+| sed -re 's@^(England|Northern_?Ireland|Scotland|Wales)/([A-Z]+[_-]?[A-Za-z0-9]+)/[0-9]+@COG:\2@;' \
+    >> accToNewName
+# GISAID:
+zcat $gisaidDir/metadata_batch_$today.tsv.gz \
+| grep -Fwf prevGisaid \
+| tawk '$3 != "" {print $3 "\t" $1 "|" $3 "|" $5;}' \
+    >> accToNewName
+# CNCB:
+grep -Fwf prevCncb $cncbDir/cncb.metadata.tsv \
+| cleanCncb \
+| sed -re 's/ /_/g;' \
+| tawk '{print $2 "\t" $1 "|" $2 "|" $10;}' \
+    >> accToNewName
+join -t$'\t' accToPrevDedupName <(sort accToNewName) \
+| tawk '$2 != $3 {print $2, $3;}' \
+    > prevDedupNameToNewName
+
+$matUtils mask -i prevDedup.pb.gz -r prevDedupNameToNewName -o prevRenamed.pb.gz \
+    >& renaming.out
+rm renaming.out
+
+# OK, now that the tree names are updated, figure out which seqs are already in there and
+# which need to be added.
+# Add GenBank COG-UK sequences to prevCogUk so we don't add dups.
+cut -f 6 $ncbiDir/ncbi_dataset.plusBioSample.tsv | grep COG-UK | sed -re 's@^COG-UK/@@;' >> prevCogUk
+# Add public sequences that have been mapped to GISAID sequences to prevGisaid.
+cut -f 1 $epiToPublic >> prevGisaid
+wc -l prev{GbAcc,CogUk,Gisaid,Cncb}
+
+# Exclude some sequences based on nextclade counts of reversions and other-clade mutations.
+zcat $gisaidDir/chunks/nextclade.full.tsv.gz \
+| $scriptDir/findDropoutContam.pl > gisaid.dropoutContam
+zcat $ncbiDir/nextclade.full.tsv.gz \
+| $scriptDir/findDropoutContam.pl > gb.dropoutContam
+zcat $cogUkDir/nextclade.full.tsv.gz \
+| $scriptDir/findDropoutContam.pl > cog.dropoutContam
+zcat $cncbDir/nextclade.full.tsv.gz \
+| $scriptDir/findDropoutContam.pl > cncb.dropoutContam
+cut -f 1 *.dropoutContam \
+| awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \
+    > dropoutContam.ids
+# Also exclude sequences with unbelievably low numbers of mutations given sampling dates.
+zcat $gisaidDir/chunks/nextclade.full.tsv.gz | cut -f 2,11 \
+| awk -F\| '{ if ($3 == "") { print $1 "\t" $2; } else { print $2 "\t" $3; } }' \
+| $scriptDir/findRefBackfill.pl > gisaid.refBackfill
+zcat $ncbiDir/nextclade.full.tsv.gz | cut -f 2,11 | sort \
+| join -t $'\t' <(cut -f 1,3 $ncbiDir/ncbi_dataset.plusBioSample.tsv | sort) - \
+| $scriptDir/findRefBackfill.pl > gb.refBackfill
+zcat $cogUkDir/nextclade.full.tsv.gz | cut -f 2,11 | sort \
+| join -t $'\t' <(zcat $cogUkDir/cog_metadata.csv.gz | cut -d, -f 1,5 | tr , $'\t' | sort) - \
+| $scriptDir/findRefBackfill.pl > cog.refBackfill
+zcat $cncbDir/nextclade.full.tsv.gz | cut -f 2,11 | sort \
+| join -t$'\t' <(cut -f 2,10 $cncbDir/cncb.metadata.tsv | sort) - \
+| $scriptDir/findRefBackfill.pl > cncb.refBackfill
+cut -f 1 *.refBackfill > refBackfill.ids
+curl -sS $lineageProposalsRecombinants  | tail -n+2 | cut -f 1 \
+| sed -re 's@(England|Northern[ _]?Ireland|Scotland|Wales)/([A-Z0-9_-]+).*@\2@;
+           s/.*(EPI_ISL_[0-9]+|[A-Z]{2}[0-9]+{6}(\.[0-9]+)?).*/\1/;' \
+    > tmp
+grep -Fwf tmp $epiToPublic | cut -f 2 | grep -E '^[A-Z]{2}[0-9]{6}' > tmp2
+sort -u tmp tmp2 > lpRecombinantIds
+rm tmp tmp2
+sort -u ../tooManyEpps.ids ../badBranchSeed.ids dropoutContam.ids refBackfill.ids \
+| grep -vFwf <(tail -n+2 $scriptDir/includeRecombinants.tsv | cut -f 1) \
+| grep -vFwf lpRecombinantIds \
+    > exclude.ids
+
+# Get IDs of new GenBank sequences
+set +o pipefail
+cut -f 1 $ncbiDir/ncbi_dataset.plusBioSample.tsv \
+| grep -vFwf <(cat prevGbAcc exclude.ids) \
+| cat \
+    > gbNew
+
+# Get IDs of new COG-UK sequences
+zcat $cogUkDir/cog_metadata.csv.gz | cut -d, -f 1 \
+| grep -vFwf <(cat prevCogUk exclude.ids) \
+| cat \
+    > cogUkNew
+
+# Get IDs of new GISAID sequences
+zcat $gisaidDir/metadata_batch_$today.tsv.gz \
+| cut -f 3 \
+| grep -vFwf <(cat prevGisaid exclude.ids) \
+| cat \
+    > gisaidNew
+
+# Get IDs of new CNCB sequences
+cut -f 2 $cncbDir/cncb.metadata.tsv \
+| grep -vFwf <(cat prevCncb exclude.ids) \
+| cat \
+    > cncbNew
+
+# Now make a renaming that converts accessions back to full name|acc|year names.
+cp /dev/null $renaming
+if [ -s cogUkNew ]; then
+    grep -Fwf cogUkNew <(zcat $cogUkDir/cog_metadata.csv.gz) \
+    | awk -F, '{print $1 "\t" $1 "|" $5;}' \
+        >> $renaming
+fi
+if [ -s gbNew ]; then
+    # Special renaming for COG-UK sequences: strip COG-UK/, add back country and year
+    cat gbNew \
+    | gbAccCogRenaming \
+        >> $renaming
+    cat gbNew \
+    | gbAccNonCogRenaming \
+        >> $renaming
+fi
+if [ -s gisaidNew ]; then
+    zcat $gisaidDir/metadata_batch_$today.tsv.gz \
+    | grep -Fwf gisaidNew \
+    | tawk '$3 != "" {print $3 "\t" $1 "|" $3 "|" $5;}' \
+        >> $renaming
+fi
+if [ -s cncbNew ]; then
+    cleanCncb < $cncbDir/cncb.metadata.tsv \
+    | sed -re 's/ /_/g;' \
+    | grep -Fwf cncbNew \
+    | tawk '{print $2 "\t" $1 "|" $2 "|" $10;}' \
+        >> $renaming
+fi
+set -o pipefail
+wc -l $renaming
+
+# Make mask.bed from $problematicSitesVcf
+tawk '$7 == "mask" {print $1, $2-1, $2;}' $problematicSitesVcf > mask.bed
+
+# Make MAPLE diff from nextclade TSV output from each source
+# Strip out deletions because usher-sampled treats them as N and can infer substitutions
+# which usually means garbage.
+cp /dev/null new.masked.mpl.gz
+if [ -s gbNew ]; then
+    cat <(zcat $ncbiDir/nextclade.full.tsv.gz | head -1) \
+        <(zcat $ncbiDir/nextclade.full.tsv.gz | grep -Fwf gbNew) \
+    | nextcladeToMaple -refLen=29903 -renameOrPrune=$renaming -maskBed=mask.bed -maxSubst=200 \
+        -minReal=$minReal stdin stdout \
+    | grep -v '^-' \
+    | pigz -p 8 \
+        >> new.masked.mpl.gz
+fi
+if [ -s cogUkNew ]; then
+    cat <(zcat $cogUkDir/nextclade.full.tsv.gz | head -1) \
+        <(zcat $cogUkDir/nextclade.full.tsv.gz | grep -Fwf cogUkNew) \
+    | nextcladeToMaple -refLen=29903 -renameOrPrune=$renaming -maskBed=mask.bed -maxSubst=200 \
+        -minReal=$minReal stdin stdout \
+    | grep -v '^-' \
+    | pigz -p 8 \
+        >> new.masked.mpl.gz
+fi
+if [ -s gisaidNew ]; then
+    # The GISAID nextclade.full.tsv.gz has full names not IDs; strip down to IDs so lack of
+    # renaming doesn't lead to pruning.
+    cat <(zcat $gisaidDir/chunks/nextclade.full.tsv.gz | head -1) \
+        <(zcat $gisaidDir/chunks/nextclade.full.tsv.gz \
+          | sed -re 's/[^\t]+\|(EPI_ISL_[0-9]+)\|[^\t]+/\1/' \
+          | grep -Fwf gisaidNew) \
+    | nextcladeToMaple -refLen=29903 -renameOrPrune=$renaming -maskBed=mask.bed -maxSubst=200 \
+        -minReal=$minReal stdin stdout \
+    | grep -v '^-' \
+    | pigz -p 8 \
+        >> new.masked.mpl.gz
+fi
+if [ -s cncbNew ]; then
+    cat <(zcat $cncbDir/nextclade.full.tsv.gz | head -1) \
+        <(zcat $cncbDir/nextclade.full.tsv.gz | grep -Fwf cncbNew) \
+    | nextcladeToMaple -refLen=29903 -renameOrPrune=$renaming -maskBed=mask.bed -maxSubst=200 \
+        -minReal=$minReal stdin stdout \
+    | grep -v '^-' \
+    | pigz -p 8 \
+        >> new.masked.mpl.gz
+fi