src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh 873bc700cc4fbedfa5164ba912f7e69adf80b127

873bc700cc4fbedfa5164ba912f7e69adf80b127
angie
  Thu Jul 15 12:46:17 2021 -0700
Use matUtils annotate's new -P/--clade-paths option to assign clades using curated path files instead of -c/--clade-names.  It is much faster and allows manual correction of not-quite-right paths inferred by -c.  It will also require manual updates (but so did ../lineageToName.newLineages) and will fail if the tree structure changes significantly.

diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
index bc8966b..91c8f79 100755
--- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
+++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
@@ -313,31 +313,31 @@
 # It's not always possible to recreate both old and new names correctly from metadata,
 # so make a file to translate accession or COG-UK to the name used in VCF, tree and protobufs.
 cut -f 2 $renaming \
 | awk -F\| '{ if ($3 == "") { print $1 "\t" $0; } else { print $2 "\t" $0; } }' \
 | sort \
     > idToName
 # NCBI metadata for COG-UK: strip COG-UK/ & United Kingdom:, add country & year to name
 grep COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \
 | tawk '$8 >= '$minReal' {print $1, $3, $4, $5, $4 "/" $6 "/" $3 "|" $1 "|" $3, $8;}' \
 | sed -re 's@COG-UK/@@g; s/United Kingdom://g;  s/(\/[0-9]{4})(-[0-9]+)*/\1/;
            s@Northern Ireland/@NorthernIreland/@;' \
     > tmp
 # NCBI metadata for non-COG-UK (strip colon-separated location after country if present):
 grep -v COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \
 | tawk '$8 >= '$minReal' { print $1, $3, $4, $5, $6, $8; }' \
-| sed -re 's@\t([A-Za-z -]+):[A-Za-z0-9 .,()_/-]+\t@\t\1\t@;' \
+| sed -re 's@\t([A-Za-z -]+):[^\t]+\t@\t\1\t@;' \
 | perl -wpe '@w = split("\t"); $w[4] =~ s/ /_/g; $_ = join("\t", @w);' \
 | cleanGenbank \
 | sort tmp - > gb.metadata
 if [ -e $ncbiDir/lineage_report.csv ]; then
     echo Getting GenBank Pangolin lineages from $ncbiDir/lineage_report.csv
     tail -n+2  $ncbiDir/lineage_report.csv \
     | sed -re 's/^([A-Z][A-Z][0-9]{6}\.[0-9]+)[^,]*/\1/;' \
     | awk -F, '$2 != "" && $2 != "None" {print $1 "\t" $2;}' \
     | sort \
         > gbToLineage
 else
     echo Getting GenBank Pangolin lineages from $prevMeta
     zcat $prevMeta \
     | tail -n+2 \
     | tawk '$2 != "" && $8 != "" { print $2, $8; }' \
@@ -388,90 +388,41 @@
     >> gisaidAndPublic.$today.metadata.tsv
 wc -l gisaidAndPublic.$today.metadata.tsv
 gzip gisaidAndPublic.$today.metadata.tsv
 
 # version/description files
 cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
 echo "sarscov2phylo release 13-11-20; GISAID, NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \
     > version.plusGisaid.txt
 $matUtils extract -i gisaidAndPublic.$today.masked.pb -u samples.$today
 sampleCountComma=$(echo $(wc -l < samples.$today) \
                    | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
 echo "$sampleCountComma genomes from GISAID, GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
     > hgPhyloPlace.plusGisaid.description.txt
 
 # Add nextclade annotations to protobuf
-zcat gisaidAndPublic.$today.metadata.tsv.gz \
-| tail -n+2 | tawk '$8 != "" {print $8, $1;}' \
-| subColumn -miss=/dev/null 1 stdin ../nextcladeToShort cladeToName
-
 time $matUtils annotate -T 50 \
     -l \
     -i gisaidAndPublic.$today.masked.pb \
-    -c cladeToName \
-    -u mutations.nextclade \
-    -D details.nextclade \
-    -o gisaidAndPublic.$today.masked.nextclade.pb \
-    >& annotate.nextclade.out
-
-# Add pangolin lineage annotations to protobuf.  Use pangoLEARN training samples;
-# convert EPI IDs to public to match tree IDs.
-tail -n+2 ~angie/github/pango-designation/lineages.metadata.csv \
-| grep -vFwf $ottoDir/clades.blackList \
-| awk -F, '{print $9 "\t" $2;}' \
-| sed -re 's/B\.1\.1\.464\.1/AW.1/;  s/B\.1\.526\.[0-9]+/B.1.526/;' \
-| sort > epiExemplarToLineage
-subColumn -miss=/dev/null 1 epiExemplarToLineage \
-    <(cut -f 1,2 $epiToPublic) stdout \
-| sort > idExemplarToLineage
-grep -Fwf <(cut -f 1 idExemplarToLineage) samples.$today \
-| awk -F\| '{ if ($3 == "") { print $1 "\t" $0 } else { print $2 "\t" $0; } }' \
-| sort > idExemplarToName
-join -t$'\t' idExemplarToName idExemplarToLineage \
-| tawk '{print $3, $2;}' \
-| sort > lineageToName
-# $epiToPublic maps some cogUkInGenBank EPI IDs to their COG-UK names not GenBank IDs unfortunately
-# so some of those don't match between idExemplarToLineage and idExemplarToName.  Find missing
-# sequences and try a different means of adding those.
-comm -13 <( cut -f 1 idExemplarToName | sort) <(cut -f 1 idExemplarToLineage| sort) \
-| grep -Fwf - ~angie/github/pango-designation/lineages.metadata.csv \
-| sed -re 's/Northern_/Northern/;' \
-| awk -F, '{print $1 "\t" $2;}' \
-| sort > exemplarNameNotFoundToLineage
-grep -Fwf <(cut -f 1 exemplarNameNotFoundToLineage) samples.$today \
-| awk -F\| '{print $1 "\t" $0;}' \
-| sort > exemplarNameNotFoundToFullName
-join -t$'\t' exemplarNameNotFoundToLineage exemplarNameNotFoundToFullName \
-| cut -f 2,3 \
-| sort -u lineageToName - ../lineageToName.newLineages \
-| sed -re 's/B\.1\.1\.464\.1/AW.1/;' \
-> tmp
-mv tmp lineageToName
-
-# Yatish's suggestion: use pangolin/pangoLEARN assignments instead of lineages.csv
-zcat gisaidAndPublic.$today.metadata.tsv.gz \
-| tail -n+2 | tawk '$9 != "" && $9 != "None" {print $9, $1;}' \
-| grep -vFwf $ottoDir/clades.blackList \
-    > lineageToName.assigned
+    -P ../nextstrain.clade-paths.tsv \
+    -o gisaidAndPublic.$today.masked.nextclade.pb
 
+# Add pangolin lineage annotations to protobuf.
 time $matUtils annotate -T 50 \
     -i gisaidAndPublic.$today.masked.nextclade.pb \
-    -c lineageToName \
-    -u mutations.pangolin \
-    -D details.pangolin \
-    -o gisaidAndPublic.$today.masked.nextclade.pangolin.pb \
-    >& annotate.pangolin.out
+    -P ../pango.clade-paths.tsv \
+    -o gisaidAndPublic.$today.masked.nextclade.pangolin.pb
 
 mv gisaidAndPublic.$today.masked{,.unannotated}.pb
 ln gisaidAndPublic.$today.masked.nextclade.pangolin.pb gisaidAndPublic.$today.masked.pb
 
 # EPI_ISL_ ID to public sequence name mapping, so if users upload EPI_ISL IDs for which we have
 # public names & IDs, we can match them.
 cut -f 1,3 $epiToPublic > epiToPublic.latest
 
 # Update links to latest public+GISAID protobuf and metadata in hgwdev cgi-bin directories
 for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do
     ln -sf `pwd`/gisaidAndPublic.$today.masked.pb $dir/public.plusGisaid.latest.masked.pb
     ln -sf `pwd`/gisaidAndPublic.$today.metadata.tsv.gz \
         $dir/public.plusGisaid.latest.metadata.tsv.gz
     ln -sf `pwd`/hgPhyloPlace.plusGisaid.description.txt $dir/public.plusGisaid.latest.version.txt
     ln -sf `pwd`/epiToPublic.latest $dir/
@@ -480,79 +431,69 @@
 # Extract public samples from tree
 grep -v EPI_ISL_ samples.$today > newPublicNames
 $matUtils extract -i gisaidAndPublic.$today.masked.pb \
     -s newPublicNames \
     -O -o public-$today.all.masked.pb
 # Extract Newick and VCF from public-only tree
 $matUtils extract -i public-$today.all.masked.pb \
     -t public-$today.all.nwk \
     -v public-$today.all.masked.vcf
 gzip -f public-$today.all.masked.vcf
 zcat gisaidAndPublic.$today.metadata.tsv.gz \
 | grep -v EPI_ISL_ \
 | gzip -c \
     > public-$today.metadata.tsv.gz
 
-grep -v EPI_ISL_ cladeToName > cladeToPublicName
-grep -v EPI_ISL_ lineageToName > lineageToPublicName
-
 # Add nextclade annotations to public protobuf
 time $matUtils annotate -T 50 \
     -l \
     -i public-$today.all.masked.pb \
-    -c cladeToPublicName \
-    -u mutations.nextclade.public \
-    -D details.nextclade.public \
-    -o public-$today.all.masked.nextclade.pb \
-    >& annotate.nextclade.public.out
+    -P ../nextstrain.clade-paths.public.tsv \
+    -o public-$today.all.masked.nextclade.pb
 
 # Add pangolin lineage annotations to public protobuf
 time $matUtils annotate -T 50 \
     -i public-$today.all.masked.nextclade.pb \
-    -c lineageToPublicName \
-    -u mutations.pangolin.public \
-    -D details.pangolin.public \
-    -o public-$today.all.masked.nextclade.pangolin.pb \
-    >& annotate.pangolin.public.out
+    -P ../pango.clade-paths.public.tsv \
+    -o public-$today.all.masked.nextclade.pangolin.pb
 
-# Not all the Pangolin lineages can be assigned nodes so for now just use nextclade
 rm public-$today.all.masked.pb
-ln public-$today.all.masked.nextclade.pb public-$today.all.masked.pb
+ln -f public-$today.all.masked.nextclade.pangolin.pb public-$today.all.masked.pb
 
 cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
 echo "sarscov2phylo release 13-11-20; NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \
     > version.txt
 
 $matUtils extract -i public-$today.all.masked.pb -u samples.public.$today
 sampleCountComma=$(echo $(wc -l < samples.public.$today) \
                    | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
 echo "$sampleCountComma genomes from GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
     > hgPhyloPlace.description.txt
 
 
 # Link to public trees download directory hierarchy
 archiveRoot=/hive/users/angie/publicTrees
 archive=$archiveRoot/$y/$m/$d
 mkdir -p $archive
 gzip -c public-$today.all.nwk > $archive/public-$today.all.nwk.gz
 ln `pwd`/public-$today.all.masked.{pb,vcf.gz} $archive/
 gzip -c public-$today.all.masked.pb > $archive/public-$today.all.masked.pb.gz
 ln `pwd`/public-$today.metadata.tsv.gz $archive/
 gzip -c public-$today.all.masked.nextclade.pangolin.pb \
     > $archive/public-$today.all.masked.nextclade.pangolin.pb.gz
-gzip -c cladeToPublicName > $archive/cladeToPublicName.gz
-gzip -c lineageToPublicName > $archive/lineageToPublicName.gz
+gzip -c ../nextstrain.clade-paths.public.tsv > $archive/nextstrain.clade-paths.public.tsv.gz
+gzip -c ../pango.clade-paths.public.tsv > $archive/pango.clade-paths.public.tsv.gz
 ln `pwd`/hgPhyloPlace.description.txt $archive/public-$today.version.txt
 
 # Update 'latest' in $archiveRoot
 ln -f $archive/public-$today.all.nwk.gz $archiveRoot/public-latest.all.nwk.gz
 ln -f $archive/public-$today.all.masked.pb $archiveRoot/public-latest.all.masked.pb
 ln -f $archive/public-$today.all.masked.pb.gz $archiveRoot/public-latest.all.masked.pb.gz
 ln -f $archive/public-$today.all.masked.vcf.gz $archiveRoot/public-latest.all.masked.vcf.gz
 ln -f $archive/public-$today.metadata.tsv.gz $archiveRoot/public-latest.metadata.tsv.gz
 ln -f $archive/public-$today.version.txt $archiveRoot/public-latest.version.txt
 
 # Update hgdownload-test link for archive
 mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/UShER_SARS-CoV-2/$y/$m
 ln -sf $archive /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/UShER_SARS-CoV-2/$y/$m
 
 # Update links to latest public protobuf and metadata in hgwdev cgi-bin directories