src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh ab3504a9bc0e6077a70cdcefc29264833ced8126

ab3504a9bc0e6077a70cdcefc29264833ced8126
angie
  Mon Jul 12 16:36:23 2021 -0700
Add in sub-UK country to match COG-UK sequence names; correct outdated lineage names in lineages.metadata.csv; make alternate lineageToName file, still need to do comparison of using it.

diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
index 63bbdc5..dcca06e 100755
--- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
+++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
@@ -313,31 +313,31 @@
 # It's not always possible to recreate both old and new names correctly from metadata,
 # so make a file to translate accession or COG-UK to the name used in VCF, tree and protobufs.
 cut -f 2 $renaming \
 | awk -F\| '{ if ($3 == "") { print $1 "\t" $0; } else { print $2 "\t" $0; } }' \
 | sort \
     > idToName
 # NCBI metadata for COG-UK: strip COG-UK/ & United Kingdom:, add country & year to name
 grep COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \
 | tawk '$8 >= '$minReal' {print $1, $3, $4, $5, $4 "/" $6 "/" $3 "|" $1 "|" $3, $8;}' \
 | sed -re 's@COG-UK/@@g; s/United Kingdom://g;  s/(\/[0-9]{4})(-[0-9]+)*/\1/;
            s@Northern Ireland/@NorthernIreland/@;' \
     > tmp
 # NCBI metadata for non-COG-UK (strip colon-separated location after country if present):
 grep -v COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \
 | tawk '$8 >= '$minReal' { print $1, $3, $4, $5, $6, $8; }' \
-| sed -re 's/\t([A-Za-z -]+):[A-Za-z0-9 ,()_-]+\t/\t\1\t/;' \
+| sed -re 's@\t([A-Za-z -]+):[A-Za-z0-9 .,()_/-]+\t@\t\1\t@;' \
 | perl -wpe '@w = split("\t"); $w[4] =~ s/ /_/g; $_ = join("\t", @w);' \
 | cleanGenbank \
 | sort tmp - > gb.metadata
 if [ -e $ncbiDir/lineage_report.csv ]; then
     echo Getting GenBank Pangolin lineages from $ncbiDir/lineage_report.csv
     tail -n+2  $ncbiDir/lineage_report.csv \
     | sed -re 's/^([A-Z][A-Z][0-9]{6}\.[0-9]+)[^,]*/\1/;' \
     | awk -F, '$2 != "" && $2 != "None" {print $1 "\t" $2;}' \
     | sort \
         > gbToLineage
 else
     echo Getting GenBank Pangolin lineages from $prevMeta
     zcat $prevMeta \
     | tail -n+2 \
     | tawk '$2 != "" && $8 != "" { print $2, $8; }' \
@@ -353,31 +353,32 @@
 wc -l gbToNextclade
 join -t$'\t' -a 1 gb.metadata gbToNextclade \
 | join -t$'\t' -a 1 - gbToLineage \
 | tawk '{ if ($2 == "") { $2 = "?"; }
           print $1, $1, $2, $3, $4, "", $6, $7, $8; }' \
 | join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \
     >> gisaidAndPublic.$today.metadata.tsv
 # COG-UK metadata:
 if [ -e $cogUkDir/nextclade.tsv ]; then
     sort $cogUkDir/nextclade.tsv > cogUkToNextclade
 else
     touch cogUkToNextclade
 fi
 #*** Could also add sequence length to metadata from faSizes output...
 tail -n+2 $cogUkDir/cog_metadata.csv \
-| awk -F, -v 'OFS=\t' '{print $1, "", $5, $2, "", "", "", $7; }' \
+| awk -F, -v 'OFS=\t' '{print $1, "", $5, $3, "", "", "", $7; }' \
+| sed -re 's/UK-ENG/England/; s/UK-NIR/Northern Ireland/; s/UK-SCT/Scotland/; s/UK-WLS/Wales/;' \
 | sort \
 | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2,1.8 - cogUkToNextclade \
 | join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \
     >> gisaidAndPublic.$today.metadata.tsv
 # CNCB metadata:
 tail -n+2 $cncbDir/cncb.metadata.tsv \
 | tawk '{ if ($3 != "GISAID" && $3 != "GenBank" && $3 != "Genbank") {
             print $2, "", $10, $11, $9, $5, $6} }' \
 | sed -re 's@\t([A-Za-z -]+)( / [A-Za-z -'"'"']+)+\t@\t\1\t@;  s/Sapiens/sapiens/;' \
 | sort \
 | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 - $cncbDir/nextclade.tsv \
 | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2.2 - $cncbDir/pangolin.tsv \
 | join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \
     >> gisaidAndPublic.$today.metadata.tsv
 wc -l gisaidAndPublic.$today.metadata.tsv
@@ -405,56 +406,65 @@
 
 time $matUtils annotate -T 50 \
     -l \
     -i gisaidAndPublic.$today.masked.pb \
     -c cladeToName \
     -u mutations.nextclade \
     -D details.nextclade \
     -o gisaidAndPublic.$today.masked.nextclade.pb \
     >& annotate.nextclade.out
 
 # Add pangolin lineage annotations to protobuf.  Use pangoLEARN training samples;
 # convert EPI IDs to public to match tree IDs.
 tail -n+2 ~angie/github/pango-designation/lineages.metadata.csv \
 | grep -vFwf $ottoDir/clades.blackList \
 | awk -F, '{print $9 "\t" $2;}' \
+| sed -re 's/B\.1\.1\.464\.1/AW.1/;  s/B\.1\.526\.[0-9]+/B.1.526/;' \
 | sort > epiExemplarToLineage
 subColumn -miss=/dev/null 1 epiExemplarToLineage \
     <(cut -f 1,2 $epiToPublic) stdout \
 | sort > idExemplarToLineage
 grep -Fwf <(cut -f 1 idExemplarToLineage) samples.$today \
 | awk -F\| '{ if ($3 == "") { print $1 "\t" $0 } else { print $2 "\t" $0; } }' \
 | sort > idExemplarToName
 join -t$'\t' idExemplarToName idExemplarToLineage \
 | tawk '{print $3, $2;}' \
 | sort > lineageToName
 # $epiToPublic maps some cogUkInGenBank EPI IDs to their COG-UK names not GenBank IDs unfortunately
 # so some of those don't match between idExemplarToLineage and idExemplarToName.  Find missing
 # sequences and try a different means of adding those.
 comm -13 <( cut -f 1 idExemplarToName | sort) <(cut -f 1 idExemplarToLineage| sort) \
 | grep -Fwf - ~angie/github/pango-designation/lineages.metadata.csv \
 | sed -re 's/Northern_/Northern/;' \
 | awk -F, '{print $1 "\t" $2;}' \
 | sort > exemplarNameNotFoundToLineage
 grep -Fwf <(cut -f 1 exemplarNameNotFoundToLineage) samples.$today \
 | awk -F\| '{print $1 "\t" $0;}' \
 | sort > exemplarNameNotFoundToFullName
 join -t$'\t' exemplarNameNotFoundToLineage exemplarNameNotFoundToFullName \
 | cut -f 2,3 \
-| sort -u lineageToName - ../lineageToName.newLineages > tmp
+| sort -u lineageToName - ../lineageToName.newLineages \
+| sed -re 's/B\.1\.1\.464\.1/AW.1/;' \
+> tmp
 mv tmp lineageToName
 
+# Yatish's suggestion: use pangolin/pangoLEARN assignments instead of lineages.csv
+zcat gisaidAndPublic.$today.metadata.tsv.gz \
+| tail -n+2 | tawk '$9 != "" && $9 != "None" {print $9, $1;}' \
+| grep -vFwf $ottoDir/clades.blackList \
+    > lineageToName.assigned
+
 time $matUtils annotate -T 50 \
     -i gisaidAndPublic.$today.masked.nextclade.pb \
     -c lineageToName \
     -u mutations.pangolin \
     -D details.pangolin \
     -o gisaidAndPublic.$today.masked.nextclade.pangolin.pb \
     >& annotate.pangolin.out
 
 mv gisaidAndPublic.$today.masked{,.unannotated}.pb
 ln gisaidAndPublic.$today.masked.nextclade.pangolin.pb gisaidAndPublic.$today.masked.pb
 
 # EPI_ISL_ ID to public sequence name mapping, so if users upload EPI_ISL IDs for which we have
 # public names & IDs, we can match them.
 cut -f 1,3 $epiToPublic > epiToPublic.latest