ab3504a9bc0e6077a70cdcefc29264833ced8126 angie Mon Jul 12 16:36:23 2021 -0700 Add in sub-UK country to match COG-UK sequence names; correct outdated lineage names in lineages.metadata.csv; make alternate lineageToName file, still need to do comparison of using it. diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh index 63bbdc5..dcca06e 100755 --- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh +++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh @@ -313,31 +313,31 @@ # It's not always possible to recreate both old and new names correctly from metadata, # so make a file to translate accession or COG-UK to the name used in VCF, tree and protobufs. cut -f 2 $renaming \ | awk -F\| '{ if ($3 == "") { print $1 "\t" $0; } else { print $2 "\t" $0; } }' \ | sort \ > idToName # NCBI metadata for COG-UK: strip COG-UK/ & United Kingdom:, add country & year to name grep COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \ | tawk '$8 >= '$minReal' {print $1, $3, $4, $5, $4 "/" $6 "/" $3 "|" $1 "|" $3, $8;}' \ | sed -re 's@COG-UK/@@g; s/United Kingdom://g; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s@Northern Ireland/@NorthernIreland/@;' \ > tmp # NCBI metadata for non-COG-UK (strip colon-separated location after country if present): grep -v COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \ | tawk '$8 >= '$minReal' { print $1, $3, $4, $5, $6, $8; }' \ -| sed -re 's/\t([A-Za-z -]+):[A-Za-z0-9 ,()_-]+\t/\t\1\t/;' \ +| sed -re 's@\t([A-Za-z -]+):[A-Za-z0-9 .,()_/-]+\t@\t\1\t@;' \ | perl -wpe '@w = split("\t"); $w[4] =~ s/ /_/g; $_ = join("\t", @w);' \ | cleanGenbank \ | sort tmp - > gb.metadata if [ -e $ncbiDir/lineage_report.csv ]; then echo Getting GenBank Pangolin lineages from $ncbiDir/lineage_report.csv tail -n+2 $ncbiDir/lineage_report.csv \ | sed -re 's/^([A-Z][A-Z][0-9]{6}\.[0-9]+)[^,]*/\1/;' \ | awk -F, '$2 != "" && $2 != "None" {print $1 "\t" $2;}' \ | sort \ > gbToLineage else echo Getting GenBank Pangolin lineages from $prevMeta zcat $prevMeta \ | tail -n+2 \ | tawk '$2 != "" && $8 != "" { print $2, $8; }' \ @@ -353,31 +353,32 @@ wc -l gbToNextclade join -t$'\t' -a 1 gb.metadata gbToNextclade \ | join -t$'\t' -a 1 - gbToLineage \ | tawk '{ if ($2 == "") { $2 = "?"; } print $1, $1, $2, $3, $4, "", $6, $7, $8; }' \ | join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \ >> gisaidAndPublic.$today.metadata.tsv # COG-UK metadata: if [ -e $cogUkDir/nextclade.tsv ]; then sort $cogUkDir/nextclade.tsv > cogUkToNextclade else touch cogUkToNextclade fi #*** Could also add sequence length to metadata from faSizes output... tail -n+2 $cogUkDir/cog_metadata.csv \ -| awk -F, -v 'OFS=\t' '{print $1, "", $5, $2, "", "", "", $7; }' \ +| awk -F, -v 'OFS=\t' '{print $1, "", $5, $3, "", "", "", $7; }' \ +| sed -re 's/UK-ENG/England/; s/UK-NIR/Northern Ireland/; s/UK-SCT/Scotland/; s/UK-WLS/Wales/;' \ | sort \ | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2,1.8 - cogUkToNextclade \ | join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \ >> gisaidAndPublic.$today.metadata.tsv # CNCB metadata: tail -n+2 $cncbDir/cncb.metadata.tsv \ | tawk '{ if ($3 != "GISAID" && $3 != "GenBank" && $3 != "Genbank") { print $2, "", $10, $11, $9, $5, $6} }' \ | sed -re 's@\t([A-Za-z -]+)( / [A-Za-z -'"'"']+)+\t@\t\1\t@; s/Sapiens/sapiens/;' \ | sort \ | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 - $cncbDir/nextclade.tsv \ | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2.2 - $cncbDir/pangolin.tsv \ | join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \ >> gisaidAndPublic.$today.metadata.tsv wc -l gisaidAndPublic.$today.metadata.tsv @@ -405,56 +406,65 @@ time $matUtils annotate -T 50 \ -l \ -i gisaidAndPublic.$today.masked.pb \ -c cladeToName \ -u mutations.nextclade \ -D details.nextclade \ -o gisaidAndPublic.$today.masked.nextclade.pb \ >& annotate.nextclade.out # Add pangolin lineage annotations to protobuf. Use pangoLEARN training samples; # convert EPI IDs to public to match tree IDs. tail -n+2 ~angie/github/pango-designation/lineages.metadata.csv \ | grep -vFwf $ottoDir/clades.blackList \ | awk -F, '{print $9 "\t" $2;}' \ +| sed -re 's/B\.1\.1\.464\.1/AW.1/; s/B\.1\.526\.[0-9]+/B.1.526/;' \ | sort > epiExemplarToLineage subColumn -miss=/dev/null 1 epiExemplarToLineage \ <(cut -f 1,2 $epiToPublic) stdout \ | sort > idExemplarToLineage grep -Fwf <(cut -f 1 idExemplarToLineage) samples.$today \ | awk -F\| '{ if ($3 == "") { print $1 "\t" $0 } else { print $2 "\t" $0; } }' \ | sort > idExemplarToName join -t$'\t' idExemplarToName idExemplarToLineage \ | tawk '{print $3, $2;}' \ | sort > lineageToName # $epiToPublic maps some cogUkInGenBank EPI IDs to their COG-UK names not GenBank IDs unfortunately # so some of those don't match between idExemplarToLineage and idExemplarToName. Find missing # sequences and try a different means of adding those. comm -13 <( cut -f 1 idExemplarToName | sort) <(cut -f 1 idExemplarToLineage| sort) \ | grep -Fwf - ~angie/github/pango-designation/lineages.metadata.csv \ | sed -re 's/Northern_/Northern/;' \ | awk -F, '{print $1 "\t" $2;}' \ | sort > exemplarNameNotFoundToLineage grep -Fwf <(cut -f 1 exemplarNameNotFoundToLineage) samples.$today \ | awk -F\| '{print $1 "\t" $0;}' \ | sort > exemplarNameNotFoundToFullName join -t$'\t' exemplarNameNotFoundToLineage exemplarNameNotFoundToFullName \ | cut -f 2,3 \ -| sort -u lineageToName - ../lineageToName.newLineages > tmp +| sort -u lineageToName - ../lineageToName.newLineages \ +| sed -re 's/B\.1\.1\.464\.1/AW.1/;' \ +> tmp mv tmp lineageToName +# Yatish's suggestion: use pangolin/pangoLEARN assignments instead of lineages.csv +zcat gisaidAndPublic.$today.metadata.tsv.gz \ +| tail -n+2 | tawk '$9 != "" && $9 != "None" {print $9, $1;}' \ +| grep -vFwf $ottoDir/clades.blackList \ + > lineageToName.assigned + time $matUtils annotate -T 50 \ -i gisaidAndPublic.$today.masked.nextclade.pb \ -c lineageToName \ -u mutations.pangolin \ -D details.pangolin \ -o gisaidAndPublic.$today.masked.nextclade.pangolin.pb \ >& annotate.pangolin.out mv gisaidAndPublic.$today.masked{,.unannotated}.pb ln gisaidAndPublic.$today.masked.nextclade.pangolin.pb gisaidAndPublic.$today.masked.pb # EPI_ISL_ ID to public sequence name mapping, so if users upload EPI_ISL IDs for which we have # public names & IDs, we can match them. cut -f 1,3 $epiToPublic > epiToPublic.latest