01859939a397ed998e752ad25c54cdd22742a49c angie Fri Dec 20 15:00:05 2024 -0800 Lots of additions for H5N1 outbreak work. * Add Andersen Lab's assemblies of USDA SRA data from 2024 H5N1 outbreak * Add Bloom Lab metadata from Deep Mutational Scanning (DMS) experiments on H5N1 HA (referenced to American Wigeon 2021 vaccine strain) and PB2 * Add build of concatenated segments from the 2024 H5N1 outbreak * Better handling of serotype and segment in INSDC metadata diff --git src/hg/utils/otto/fluA/runDms.sh src/hg/utils/otto/fluA/runDms.sh new file mode 100755 index 0000000..3a66b1e --- /dev/null +++ src/hg/utils/otto/fluA/runDms.sh @@ -0,0 +1,38 @@ +#!/bin/bash +source ~/.bashrc +set -beEu -o pipefail + +# Select H5N1 HA sequences from 2020 or newer, align to Bloom Lab's Deep Mutational Scanning +# reference sequence, tally DMS scores for each sequence. + +# https://www.biorxiv.org/content/10.1101/2024.05.23.595634v1.full.pdf +# reference = A/American Wigeon/South Carolina/USDA-000345-001/2021(H5N1) +# Or with its undoctored name, in GenBank: +# OQ958044.1 A/American Wigeon/South Carolina/22-000345-001/2021 (2021-12-30) +dmsRefAcc=OQ958044.1 + +fluADir=/hive/data/outside/otto/fluA +fluANcbiDir=$fluADir/ncbi/ncbi.latest +fluAScriptDir=$(dirname "${BASH_SOURCE[0]}") + +# Look in tweakedMetadata.tsv for year >= 2020 and segment == 4 (HA) and type is H5N1. 
# Select recent H5N1 HA records: print column 1 of tweakedMetadata.tsv for rows whose $16 is
# "H5N1", whose $17 is segment 4 (or "HA"), and whose $5 begins with a year >= 2020.  Those
# names are pulled out of the latest GenBank fasta download and passed through
# faRenameRecords with renaming.tsv, producing recentH5N1HA.fa.
# tweakedMetadata.tsv and renaming.tsv are read from the current working directory.
# NOTE(review): $5 is presumably the collection date in a year-first format -- confirm
# against the metadata schema.
# The year is compared numerically instead of with /^202[0-9]/ so that the "2020 or newer"
# filter keeps working for 2030 and later.
tawk '$16 == "H5N1" && ($17 == 4 || $17 == "HA") &&
      $5 ~ /^[0-9][0-9][0-9][0-9]/ && substr($5, 1, 4) + 0 >= 2020 {print $1;}' \
    tweakedMetadata.tsv \
| faSomeRecords <(xzcat "$fluANcbiDir/genbank.fa.xz") stdin stdout \
| faRenameRecords stdin renaming.tsv recentH5N1HA.fa

# Andersen Lab's SRA assemblies; only records whose names contain "_HA/" are used below.
andersenFa="$fluADir/andersen_lab.srrNotGb.renamed.fa"

# Combine the GenBank selection with the HA segment sequences from Andersen Lab's SRA
# assemblies and run nextclade on the concatenated stream.  The reference sequence and
# annotation files are named after $dmsRefAcc (set earlier in this script) rather than
# repeating the accession literally.  nextclade's stdout and stderr both go to tmp.log;
# the per-sequence results are written to recentH5N1HA.nextclade.tsv.
cat recentH5N1HA.fa \
    <(fastaNames "$andersenFa" | grep _HA/ \
      | faSomeRecords "$andersenFa" stdin stdout) \
| nextclade run -j 32 \
    --input-ref "$fluADir/${dmsRefAcc}.fa" --input-annotation "$fluADir/${dmsRefAcc}.gff3" \
    --output-columns-selection seqName,clade,totalSubstitutions,totalDeletions,totalInsertions,totalMissing,totalNonACGTNs,alignmentStart,alignmentEnd,substitutions,deletions,insertions,aaSubstitutions,aaDeletions,aaInsertions,missing,unknownAaRanges,nonACGTNs \
    --output-tsv recentH5N1HA.nextclade.tsv \
    >& tmp.log

# Tally DMS scores per sequence: hand the Bloom Lab escape summary CSV and the nextclade TSV
# to dms_annotate_wigeon.py, which lives next to this script.
# NOTE(review): the last two arguments are presumably output file names -- confirm against
# dms_annotate_wigeon.py.
# The script path is quoted because $fluAScriptDir is derived from this script's own
# location and may contain spaces; the ~/ path is left unquoted so tilde expansion applies.
"$fluAScriptDir/dms_annotate_wigeon.py" \
    ~/github/Flu_H5_American-Wigeon_South-Carolina_2021-H5N1_DMS/results/summaries/all_sera_escape.csv \
    recentH5N1HA.nextclade.tsv \
    H5N1_HA_DMS_metadata.tsv \
    H5N1_HA_DMS_colorings.json