01859939a397ed998e752ad25c54cdd22742a49c angie Fri Dec 20 15:00:05 2024 -0800 Lots of additions for H5N1 outbreak work. * Add Andersen Lab's assemblies of USDA SRA data from 2024 H5N1 outbreak * Add Bloom Lab metadata from Deep Mutational Scanning (DMS) experiments on H5N1 HA (referenced to American Wigeon 2021 vaccine strain) and PB2 * Add build of concatenated segments from the 2024 H5N1 outbreak * Better handling of serotype and segment in INSDC metadata diff --git src/hg/utils/otto/fluA/joinSegments.py src/hg/utils/otto/fluA/joinSegments.py new file mode 100755 index 0000000..d482b4b --- /dev/null +++ src/hg/utils/otto/fluA/joinSegments.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python + +# adapted from https://github.com/nextstrain/avian-flu/blob/master/scripts/join-segments.py + +from Bio import SeqIO +from collections import defaultdict +import argparse, sys, re + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--segments', type = str, required = True, nargs='+', + help = "per-segment aligned fasta") + parser.add_argument('--output', type = str, required = True, + help = "output whole genome aligned fasta") + args = parser.parse_args() + + records = {} + strain_counts = defaultdict(int) + insdcRe = re.compile('^[A-Z]{2}[0-9]{6}\\.[0-9]+$') + for fname in args.segments: + records[fname] = {} + for record in SeqIO.parse(fname, 'fasta'): + # Remove GenBank accession, but not SRA, from record.name + nameParts = record.name.split("|") + if len(nameParts) == 3 and insdcRe.search(nameParts[1]): + name = "|".join([nameParts[0], nameParts[2]]) + else: + name = record.name + records[fname][name] = str(record.seq) + for key in records[fname]: strain_counts[key]+=1 + print(f"{fname}: parsed {len(records[fname].keys())} sequences") + + with open(args.output, 'w') as fh: + print("writing concatenated segments to ", args.output) + for name,count in strain_counts.items(): + if count!=len(args.segments): + print(f"Excluding {name} as it only appears in {count} segments") + continue + genome = "".join([records[seg][name] for seg in args.segments]) + print(f">{name}\n{genome}", file=fh)