src/hg/utils/otto/fluA/joinSegments.py 01859939a397ed998e752ad25c54cdd22742a49c

01859939a397ed998e752ad25c54cdd22742a49c
angie
  Fri Dec 20 15:00:05 2024 -0800
Lots of additions for H5N1 outbreak work.
* Add Andersen Lab's assemblies of USDA SRA data from 2024 H5N1 outbreak
* Add Bloom Lab metadata from Deep Mutational Scanning (DMS) experiments on H5N1 HA (referenced to American Wigeon 2021 vaccine strain) and PB2
* Add build of concatenated segments from the 2024 H5N1 outbreak
* Better handling of serotype and segment in INSDC metadata

diff --git src/hg/utils/otto/fluA/joinSegments.py src/hg/utils/otto/fluA/joinSegments.py
new file mode 100755
index 0000000..d482b4b
--- /dev/null
+++ src/hg/utils/otto/fluA/joinSegments.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+
+# adapted from https://github.com/nextstrain/avian-flu/blob/master/scripts/join-segments.py
+
+from Bio import SeqIO
+from collections import defaultdict
+import argparse, sys, re
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--segments', type = str, required = True, nargs='+',
+                        help = "per-segment aligned fasta")
+    parser.add_argument('--output', type = str, required = True,
+                        help = "output whole genome aligned fasta")
+    args = parser.parse_args()
+
+    records = {}
+    strain_counts = defaultdict(int)
+    insdcRe = re.compile('^[A-Z]{2}[0-9]{6}\\.[0-9]+$')
+    for fname in args.segments:
+        records[fname] = {}
+        for record in SeqIO.parse(fname, 'fasta'):
+            # Remove GenBank accession, but not SRA, from record.name
+            nameParts = record.name.split("|")
+            if len(nameParts) == 3 and insdcRe.search(nameParts[1]):
+                name = "|".join([nameParts[0], nameParts[2]])
+            else:
+                name = record.name
+            records[fname][name] = str(record.seq)
+        for key in records[fname]: strain_counts[key]+=1
+        print(f"{fname}: parsed {len(records[fname].keys())} sequences")
+
+    with open(args.output, 'w') as fh:
+        print("writing concatenated segments to ", args.output)
+        for name,count in strain_counts.items():
+            if count!=len(args.segments):
+                print(f"Excluding {name} as it only appears in {count} segments")
+                continue
+            genome = "".join([records[seg][name] for seg in args.segments])
+            print(f">{name}\n{genome}", file=fh)