01859939a397ed998e752ad25c54cdd22742a49c angie Fri Dec 20 15:00:05 2024 -0800 Lots of additions for H5N1 outbreak work. * Add Andersen Lab's assemblies of USDA SRA data from 2024 H5N1 outbreak * Add Bloom Lab metadata from Deep Mutational Scanning (DMS) experiments on H5N1 HA (referenced to American Wigeon 2021 vaccine strain) and PB2 * Add build of concatenated segments from the 2024 H5N1 outbreak * Better handling of serotype and segment in INSDC metadata diff --git src/hg/utils/otto/fluA/processMetadata.pl src/hg/utils/otto/fluA/processMetadata.pl new file mode 100755 index 0000000..7e0df43 --- /dev/null +++ src/hg/utils/otto/fluA/processMetadata.pl @@ -0,0 +1,148 @@ +#!/usr/bin/env perl + +# Parse influenza A metadata downloaded from GenBank vi an NCBI Virus query. Most rows have +# values in the date, serotype, strain and segment columns, but when they don't, try to extract them +# from the verbose GenBank title. + +use warnings; +use strict; + +sub strainFromName($$) { + my ($isolate, $title) = @_; + if ($isolate =~ m@^A/\w+.*/\d+ ?\(?H\d+(N\d+[^)^ ]*)?@) { + return $isolate; + } elsif ($title =~ m@\b(A/\w+.*/\d+ ?_?\(?H\d+(N\d+[^)^ ]*\)?)?)@ || + $title =~ m@\b(A/\w+.*\(H\d+N\d+\))@) { + return $1; + } elsif ($isolate =~ m@^A/\w+.*/\d+@) { + return $isolate; + } elsif ($title =~ m@\b(A/\w+.*/\d+)@) { + return $1; + } elsif ($isolate =~ m@/@) { + return $isolate; + } elsif ($isolate) { + return $isolate; + } + if ($isolate || ! $title =~ /^Influenza A virus genome assembly/) { + print STDERR "No dice for isolate='$isolate', title='$title'\n"; + } + return ""; +} # strainFromName + + +sub serotypeFromName($) { + my ($strain) = @_; + if ($strain =~ m@\((H\d+N\d+)\).*\((H\d+N\d+)\)@) { + if ($1 eq $2) { + return $1; + } else { + # Recombinant of different serotypes, submitter didn't specify serotype --> unknown + return ""; + } + } elsif ($strain =~ m@\((H\d+(N\d+[^)^ ]*)?)\)@) { + return $1; + } elsif ($strain =~ m@\b(H\d+(N\d+[^)^ ]*)?)@) { + return $1; + } + return ""; +} # serotypeFromName + + +sub segmentFromTitle($) { + my ($title) = @_; + if ($title =~ m/ha?ea?m?m[ae]g?glut/i || $title =~ m/surface glycoprotein/i) { + return 4; + } elsif ($title =~ m/neurami/i) { + return 6; + } elsif ($title =~ m/polymerase basic (protein|subunit) 2/i || + $title =~ m/polymerase (protein )?2/i) { + return 1; + } elsif ($title =~ m/polymerase basic (protein|subunit) 1/i || + $title =~ m/polymerase (protein )?1/i) { + return 2; + } elsif ($title =~ m/(polymerase acid|polymerase (protein ?)A)/i) { + return 3; + } elsif ($title =~ m/(nucleoprotein|nucleocapsid)/i) { + return 5; + } elsif ($title =~ m/matrix/i) { + return 7; + } elsif ($title =~ m/non-?structural protein/i) { + return 8; + } elsif ($title =~ m/\bPB2|P2\b/i) { + return 1; + } elsif ($title =~ m/\bPB1\b/i) { + return 2; + } elsif ($title =~ m/\bPA\b/i) { + return 3; + } elsif ($title =~ m/\bHA[12]?\b/i) { + return 4; + } elsif ($title =~ m/\bNP\b/i) { + return 5; + } elsif ($title =~ m/\bNA\b/i) { + return 6; + } elsif ($title =~ m/\bM[12A]?\b/i) { + return 7; + } elsif ($title =~ m/\bNS[12]?\b/i) { + return 8; + } + return ""; +} # segmentFromName + + +sub yearFromName($) { + my ($strain) = @_; + if ($strain =~ m@/(\d{4})\b[^/]*$@) { + return $1; + } elsif ($strain =~ m@/(\d{2})\b[^/]*$@) { + my $y2 = $1; + if ($y2 > 24) { + return 1900 + $y2; + } + } + return ""; +} # yearFromName + +my %segName = ( '1' => 'PB2', + '2' => 'PB1', + '3' => 'PA', + '4' => 'HA', + '5' => 'NP', + '6' => 'NA', + '7' => 'MP', + '8' => 'NS', + 'M' => 'MP', + 'MA'=> 'MP' + ); + + +while (<>) { + chomp; + my @w = split(/\t/); + # If trailing columns are empty, that makes too few columns; make sure we have the right number. + my $colCount = @w; + if ($colCount < 17) { + for (my $i = $colCount; $i < 17; $i++) { + $w[$i] = ""; + } + } + my ($dateIx, $strainIx, $serotypeIx, $segmentIx) = (4, 14, 15, 16); + my ($acc, $isolate, $title) = ($w[0], $w[1], $w[11]); + if (! $w[$strainIx]) { + $w[$strainIx] = strainFromName($isolate, $title); + } + if (! $w[$serotypeIx] || $w[$serotypeIx] eq "A") { + $w[$serotypeIx] = serotypeFromName($w[$strainIx]); + } + if (! $w[$segmentIx]) { + $w[$segmentIx] = segmentFromTitle($title); + } + $w[$segmentIx] =~ s/RNA //; + $w[$segmentIx] =~ s/segment //; + if (exists $segName{$w[$segmentIx]}) { + $w[$segmentIx] = $segName{$w[$segmentIx]}; + } + if (! $w[$dateIx]) { + $w[$dateIx] = yearFromName($w[$strainIx]); + } + print join("\t", @w) . "\n"; +}