0e7d82dd8faf78dd7e8a0a4ed8f29c9215382c37
angie
  Fri Sep 5 17:24:40 2025 -0700
Strip commas from the host component of the sequence name so that they don't break Newick parsing in UShER.

diff --git src/hg/utils/otto/fluA/updateAndersenLab.sh src/hg/utils/otto/fluA/updateAndersenLab.sh
index 0e06c87f1e1..d320013ba93 100755
--- src/hg/utils/otto/fluA/updateAndersenLab.sh
+++ src/hg/utils/otto/fluA/updateAndersenLab.sh
@@ -19,31 +19,31 @@
 # Extract sequences that are in SRA but not (yet) in GenBank.
 find ~/github/avian-influenza/fasta -name \*.fa \
 | grep -vFwf <(cut -f 1 genbank_mapping.tsv) \
 | xargs cat > sraNotGb.fa
 faSize sraNotGb.fa | head -2
 
 # Rename those sequences to look nicer in the tree and include some metadata.
 csvToTab < ~/github/avian-influenza/metadata/SraRunTable_automated.csv \
 | tail -n+2 \
 | cut -f 1,10,16,19,31 \
 | perl -wne 'chomp;
     ($run, $date, $country, $host, $sample) = split(/\t/);
     $year = $date;  $year =~ s/^(\d{4}).*/$1/;
     $host = ucfirst(lc($host));
     $host =~ s/ /-/g;
-    $host =~ s/['"'"':;\[\]()]//g;
+    $host =~ s/['"'"':;\[\](),]//g;
     $country =~ s/:.*//;
     if ($sample =~ m@^Influenza A virus (A/.*)/\d{4}\(H\dN\d\)\)?@) {
       $sample = $1;
       foreach $segment (qw/HA MP M2 NA NP NS PA PB1 PB2/) {
         print "Consensus_${run}_${segment}_cns_threshold_0.5_quality_20\t${sample}_$segment/$year|$run|$date\n";
         print "Consensus_${run}_${segment}_cns_threshold_0.75_quality_20\t${sample}_$segment/$year|$run|$date\n";
       }
     } else {
       $sample =~ s/-original$//; $sample =~ s/-repeat2?//;
       $sample =~ s/([0-9]{2}-[0-9]{6}-[0-9]{3})-(300|MTM)/$1/;
       foreach $segment (qw/HA MP M2 NA NP NS PA PB1 PB2/) {
         print "Consensus_${run}_${segment}_cns_threshold_0.5_quality_20\tA/$host/$country/${sample}_$segment/$year|$run|$date\n";
         print "Consensus_${run}_${segment}_cns_threshold_0.75_quality_20\tA/$host/$country/${sample}_$segment/$year|$run|$date\n";
       }
     }' \
@@ -54,31 +54,31 @@
 faRenameRecords sraNotGb.fa srr_renaming.tsv andersen_lab.srrNotGb.renamed.fa
 faSize andersen_lab.srrNotGb.renamed.fa | head -2
 
 mv andersen_lab.srrNotGb.renamed.fa $fluADir/
 
 # Format metadata for my build
 echo -e "strain\tgenbank_accession\tdate\tcountry\tlocation\tlength\thost\tbioproject_accession\tbiosample_accession\tsra_accession\tauthors\tpublications\tserotype\tsegment" \
         > $fluADir/andersen_lab.srrNotGb.renamed.metadata.tsv
 csvToTab < ~/github/avian-influenza/metadata/SraRunTable_automated.csv \
 | cut -f 1,5,6,9,10,16,18,19,31,33 \
 | perl -wne 'chomp;
     ($run, $proj, $biosamp, $center, $date, $country, $loc, $host, $sample, $serotype) = split(/\t/);
     $year = $date;  $year =~ s/^(\d{4}).*/$1/;
     $host = ucfirst(lc($host));
     $host =~ s/ /-/g;
-    $host =~ s/['"'"':;\[\]()]//g;
+    $host =~ s/['"'"':;\[\](),]//g;
     $country =~ s/: ?(.*)//;
     $loc =~ s/: /:/;
     $loc = $1 if ($loc eq "" && $1 ne "");
     if ($sample =~ m@^Influenza A virus (A/.*)/\d{4}\(H\dN\d\)\)?@) {
       $sample = $1;
       foreach $segment (qw/HA MP M2 NA NP NS PA PB1 PB2/) {
         print join("\t", "${sample}_$segment/$year|$run|$date", "", $date, $country, $loc, "",
                    $host, $proj, $biosamp, $run, $center, "", $serotype, $segment) . "\n";
       }
     } else {
       $sample =~ s/-original$//; $sample =~ s/-repeat2?//;
       $sample =~ s/([0-9]{2}-[0-9]{6}-[0-9]{3})-(300|MTM)/$1/;
       foreach $segment (qw/HA MP M2 NA NP NS PA PB1 PB2/) {
         print join("\t", "A/$host/$country/${sample}_$segment/$year|$run|$date", "", $date, $country, $loc, "",
                    $host, $proj, $biosamp, $run, $center, "", $serotype, $segment) . "\n";