e616b1ad06be9b542979d2224de81fa26a355ada
angie
  Tue Feb 2 11:36:05 2021 -0800
Loosen up regex for BioSample attributes with stray quotes.

diff --git src/hg/utils/otto/sarscov2phylo/bioSampleTextToTab.pl src/hg/utils/otto/sarscov2phylo/bioSampleTextToTab.pl
index 7e5a424f..897a25a 100755
--- src/hg/utils/otto/sarscov2phylo/bioSampleTextToTab.pl
+++ src/hg/utils/otto/sarscov2phylo/bioSampleTextToTab.pl
@@ -1,129 +1,129 @@
 #!/usr/bin/env perl
 
 use warnings;
 use strict;
 
 # Return true if $val is an actual attribute value, false if it is undefined, empty, or one of
 # the placeholder strings (compared case-insensitively) that mean no information was given.
 sub isReal($) {
   my ($val) = @_;
   # Test definedness and length rather than truthiness, so that a literal "0" counts as a value.
   return 0 unless (defined $val && length($val));
   my $lcVal = lc($val);
   foreach my $placeholder ("missing", "unknown", "not applicable", "not collected",
                            "not provided", "restricted access") {
     # Only an exact (whole-string) match is a placeholder; longer values containing one are real.
     return 0 if ($lcVal eq $placeholder);
   }
   return 1;
 }
 
 # Attributes accumulated for the record currently being read.  Keys beginning with "__" hold
 # values parsed from the Identifiers line or a bare EPI_ISL_ line rather than from /attr="value"
 # lines.  The hash is emptied after each record is printed.
 my %attribs = ();
 
 # Read the input (files named on the command line, or stdin) line by line.  Lines are matched
 # against a series of patterns; an "Accession: ... ID: ..." line ends a record, at which point
 # one tab-separated output row is printed.  Lines matching none of the patterns are ignored.
 while (<>) {
   chomp;
   if (/^Identifiers: BioSample: (\w+)(; Sample name: ([^;]+))?(; SRA: (\w+))?/) {
     # Identifiers line: the BioSample accession is required; sample name and SRA accession
     # are optional and stored only when present.
     my ($acc, $sampleName, $sra) = ($1, $3, $5);
     $attribs{__acc} = $acc;
     $attribs{__sampleName} = $sampleName if ($sampleName);
     $attribs{__sra} = $sra if ($sra);
   } elsif (/^Identifiers: /) {
     # Any other Identifiers line is a fatal parse error.
     die "Can't parse Identifiers line $.:\n$_\t";
-  } elsif (/^    \/([^=]+)="([^"]+)"$/) {
+  } elsif (/^    \/([^=]+)="(.+)"$/) {
     # Attribute line: four spaces, then /name="value".  The value is captured greedily up to the
     # final double-quote on the line, so stray quotes inside the value are tolerated.
     # Values that isReal rejects are not stored.
     my ($attr, $val) = ($1, $2);
     if (isReal($val)) {
       $attribs{$attr} = $val;
     }
   } elsif (/^    \//) {
     # Any other line that looks like an attribute is a fatal parse error.
     die "Can't parse attribute line $.:\n$_\t";
   } elsif (/^(EPI_ISL_\d+)/) {
     # A line beginning with an EPI_ISL_ ID is kept as the last-resort value for the epiId column.
     $attribs{__epi} = $1;
   } elsif (/^Accession: (\w+)\sID: (\d+)/) {
     # Last line of record; reconcile whatever attributes were accumulated with the columns that
     # we want to define.
     my ($acc, $gi) = ($1, $2);
     # The accession here must agree with the one from this record's Identifiers line.
     die "acc mismatch '$acc' vs. '$attribs{__acc}'" if ($acc ne $attribs{__acc});
     # Name: first attribute present, in this order of preference; the sample name from the
     # Identifiers line is the last resort.
     my $name = "";
     if (exists $attribs{"sample name"}) {
       $name = $attribs{"sample name"};
     } elsif (exists $attribs{"Submitter Id"}) {
       $name = $attribs{"Submitter Id"};
     } elsif (exists $attribs{strain}) {
       $name = $attribs{strain};
     } elsif (exists $attribs{isolate}) {
       $name = $attribs{isolate};
     } elsif (exists $attribs{title}) {
       $name = $attribs{title};
     } elsif (exists $attribs{"virus identifier"}) {
       $name = $attribs{"virus identifier"};
     } elsif (exists $attribs{__sampleName}) {
       $name = $attribs{__sampleName};
     }
     # Strip leading prefixes from the name; the substitutions are applied in sequence, so
     # several stacked prefixes can be removed.
     $name =~ s@^SARS-Co[Vv]-2/@@;
     $name =~ s@^hCo[Vv]-19/@@;
     $name =~ s@^[Hh]uman/@@;
     $name =~ s@^Severe acute respiratory syndrome coronavirus 2/@@;
     # NOTE(review): unlike the patterns above, this one requires a leading '/' -- confirm that
     # this is intended.
     $name =~ s@^/North America/@@;
     # Date: taken from receipt date and/or collection date; on equal length, collection date wins.
     my $date = "";
     if (exists $attribs{"receipt date"} && exists $attribs{"collection date"}) {
       # Use the longer of the two in case one is just "2020"
       if (length($attribs{"receipt date"}) > length($attribs{"collection date"})) {
         $date = $attribs{"receipt date"};
       } else {
         $date = $attribs{"collection date"};
       }
     } elsif (exists $attribs{"receipt date"}) {
       $date = $attribs{"receipt date"};
     } elsif (exists $attribs{"collection date"}) {
       $date = $attribs{"collection date"};
     }
     # Lab: first attribute present, in this order of preference.
     my $lab = "";
     if (exists $attribs{"collecting institution"}) {
       $lab = $attribs{"collecting institution"};
     } elsif (exists $attribs{"collected by"}) {
       $lab = $attribs{"collected by"};
     } elsif (exists $attribs{"INSDC center name"}) {
       $lab = $attribs{"INSDC center name"};
     }
     my $author = "";
     if (exists $attribs{"collector name"}) {
       $author = $attribs{"collector name"};
     }
     my $country = "";
     if (exists $attribs{"geographic location"}) {
       $country = $attribs{"geographic location"};
     }
     # Locale: use the dedicated region/locality attribute if present; otherwise, if the
     # geographic location contains a colon, split it into country and locale.  The first (.*)
     # is greedy, so with several colons the split happens at the last one.
     # NOTE(review): confirm that splitting at the last colon (not the first) is intended.
     my $locale = "";
     if (exists $attribs{"geographic location (region and locality)"}) {
       $locale = $attribs{"geographic location (region and locality)"};
     } elsif ($country =~ m/(.*):\s*(.*)/) {
       ($country, $locale) = ($1, $2);
     }
     my $hostId = "";
     if (exists $attribs{"host subject id"}) {
       $hostId = $attribs{"host subject id"};
     }
     my $sraId = "";
     if (exists $attribs{__sra}) {
       $sraId = $attribs{__sra};
     }
     # epiId: first attribute present, in this order of preference; the ID from a bare EPI_ISL_
     # line is the last resort.
     my $epiId = "";
     if (exists $attribs{gisaid_accession}) {
       $epiId = $attribs{gisaid_accession};
     } elsif (exists $attribs{gisaid_accession_id}) {
       $epiId = $attribs{gisaid_accession_id};
     } elsif (exists $attribs{gisaid}) {
       $epiId = $attribs{gisaid};
     } elsif (exists $attribs{"gisaid id"}) {
       $epiId = $attribs{"gisaid id"};
     } elsif (exists $attribs{"gisaid accession id"}) {
       $epiId = $attribs{"gisaid accession id"};
     } elsif (exists $attribs{subgroup}) {
       $epiId = $attribs{subgroup};
     } elsif (exists $attribs{__epi}) {
       $epiId = $attribs{__epi};
     }
 
     # Emit one tab-separated row per record, then start the next record with no attributes.
     # Columns with no value are printed as empty strings.
     print join("\t", $gi, $acc, $name, $date, $lab, $author, $country, $locale,
                $hostId, $sraId, $epiId) ."\n";
     %attribs = ();
   } elsif (/^Accession: /) {
     # Any other Accession line is a fatal parse error.
     die "Can't parse Accession line $.:\n$_\t";
   }
 }