2463467c57af3dff3d900a94ebed353bb5a749bd
angie
  Mon Jul 12 16:33:25 2021 -0700
Get GISAID country name from sequence names, correcting a few known typos and mapping Chinese cities/regions to China.

diff --git src/hg/utils/otto/sarscov2phylo/gisaidNameToCountry.pl src/hg/utils/otto/sarscov2phylo/gisaidNameToCountry.pl
new file mode 100755
index 0000000..4573f12
--- /dev/null
+++ src/hg/utils/otto/sarscov2phylo/gisaidNameToCountry.pl
@@ -0,0 +1,82 @@
+#!/usr/bin/env perl
+
+use warnings;
+use strict;
+
+my %fixup = ( 'ITA' => 'Italy',  # parsed name token => country to print; looked up by exact (case-sensitive) key
+              'FRANCE' => 'France',  # capitalization variants
+              'FRance' => 'France',
+              'MAlta' => 'Malta',
+              'PAKISTAN' => 'Pakistan',
+              'BurkinaFaso' => 'Burkina Faso',  # missing-space variants
+              'HongKong' => 'Hong Kong',
+              'SouthAfrica' => 'South Africa',
+              'USA-IN' => 'USA',  # drops the "-IN" suffix
+              'Anhui' => 'China',  # every remaining entry maps to China (Chinese cities/regions per the commit message)
+              'Beijing' => 'China',
+              'Changde' => 'China',
+              'Changzhou' => 'China',
+              'Chongqing' => 'China',
+              'Foshan' => 'China',
+              'Fujian' => 'China',
+              'Fuzhou' => 'China',
+              'Gansu' => 'China',
+              'Ganzhou' => 'China',
+              'Guangdong' => 'China',
+              'Guangxi' => 'China',
+              'Guangzhou' => 'China',
+              'Hangzhou' => 'China',
+              'Harbin' => 'China',
+              'Hebei' => 'China',
+              'Heilongjiang' => 'China',
+              'Henan' => 'China',
+              'Hunan' => 'China',
+              'Jian' => 'China',
+              'Jiangsu' => 'China',
+              'Jiangxi' => 'China',
+              'Jingzhou' => 'China',
+              'Kashgar' => 'China',
+              'Jiujiang' => 'China',
+              'Liaoning' => 'China',
+              'Lishui' => 'China',
+              'Lu\'an' => 'China',  # key is Lu'an; the apostrophe is escaped inside the single-quoted string
+              'Meizhou' => 'China',
+              'Nan Chang' => 'China',  # spelling with a space, listed in addition to 'Nanchang'
+              'Nanchang' => 'China',
+              'Pingxiang' => 'China',
+              'Qingdao' => 'China',
+              'Shaanxi' => 'China',
+              'Shandong' => 'China',
+              'Shanghai' => 'China',
+              'Shangrao' => 'China',
+              'Shaoxing' => 'China',
+              'Shenzhen' => 'China',
+              'Shulan' => 'China',
+              'Sichuan' => 'China',
+              'Tianmen' => 'China',
+              'Urumqi' => 'China',
+              'Weifang' => 'China',
+              'Wuhan' => 'China',
+              'Xinyu' => 'China',
+              'Yichun' => 'China',
+              'Yingtan' => 'China',
+              'Yunnan' => 'China',
+              'Zhejiang' => 'China',
+            );  # tokens that are not keys here are printed unchanged by the loop that follows
+
+# Read sequence names, one per line, from the files named on the command line (or stdin) and
+# print "<cleaned name>\t<country>" for each; the country is "?" when none can be parsed.
+while (defined(my $line = <>)) {
+  chomp $line;
+  $line =~ s/\r$//;  # also tolerate CRLF line endings
+  # The printed name has spaces, apostrophes, commas and parentheses removed.
+  (my $wholeName = $line) =~ s/[ ',()]//g;
+  my $country = '?';
+  # Parse the original line (not the cleaned name): the country is the capitalized token
+  # ending at a slash, optionally preceded by a lowercase prefix or "North America/".
+  if ($line =~ /^([a-z ]+\/|North America\/)?([A-Z][a-zA-Z '_-]+)\//) {
+    $country = exists $fixup{$2} ? $fixup{$2} : $2;  # apply %fixup corrections, if any
+  }
+  print "$wholeName\t$country\n";
+}
+