#!/usr/bin/env perl

# gisaidNameToCountry.pl
#
# Provenance: added (mode 100755) as
#   src/hg/utils/otto/sarscov2phylo/gisaidNameToCountry.pl
# in commit 2463467c57af3dff3d900a94ebed353bb5a749bd
# by angie, Mon Jul 12 16:33:25 2021 -0700:
#   "Get GISAID country name from sequence names, correcting a few known
#    typos and substituting Chinese cities/regions to China."
#
# Usage: reads sequence names, one per line, from the files named on the
# command line (or from STDIN when none are given) and prints one line per
# input line:
#
#   <name with spaces, apostrophes, commas and parentheses removed> TAB <country>
#
# <country> is '?' when no country can be parsed out of the name.

use warnings;
use strict;

# Chinese cities/regions that appear in the country position of a name.
# Every one of them is reported as 'China'.
my @chinaPlaces = (
    qw(
        Anhui Beijing Changde Changzhou Chongqing Foshan Fujian Fuzhou
        Gansu Ganzhou Guangdong Guangxi Guangzhou Hangzhou Harbin Hebei
        Heilongjiang Henan Hunan Jian Jiangsu Jiangxi Jingzhou Jiujiang
        Kashgar Liaoning Lishui Meizhou Nanchang Pingxiang Qingdao Shaanxi
        Shandong Shanghai Shangrao Shaoxing Shenzhen Shulan Sichuan Tianmen
        Urumqi Weifang Wuhan Xinyu Yichun Yingtan Yunnan Zhejiang
    ),
    "Lu'an",
    'Nan Chang',
);

# Maps a country field as it appears in a sequence name to the country name
# that should be reported: known typos / alternate spellings first, then the
# Chinese cities/regions above.
my %fixup = (
    'ITA'         => 'Italy',
    'FRANCE'      => 'France',
    'FRance'      => 'France',
    'MAlta'       => 'Malta',
    'PAKISTAN'    => 'Pakistan',
    'BurkinaFaso' => 'Burkina Faso',
    'HongKong'    => 'Hong Kong',
    'SouthAfrica' => 'South Africa',
    'USA-IN'      => 'USA',
    (map { ($_ => 'China') } @chinaPlaces),
);

# countryFromName($name)
#
# Returns the country parsed from a sequence name, after applying %fixup, or
# undef when the name does not have the expected shape.
#
# The country is the first '/'-terminated field that starts with an uppercase
# letter and otherwise contains only letters, spaces, apostrophes,
# underscores and hyphens.  It may be preceded by exactly one extra field
# that is either all lowercase letters/spaces or the literal "North America".
# Anything else at the start of the name (for example a leading "hCoV-19/")
# makes the name unparseable.
sub countryFromName {
    my ($name) = @_;
    return
        unless $name =~ m{^(?:[a-z ]+/|North America/)?([A-Z][a-zA-Z '_-]+)/};
    my $country = $1;
    return exists $fixup{$country} ? $fixup{$country} : $country;
}

# outputRow($line)
#
# Builds the output row (without trailing newline) for one already-chomped
# input line: the cleaned-up name, a tab, and the country or '?'.
sub outputRow {
    my ($line) = @_;

    # Tolerate DOS line endings: drop a carriage return left over after chomp.
    $line =~ s/\r$//;

    # The name is echoed with spaces, apostrophes, commas and parentheses
    # removed; the country is parsed from the unmodified name because
    # country fields may legitimately contain spaces and apostrophes.
    (my $wholeName = $line) =~ s/[ ',()]//g;

    my $country = countryFromName($line);
    return $wholeName . "\t" . (defined $country ? $country : '?');
}

while (my $line = <>) {
    # A single chomp is sufficient: a line read with the default input
    # record separator carries at most one trailing newline.
    chomp $line;
    print outputRow($line) . "\n";
}