ad04145791b59754c6a78f4dac94efdf92ede89d galt Wed Jun 19 15:29:48 2019 -0700 Updated geoIp mapping for our 3 official mirrors - RR, euro, asia with much better quality data from MaxMind. diff --git src/hg/geoIp/README.v2 src/hg/geoIp/README.v2 new file mode 100644 index 0000000..7bc30b8 --- /dev/null +++ src/hg/geoIp/README.v2 @@ -0,0 +1,209 @@ +NEW genome-asia! + +Our previous update was from 2011 when genome-euro was first made, +so we are going to update everything with fresh data. + +RESTARTING update from ground zero, +all tables backed up and cleared out on hgFixed + +wget 'http://dev.maxmind.com/static/csv/codes/iso3166.csv' -O country.csv + +To convert csv to tab-separated (removing quotes): +cat country.csv | gawk -F "," '{print $1 "\t" $2}' | sed 's/"//g' > country.tab + + +wget 'http://dev.maxmind.com/static/csv/codes/country_continent.csv' -O countryToContinent.csv +To convert csv to tab-separated: +cat countryToContinent.csv | gawk -F "," '{print $1 "\t" $2}' > countryToContinent.tab + +https://gist.github.com/nobuti/3816985 +Manually made continent.csv +It seems odd that nobody has an easy to download cvs or tab-delimited continents list. +To convert csv to tab-separated: +cat continent.csv | gawk -F "," '{print $1 "\t" $2}' > continent.tab + +I have my data, so now I load the tables: + +This is a little script that runs autoSql on each .as to get the .sql files +However these .sql files are not checked in to git since they are +so easily generated. + +./make-sql + +took the old gbNode.tab with US and euro, +and added the japan record: +vi gbNode.tab +| 3 | genome-asia.ucsc.edu | JP | Asian node of UCSC Genome Browser at the RIKEN institute of Japan | Asian Server | +changed "\" to tab char. + +made a script to create and load the .as tables +some of them actually do not have any data yet, +since the data is created by joins of other tables + +./load-tables + +note that because the script is so simple +and some of the tables do not have .tab files, +they will give errors. We will fill the empty +tables later by inserting from a join of other tables. + +gbNode +geoIpNode +geoIpCountry +country +continent +countryToContinent + +Check the integrity of the continent/country tables: + +forgot to strip off the 1st row of the csv file. +Cleanup left-over header row. +delete from countryToContinent where continentId = '"continent code"'; + +DONE maybe I need to delete A1 and A2 which are weird special categories +that are not real countries and do not have an assigned continent which is given the value "--". +| A1 | Anonymous Proxy | +| A2 | Satellite Provider | +These turn out not to be a problem. + + +mysql> select * from countryToContinent where countryId not in (select id from country); ++-----------+-------------+ +| countryId | continentId | ++-----------+-------------+ +| AN | NA | +| FX | EU | ++-----------+-------------+ + +AN Netherlands Antilles + +insert into country values ('AN', 'Netherlands Antilles'); + +insert into country values ('FX', 'France, Metropolitan'); + +mysql> select * from countryToContinent where continentId not in (select id from continent); ++-----------+-------------+ +| countryId | continentId | ++-----------+-------------+ +| A1 | -- | +| A2 | -- | +| O1 | -- | ++-----------+-------------+ + +insert into continent values ('--','not applicable'); + +mysql> select count(*) from geoIpCountry where countryId not in (select id from country); ++----------+ +| count(*) | ++----------+ +| 0 | ++----------+ + +-- + +start off with crude continent-level assignments: + +make everything 1 by default + +insert into continentToNode select id, 1 from continent; + +Europe +update continentToNode set node=2 where continentId = "EU"; + +Asia +update continentToNode set node=3 where continentId = "AS"; + +Now we just can make a join + +insert into countryToNode (select countryId, node from countryToContinent cc, continentToNode cn where cc.continentId = cn.continentId); + +# fix ones requested by Bob to be included in euro instead of defaulting to america. +# these are mostly around the middle east. +update countryToNode set node=2 where countryId in ('AE','AM','BH','CY','GE','IL','IQ','JO','KW','LB','OM','PS','QA','SA','SY','YE'); + +I compared the before and after updating with public_html/geoMap/test.csh at + http://hgwdev.soe.ucsc.edu/~galt/geoMap/countryToNodeMap.html +And it looked identical. So we did not lose anything. + +mysql> select distinct countryId from geoIpCountry where countryId not in (select distinct countryId from countryToNode); ++-----------+ +| countryId | ++-----------+ +| SS | +| CW | +| SX | +| BQ | ++-----------+ + +mysql> select * from country where id in ('SS','CW','SX','BQ'); ++----+--------------+ +| id | name | ++----+--------------+ +| BQ | Bonaire | +| CW | Curacao | +| SS | South Sudan | +| SX | Sint Maarten | ++----+--------------+ + + +So, what happened to these countries, and why are they not in countryToNode? +Shall we add them? + +insert into countryToContinent values ('SS', 'AF'); +insert into countryToContinent values ('SX', 'NA'); +insert into countryToContinent values ('CW', 'NA'); +insert into countryToContinent values ('BQ', 'NA'); + +insert into countryToNode values ('SS', '1'); +insert into countryToNode values ('SX', '1'); +insert into countryToNode values ('CW', '1'); +insert into countryToNode values ('BQ', '1'); + + +mysql> select distinct countryId from geoIpCountry where countryId not in (select distinct countryId from countryToNode); +Empty set (0.13 sec) + +Re-ran test.csh and checked again: +http://hgwdev.soe.ucsc.edu/~galt/geoMap/countryToNodeMap.html + +This means we should have full integrity. + +insert into geoIpNode (select ipStart, ipEnd, node from geoIpCountry gic, countryToNode ctn where gic.countryId=ctn.countryId); + +Query OK, 164390 rows affected (1.61 sec) +Records: 164390 Duplicates: 0 Warnings: 0 + +NOW IT IS READY! + +Note that in the future we may be putting other contries with other servers, +like we did with the middle-east countries, overriding their default continent mappings +in countryToNode. + + +Not sure yet if I am going to do this again. +It was what I did last time. +The idea was to keep a backup of these important tables. +That way perhaps hgFixed could be used when generating +other sets of testing tables for redirect. + +TODO at the end: +rename tables to real: +gbNodeReal +geoIpNodeReal + + +Make an extra backup of just the git untracked files just in case: + +mkdir /hive/data/outside/geoIp/ +gitu | xargs -I X cp X /hive/data/outside/geoIp/ + + +Make a backup of the data too: + +mkdir /hive/data/outside/geoIp/geoIpTableDumps +mkdir /hive/data/outside/geoIp/geoIpTableDumps/2016-06-23 +# sadly we need to temporarily make it write-able by all so mysqld can write dump there. +chmod 777 /hive/data/outside/geoIp/geoIpTableDumps/2016-06-23 +hgsqldump hgFixed -T 2016-06-23 continent continentToNode country countryToContinent countryToNode gbNode geoIpCountry geoIpNode +chmod 775 /hive/data/outside/geoIp/geoIpTableDumps/2016-06-23 +