6d02c4bab47a7d7acd4625628975857667e0fc45
galt
  Wed Jun 19 15:34:29 2019 -0700
geoIp cleanup part2

diff --git src/hg/geoIp/geoIpToCountryMaxMind.c src/hg/geoIp/geoIpToCountryMaxMind.c
new file mode 100644
index 0000000..43fb6d7
--- /dev/null
+++ src/hg/geoIp/geoIpToCountryMaxMind.c
@@ -0,0 +1,227 @@
+/* program geoIpToCountryMaxMind 
+ * by Galt Barber 2011-04-15
+ * Read csv input geoip data and output format for use with genome-browser cgis 
+ * to map user IP addresses to country-code. */
+
+#include "common.h"
+#include "linefile.h"
+#include "options.h"
+#include "sqlNum.h"
+#include "hash.h"
+#include "obscure.h"
+#include "csv.h"
+#include "internet.h"
+
+#define MAXWORDS 1024
+
+/* command line option specifications: one boolean flag registered under the
+ * name "-help", followed by a {NULL, 0} entry that closes the list.
+ * main() passes this table to optionInit() and checks the flag with
+ * optionExists("-help"). */
+static struct optionSpec optionSpecs[] = {
+    {"-help"    , OPTION_BOOLEAN},
+    {NULL, 0}
+};
+
+/* Location lookup table: key is a geoname_id string, value is the country-code
+ * string for it. Created and filled by readLocations(), read by
+ * geoIpToCountry(). NULL until readLocations() has run. */
+struct hash *locHash = NULL;
+
+void usage(char *p) 
+/* Report the expected command line through errAbort.
+ * p is the program name shown in the synopsis (main passes argv[0]). */
+{
+errAbort("Usage:\n"
+    "%s GeoLite2-Country-Blocks-IPv4.csv\n"
+    "Converts each CIDR network in the blocks file to a start and end IP in decimal form\n"
+    "and prints startIp, endIp and country-code as tab-separated lines on stdout.\n"
+    "Expects GeoLite2-Country-Locations-en.csv in the current directory; skips the header line.\n"
+    ,p);
+}
+
+void readLocations()
+/* Load GeoLite2-Country-Locations-en.csv (opened by that relative name) into
+ * the global locHash: key is the geoname_id column, value is the country code.
+ * Aborts through errAbort on a row that does not have 7 fields, or on a row
+ * with an empty country code whose continent is neither AS nor EU. */
+{
+char *locName = "GeoLite2-Country-Locations-en.csv";
+// sample row: 7909807,en,AF,Africa,SS,"South Sudan",0 
+locHash = hashNew(10);
+
+struct lineFile *lf = lineFileOpen(locName, TRUE);
+char *line;
+int lineSize;
+int lineCount = 0;
+while (lineFileNext(lf, &line, &lineSize))
+    {
+    // csvParse is used because chopByString would skip the empty fields.
+    struct slName *fields = csvParse(line);
+    int fieldCount = slCount(fields);
+    if (fieldCount != 7)
+        errAbort("Invalid row found, wordCount = %d != 7", fieldCount);
+    if (lineCount++ == 0) // the 1st line holds no location record
+        continue;
+
+    // pick out the three columns that matter, by position
+    char *geoname_id = NULL;
+    char *continentCode = NULL;
+    char *countryCode = NULL;
+    struct slName *field = fields;
+    int col;
+    for (col = 0; field != NULL; ++col, field = field->next)
+        {
+        switch (col)
+            {
+            case 0:
+                geoname_id = field->name;
+                break;
+            case 2:
+                continentCode = field->name;
+                break;
+            case 4:
+                countryCode = field->name;
+                break;
+            default:
+                break;
+            }
+        }
+
+    if (sameString(countryCode, ""))
+        {
+        // Some records carry only a continent code; stand in a country from that continent.
+        if (sameString(continentCode, "AS"))
+            countryCode = "JP";  // Asia -> Japan
+        else if (sameString(continentCode, "EU"))
+            countryCode = "DE";  // Europe -> Germany
+        else
+            errAbort("unexpected country code is empty string in line #%d", lineCount);
+        }
+
+    // NOTE: the value handed to hashAdd points into fields (or at a literal),
+    // and fields is never freed in this function.
+    hashAdd(locHash, geoname_id, countryCode);
+    }
+
+lineFileClose(&lf);
+}
+
+bits32 internetPackIp(unsigned char unpacked[4])
+/* Fold four bytes into one native 32-bit number. unpacked[0] becomes the
+ * most significant byte of the result and unpacked[3] the least. */
+{
+bits32 packed = 0;
+unsigned char *byte = unpacked;
+unsigned char *stop = unpacked + 4;
+while (byte != stop)
+    {
+    // make room for the next byte, then drop it into the low 8 bits
+    packed = (packed << 8) | *byte++;
+    }
+return packed;
+}
+
+void parseCIDR(char *cidr, bits32 *pStartIp, bits32 *pEndIp)
+/* Parse an IPv4 range in CIDR form ("a.b.c.d/bits") and store the first and
+ * last address of the range, as native 32-bit values, in *pStartIp and *pEndIp.
+ * Reports an error through errAbort when the slash is missing or the prefix
+ * length is larger than 32. The cidr string itself is left unmodified. */
+{
+char *s = cloneString(cidr);   // work on a private copy so the caller's string stays intact
+char *c = strchr(s, '/');
+if (!c)
+    errAbort("expected slash char '/' in input cidr %s\n", cidr);
+*c++ = 0;   // split the copy: s is now the address part, c the prefix length
+unsigned int bits = sqlUnsigned(c);
+if (bits > 32)
+    errAbort("invalid prefix length %u in input cidr %s", bits, cidr);
+unsigned char quadIp[4];
+internetParseDottedQuad(s, quadIp);
+freeMem(s);   // the copy is no longer needed once both parts are parsed
+bits32 packedIp = internetPackIp(quadIp);  // TODO should this go in the library internet.c? 
+
+// Shifting a 32-bit value by 32 is undefined in C, so the mask for a /0
+// network is spelled out rather than computed as 0xFFFFFFFF << 32.
+bits32 netMask = 0;
+if (bits != 0)
+    netMask = ((unsigned int) 0xFFFFFFFF) << (32 - bits);
+
+*pStartIp = packedIp & netMask;                                // host bits cleared
+*pEndIp = packedIp | (netMask ^ ((unsigned int) 0xFFFFFFFF));  // host bits set
+}
+
+
+void geoIpToCountry(char *fileName) 
+/* Read the country blocks CSV named by fileName and, for every row after the
+ * first, print "startIp<TAB>endIp<TAB>countryCode" on stdout, with both IPs
+ * written as unsigned decimal numbers. Country codes are looked up in locHash,
+ * so readLocations() has to run before this. A row whose ids are both absent
+ * from locHash is reported with warn() and skipped. */
+{
+struct lineFile *lf = lineFileOpen(fileName, TRUE);
+char *row[MAXWORDS];
+char *line;
+int lineSize;
+int rowsSeen = 0;
+while (lineFileNext(lf, &line, &lineSize))
+    {
+    // columns: network,geoname_id,registered_country_geoname_id,
+    //          represented_country_geoname_id,is_anonymous_proxy,is_satellite_provider
+    // example: 1.0.0.0/24,2077456,2077456,,0,0
+    // Split with chopByChar: chopByString would skip the empty fields.
+    int fieldCount = chopByChar(line, ',', row, MAXWORDS);
+    if (fieldCount != 6)
+        errAbort("Invalid row found, wordCount = %d != 6", fieldCount);
+    if (rowsSeen++ == 0) // nothing to convert on the 1st line
+        continue;
+
+    char *network = row[0];
+    char *geoname_id = row[1];
+    char *registered_country_geoname_id = row[2];
+
+    // turn the CIDR network into its first and last address
+    bits32 startIp, endIp;
+    parseCIDR(network, &startIp, &endIp);
+
+    // try the block's own geoname_id first, then the registered country's id
+    struct hashEl *hit = hashLookup(locHash, geoname_id);
+    if (hit == NULL)
+        hit = hashLookup(locHash, registered_country_geoname_id);
+    if (hit == NULL)
+        {
+        warn("%s missing %s and %s in location lookup", network, geoname_id, registered_country_geoname_id);
+        continue;
+        }
+
+    char *countryCode = hit->val;
+    printf("%u\t%u\t%s\n", startIp, endIp, countryCode);
+    }
+
+lineFileClose(&lf);
+}
+
+
+
+int main (int argc, char *argv[]) 
+/* Entry point. Expects exactly one argument after option parsing, the blocks
+ * CSV file; otherwise, or when the "-help" option is set, calls usage().
+ * Loads the locations table, then converts the blocks file to stdout. */
+{
+optionInit(&argc, argv, optionSpecs);
+int needUsage = (argc != 2) || optionExists("-help");
+if (needUsage)
+    usage(argv[0]);
+char *blocksFile = argv[1];
+readLocations();
+geoIpToCountry(blocksFile);
+return 0; 
+}
+
+