4898794edd81be5285ea6e544acbedeaeb31bf78
max
  Tue Nov 23 08:10:57 2021 -0800
Fixing pointers to README file for license in all source code files. refs #27614

diff --git src/hg/logCrawl/hgAccessCrawl/hgAccessCrawl.c src/hg/logCrawl/hgAccessCrawl/hgAccessCrawl.c
index a2e1026..46f9100 100644
--- src/hg/logCrawl/hgAccessCrawl/hgAccessCrawl.c
+++ src/hg/logCrawl/hgAccessCrawl/hgAccessCrawl.c
@@ -1,595 +1,595 @@
 /* hgAccessCrawl - Go through Apache access log collecting stats on hgXXX programs. */
 
 /* Copyright (C) 2011 The Regents of the University of California 
- * See README in this or parent directory for licensing information. */
+ * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "obscure.h"
 #include "cheapcgi.h"
 #include "apacheLog.h"
 
 
 /* Stream that receives log lines with an unexpected status code from
  * non-robot clients; stays NULL unless -errLog is given. */
 FILE *errLog = NULL;
 /* When nonzero, only lines whose status equals this value go to errLog. */
 int errCode = 0;
 /* Stream that receives every non-robot /cgi-bin/ line; stays NULL unless
  * -nonRobot is given. */
 FILE *nonRoboLog = NULL;
 
 void usage()
 /* Show the command synopsis and the list of options, then abort. */
 {
 static char *helpText =
   "hgAccessCrawl - Go through Apache access log collecting stats on hgXXX programs\n"
   "usage:\n"
   "   hgAccessCrawl access_log(s)\n"
   "options:\n"
   "   -errLog=err.log - Put errors into err.log\n"
   "   -errCode=NNN - Only write out to errLog when status code matches errCode\n"
   "   -nonRobot=file - Write out non-robot CGI lines to file\n"
   "   -verbose=N  - Set verbosity level.  0 for silent, 1 for input data warnings, \n"
   "                 2 for status.\n";
 /* The text holds no conversion specifiers, so routing it through "%s"
  * yields the same message as using it as the format itself. */
 errAbort("%s", helpText);
 }
 
 /* Command line options handed to optionInit() in main().  The final
  * {NULL, 0} entry presumably marks the end of the table. */
 static struct optionSpec options[] = {
    {"errLog", OPTION_STRING},      /* -errLog=file: where error lines are written */
    {"errCode", OPTION_INT},        /* -errCode=NNN: restrict errLog to one status code */
    {"nonRobot", OPTION_STRING},    /* -nonRobot=file: where non-robot CGI lines are written */
    {NULL, 0},
 };
 
 boolean cgiHashVal(struct hash *cgiHash, char *var, char *val)
 /* Report whether cgiHash holds a CGI variable named var whose value is
  * exactly val. */
 {
 struct cgiVar *found = hashFindVal(cgiHash, var);
 boolean matches = (found != NULL);
 if (matches)
     matches = sameString(found->val, val);
 return matches;
 }
 
 struct nameCount
 /* List of name/count pairs. */
     {
     struct nameCount *next;	/* Next in list. */
     char *name;			/* What is being counted (a db or hgc 'g' value, or a fixed label). */
     int count;			/* Number of hits tallied under name. */
     };
 
 boolean isRobot(char *ip, char *program)
 /* Return TRUE if the request looks automated: either the client program
  * string matches a known robot pattern, or ip is on a fixed list of hosts. */
 {
 /* Client program strings that mark a robot when they begin with one of these. */
 static char *roboPrefixes[] =
     {
     "Java",
     "Wget",
     "AgentName",
     "libwww-perl",
     "Googlebot",
     "ia_archiver",
     "Hatena Antenna",
     "webBlat",
     "HTTP::Lite",
     "Python-urllib",
     "LWP::Simple",
     "httpunit",
     "Teleport Pro",
     "WWW-Mechanize",
     "Bio::Das::",
     "Microsoft Data Access",
     NULL,
     };
 /* Hosts/addresses always treated as robots. */
 static char *roboHosts[] =
     {
     "joiner.stanford.edu",
     "pc-glass-1.ucsd.edu",
     "pc-glass-2.ucsd.edu",
     "pc-glass-3.ucsd.edu",
     "64-170-97-98.ded.pacbell.net",
     "ce.hosts.jhmi.edu",
     "technetium.hgsc.bcm.tmc.edu",
     "62.232.24.178",
     NULL,
     };
 static struct hash *roboHash = NULL;
 int ix;
 for (ix = 0; roboPrefixes[ix] != NULL; ++ix)
     {
     if (startsWith(roboPrefixes[ix], program))
         return TRUE;
     }
 /* "Googlebot" anywhere in the string, or a bare "-", also counts. */
 if (stringIn("Googlebot", program) || sameString("-", program))
     return TRUE;
 /* Build the host lookup table the first time it is needed. */
 if (roboHash == NULL)
     {
     roboHash = hashNew(0);
     for (ix = 0; roboHosts[ix] != NULL; ++ix)
         hashAdd(roboHash, roboHosts[ix], NULL);
     }
 return hashLookup(roboHash, ip) != NULL;
 }
 
 int nameCountCmp(const void *va, const void *vb)
 /* Compare to sort based on count - biggest first. */
 {
 const struct nameCount *a = *((struct nameCount **)va);
 const struct nameCount *b = *((struct nameCount **)vb);
 return b->count - a->count;
 }
 
 struct visCount
 /* Keep track of visibility settings observed. */
     {
     struct visCount *next;
     char *track;		/* Name of track.  Not allocated here. */
     int hideCount;
     int denseCount;
     int squishCount;
     int packCount;
     int fullCount;
     int visCount;	/* Sum of all but hide. */
     };
 
 int visCountCmp(const void *va, const void *vb)
 /* Compare to sort based on visCount - biggest first. */
 {
 const struct visCount *a = *((struct visCount **)va);
 const struct visCount *b = *((struct visCount **)vb);
 return b->visCount - a->visCount;
 }
 
 boolean isTrackVal(char *val)
 /* Return TRUE if val is one of the visibility settings a track variable
  * can carry: hide, dense, squish, pack or full. */
 {
 static char *visNames[] = {"hide", "dense", "squish", "pack", "full", NULL};
 int ix = 0;
 /* Stop at the first matching name, or at the terminator if none match. */
 while (visNames[ix] != NULL && !sameString(val, visNames[ix]))
     ++ix;
 return visNames[ix] != NULL;
 }
 
 void recordTrackVis(struct cgiVar *cv, struct hash *trackHash, 
 	struct visCount **pVcList)
 /* If cv's value is a visibility setting, treat cv's name as a track and
  * bump that track's counters.  A track seen for the first time gets a new
  * visCount that is stored in trackHash and put at the head of *pVcList. */
 {
 char *setting = cv->val;
 struct visCount *tally;
 int *visibleCounter = NULL;
 
 if (!isTrackVal(setting))
     return;
 
 tally = hashFindVal(trackHash, cv->name);
 if (tally == NULL)
     {
     AllocVar(tally);
     hashAddSaveName(trackHash, cv->name, tally, &tally->track);
     slAddHead(pVcList, tally);
     }
 
 if (sameString(setting, "hide"))
     {
     tally->hideCount += 1;
     return;
     }
 
 /* Pick the counter for whichever visible mode was requested. */
 if (sameString(setting, "dense"))
     visibleCounter = &tally->denseCount;
 else if (sameString(setting, "squish"))
     visibleCounter = &tally->squishCount;
 else if (sameString(setting, "pack"))
     visibleCounter = &tally->packCount;
 else if (sameString(setting, "full"))
     visibleCounter = &tally->fullCount;
 
 /* Each visible mode also adds to the overall visible total. */
 if (visibleCounter != NULL)
     {
     *visibleCounter += 1;
     tally->visCount += 1;
     }
 }
 
 struct cgiProgram
 /* Data on one cgi program. */
     {
     struct cgiProgram *next;	/* Next in list. */
     char *name;	/* Name of program. */
     int totalHits;	/* Total hit count. */
     int roboHits;	/* Robot hit count. */
     };
 
 int cgiProgramCmp(const void *va, const void *vb)
 /* Compare to sort based on count - biggest first. */
 {
 const struct cgiProgram *a = *((struct cgiProgram **)va);
 const struct cgiProgram *b = *((struct cgiProgram **)vb);
 return b->totalHits - a->totalHits;
 }
 
 
 void hgAccessCrawl(int logCount, char *logFiles[])
 /* hgAccessCrawl - Go through Apache access log collecting stats on hgXXX programs.
  * Reads each of the logCount files named in logFiles line by line.  For
  * every request under /cgi-bin/ it counts total and robot hits per program;
  * hgTracks GET requests are further sorted into mutually exclusive
  * categories, and hgc requests are counted by their 'g' variable.  The
  * report is printed to stdout.  Along the way, lines selected by the errLog,
  * errCode and nonRoboLog globals are copied to those streams. */
 {
 struct cgiProgram *progList = NULL, *prog;
 struct hash *progHash = hashNew(0);     /* cgiProgram keyed by program name */
 int i;
 int hgTracksTotal = 0;
 int hgTracksPosted = 0;
 int hgcTotal = 0;
 int dbTotal = 0;
 int other = 0;
 int fromGateway = 0;
 int fromHgBlat = 0;
 int fromOtherBlat = 0;
 int fromHgGene = 0;
 int fromHgc = 0;
 int fromHgNear = 0;
 int fromEncode = 0;
 int fromOutside = 0;
 int zoomIn = 0;
 int zoomOut = 0;
 int dink = 0;
 int left = 0;
 int right = 0;
 int jump = 0;
 int refresh = 0;
 int gatewayMultiple = 0;
 int zoomInRuler = 0;
 int hgTracksRobot = 0;
 int undisclosedOutsideSimple = 0;
 int undisclosedOutsideWithCustom = 0;
 int resetAll = 0;
 int hideAll = 0;
 int postScriptOutput = 0;
 int addYourOwn = 0;
 struct hash *gHash = hashNew(0);        /* g (track) var for hgc */
 struct nameCount *gList = NULL, *gEl, *gNone, *gPost, *gRobot;
 struct hash *dbHash = hashNew(8);       /* db var in hgTracks after hgGateway */
 struct nameCount *dbList = NULL, *dbEl;
 struct hash *trackHash = hashNew(10);   /* visCount keyed by track name */
 struct visCount *vcList = NULL;
 
 /* Allocate dummy group for POSTed htc's. */
 AllocVar(gPost);
 gPost->name = "Posted (no CGI vars available)";
 slAddHead(&gList, gPost);
 
 /* Allocate dummy group for htc's with no 'g' variable. */
 AllocVar(gNone);
 gNone->name = "no 'g'";
 slAddHead(&gList, gNone);
 
 /* Allocate dummy group for htc robots. */
 AllocVar(gRobot);
 gRobot->name = "robot";
 slAddHead(&gList, gRobot);
 
 for (i=0; i<logCount; ++i)
     {
     char *fileName = logFiles[i];
     /* NOTE(review): lf is not closed once its file has been read; if
      * linefile.h offers a close routine it should be called after the
      * while loop - confirm and add. */
     struct lineFile *lf = lineFileOpen(fileName, TRUE);
     char *line;
     while (lineFileNext(lf, &line, NULL))
         {
         struct apacheAccessLog *ll = apacheAccessLogParse(line, lf->fileName, lf->lineIx);
         if (ll != NULL)
             {
             /* Copy unusual-status lines from non-robots to the error log,
              * optionally restricted to a single status code. */
             if (errLog != NULL 
                 && ll->status != 200 && ll->status != 304 
                 && ll->status != 206 && ll->status != 301)
                 {
                 if (errCode == 0 || errCode == ll->status)
                     {
                     if (!isRobot(ll->ip, ll->program))
                         fprintf(errLog, "%s\n", line);
                     }
                 }
             if (startsWith("/cgi-bin/", ll->url))
                 {
                 boolean thisIsRobot = isRobot(ll->ip, ll->program);
                 char *progNameStart = ll->url + strlen("/cgi-bin/");
                 char *progNameEnd = strchr(progNameStart, '?');
                 char *progName;
                 if (!thisIsRobot && nonRoboLog != NULL)
                     fprintf(nonRoboLog, "%s\n", line);
 
                 /* Program name is what follows /cgi-bin/, up to the first
                  * '?' and then up to the first '/'. */
                 if (progNameEnd == NULL)
                     progName = cloneString(progNameStart);
                 else
                     progName = cloneStringZ(progNameStart, progNameEnd-progNameStart);
                 progNameEnd = strchr(progName, '/');
                 if (progNameEnd != NULL)
                     *progNameEnd = 0;
 
                 prog = hashFindVal(progHash, progName);
                 if (prog == NULL)
                     {
                     AllocVar(prog);
                     hashAddSaveName(progHash, progName, prog, &prog->name);
                     slAddHead(&progList, prog);
                     }
                 prog->totalHits += 1;
                 if (thisIsRobot)
                     prog->roboHits += 1;
 
                 if (sameString(ll->method, "GET"))
                     {
                     struct hash *cgiHash;
                     struct cgiVar *cgiList, *cv;
                     char *cgiString = strchr(ll->url, '?');
                     int cgiCount;
                     if (cgiString == NULL)
                         cgiString = "";
                     else
                         cgiString += 1;
                     cgiString = cloneString(cgiString);
                     if (!cgiParseInput(cgiString, &cgiHash, &cgiList))
                         {
                         if (verboseLevel() > 1)
                             printf("%s\n", ll->url);
                         /* Give up on this line, but release what was
                          * allocated for it first; a bare 'continue' here
                          * would skip the cleanup at the bottom of the loop.
                          * cgiHash and cgiList are left alone because
                          * nothing here shows them being set on failure. */
                         freez(&cgiString);
                         freez(&progName);
                         apacheAccessLogFree(&ll);
                         continue;
                         }
                     cgiCount = slCount(cgiList);
                     if (startsWith("/cgi-bin/hgTracks", ll->url))
                         {
                         ++hgTracksTotal;
 
                         /* Here we try to determine the popularity of each 
                          * database (organism+assembly) by looking at
                          * initial entries from hgGateway into hgTracks. */
                         if (ll->referrer != NULL && !thisIsRobot && stringIn("hgGateway", ll->referrer))
                             {
                             cv = hashFindVal(cgiHash, "db");
                             if (cv != NULL)
                                 {
                                 char *db = cv->val;
                                 dbEl = hashFindVal(dbHash, db);
                                 if (dbEl == NULL)
                                     {
                                     AllocVar(dbEl);
                                     hashAddSaveName(dbHash, db, dbEl, &dbEl->name);
                                     slAddHead(&dbList, dbEl);
                                     }
                                 dbEl->count += 1;
                                 dbTotal += 1;
                                 }
                             }
 
                         /* Count up dense/squished/packed/full track usage */
                         if (!thisIsRobot)
                             {
                             for (cv = cgiList; cv != NULL; cv = cv->next)
                                 recordTrackVis(cv, trackHash, &vcList);
                             }
 
                         /* Count up hits in a bunch of mutually exclusive
                          * categories.  The first matching test wins, so
                          * the order of this chain matters. */
                         if (thisIsRobot)
                             ++hgTracksRobot;
                         else if (cgiHashVal(cgiHash, "Submit", "Submit"))
                             ++fromGateway;
                         else if (cgiHashVal(cgiHash, "submit", "jump"))
                             ++jump;
                         else if (cgiHashVal(cgiHash, "submit", "refresh"))
                             ++refresh;
                         else if (stringIn("hgt.out", ll->url))
                             ++zoomOut;
                         else if (stringIn("hgt.in", ll->url))
                             ++zoomIn;
                         else if (stringIn("hgt.left", ll->url))
                             ++left;
                         else if (stringIn("hgt.right", ll->url))
                             ++right;
                         else if (stringIn("hgt.dink", ll->url))
                             ++dink;
                         else if (ll->referrer != NULL && stringIn("cgi-bin/hgBlat", ll->referrer))
                             ++fromHgBlat;
                         else if (ll->referrer == NULL && 
                             hashLookup(cgiHash, "ss"))
                             ++fromOtherBlat;
                         else if (ll->referrer != NULL 
                                 && !startsWith("http://genome.ucsc.edu", ll->referrer)
                                 && !startsWith("http://genome.soe.ucsc.edu", ll->referrer))
                             ++fromOutside;
                         else if (cgiCount == 2 
                             && hashLookup(cgiHash, "position") 
                             && hashLookup(cgiHash, "hgsid"))
                             {
                             ++zoomInRuler;
                             }
                         else if (cgiCount == 3 
                             && hashLookup(cgiHash, "position") 
                             && hashLookup(cgiHash, "hgsid"))
                             {
                             ++gatewayMultiple;
                             }
                         else if (stringIn("dummyEnterButton", ll->url))
                             {
                             if (stringIn("guideline", ll->url))
                                 ++jump;
                             else
                                 ++fromGateway;
                             }
                         else if (ll->referrer == NULL && cgiCount == 2  &&
                             hashLookup(cgiHash, "db") && hashLookup(cgiHash, "position"))
                             ++undisclosedOutsideSimple;
                         else if (ll->referrer == NULL && cgiCount == 2  &&
                             hashLookup(cgiHash, "org") && hashLookup(cgiHash, "position"))
                             ++undisclosedOutsideSimple;
                         else if (ll->referrer == NULL && cgiCount == 3  &&
                             hashLookup(cgiHash, "db") 
                             && hashLookup(cgiHash, "org")
                             && hashLookup(cgiHash, "position"))
                             ++undisclosedOutsideSimple;
                         else if (ll->referrer == NULL && hashLookup(cgiHash, "hgt.customText"))
                             ++undisclosedOutsideWithCustom;
                         else if (hashLookup(cgiHash, "hgt.reset"))
                             ++resetAll;
                         else if (hashLookup(cgiHash, "hgt.hideAll"))
                             ++hideAll;
                         else if (ll->referrer != NULL && stringIn("cgi-bin/hgc", ll->referrer))
                             ++fromHgc;
                         else if (ll->referrer != NULL && stringIn("cgi-bin/hgNear", ll->referrer))
                             ++fromHgNear;
                         else if (ll->referrer != NULL && stringIn("cgi-bin/hgGene", ll->referrer))
                             ++fromHgGene;
                         else if (ll->referrer != NULL && 
                              (stringIn("ENCODE", ll->referrer) || stringIn("Encode", ll->referrer)) )
                             ++fromEncode;
                         else if (hashLookup(cgiHash, "hgt.psOutput"))
                             ++postScriptOutput;
                         else if (stringIn("Add+Your+Own", ll->url))
                             ++addYourOwn;
                         else
                             {
                             ++other;
                             if (verboseLevel() >= 3)
                                 printf("%s\n", line);
                             }
                         }
                     else if (startsWith("/cgi-bin/hgc", ll->url))
                         {
                         /* Tally hgc clicks by robot / missing 'g' / value of 'g'. */
                         cv = hashFindVal(cgiHash, "g");
                         ++hgcTotal;
                         if (thisIsRobot)
                             {
                             gRobot->count += 1;
                             }
                         else if (cv == NULL)
                             {
                             gNone->count += 1;
                             }
                         else
                             {
                             gEl = hashFindVal(gHash, cv->val);
                             if (gEl == NULL)
                                 {
                                 AllocVar(gEl);
                                 hashAddSaveName(gHash, cv->val, gEl, &gEl->name);
                                 slAddHead(&gList, gEl);
                                 }
                             gEl->count += 1;
                             }
                         }
                     hashFree(&cgiHash);
                     slFreeList(&cgiList);
                     freez(&cgiString);
                     }
                 else if (sameString(ll->method, "POST"))
                     {
                     /* No CGI variables are visible for POSTs, so only the
                      * robot/non-robot split is recorded. */
                     if (startsWith("/cgi-bin/hgc", ll->url))
                         {
                         hgcTotal += 1;
                         if (thisIsRobot)
                             gRobot->count += 1;
                         else
                             gPost->count += 1;
                         }
                     else if (startsWith("/cgi-bin/hgTracks", ll->url))
                         {
                         hgTracksTotal += 1;
                         if (thisIsRobot)
                             hgTracksRobot += 1;
                         else
                             hgTracksPosted += 1;
                         }
                     }
                 freez(&progName);
                 }
             apacheAccessLogFree(&ll);
             }
         }
     }
 
 printf("CGI Programs:\n");
 slSort(&progList, cgiProgramCmp);
 for (prog = progList; prog != NULL; prog = prog->next)
     {
     char *name = prog->name;
     /* Leave out names containing '%' or ending in "_files". */
     if (strchr(name, '%') == NULL && !endsWith(name, "_files"))
         printf("%s total %d, robot %d (%3.2f%%)\n", prog->name, prog->totalHits,
             prog->roboHits, 100.0 * prog->roboHits/prog->totalHits);
     }
 printf("\n");
 
 
 slSort(&dbList, nameCountCmp);
 printf("Total entries from hgGateway with db set: %d\n", dbTotal);
 for (dbEl = dbList; dbEl != NULL; dbEl = dbEl->next)
     {
     printf("%4.2f%% db %s: %d\n", 100.0 * dbEl->count/dbTotal, 
         dbEl->name, dbEl->count);
     }
 
 printf("hgTracksTotal: %d\n", hgTracksTotal);
 printf("hgTracksPosted: %d\n", hgTracksPosted);
 printf("fromGateway: %d\n", fromGateway);
 printf("gatewayMultiple: %d\n", gatewayMultiple);
 printf("fromHgBlat: %d\n", fromHgBlat);
 printf("fromOtherBlat: %d\n", fromOtherBlat);
 printf("fromHgNear: %d\n", fromHgNear);
 printf("fromHgc: %d\n", fromHgc);
 printf("fromHgGene: %d\n", fromHgGene);
 printf("zoomIn: %d\n", zoomIn);
 printf("zoomOut: %d\n", zoomOut);
 printf("dink: %d\n", dink);
 printf("left: %d\n", left);
 printf("right: %d\n", right);
 printf("jump: %d\n", jump);
 printf("refresh: %d\n", refresh);
 printf("zoomInRuler: %d\n", zoomInRuler);
 printf("fromOutside: %d\n", fromOutside);
 printf("undisclosedOutsideSimple: %d\n", undisclosedOutsideSimple);
 printf("undisclosedOutsideWithCustom: %d\n", undisclosedOutsideWithCustom);
 printf("robot: %d\n", hgTracksRobot);
 printf("resetAll: %d\n", resetAll);
 printf("hideAll: %d\n", hideAll);
 printf("fromEncode: %d\n", fromEncode);
 printf("postScriptOutput: %d\n", postScriptOutput);
 printf("addYourOwn: %d\n", addYourOwn);
 printf("other: %d\n", other);
 printf("\n");
 
 #ifdef OLD /* Sadly track visibilities are posted now so we don't know.
             * Re-enabling this also needs a 'struct visCount *vc' declaration. */
 printf("\n");
 slSort(&vcList, visCountCmp);
 printf("Count of track visibility\n");
 for (vc = vcList; vc != NULL; vc = vc->next)
     {
     int total = vc->visCount + vc->hideCount;
     double scale = 100.0/total;
     printf("%s: %d visible, %4.2f%% hidden, %4.2f%% dense, "
     	   "%4.2f%% squish, %4.2f%% pack, %4.2f%% full\n",
 	   vc->track, vc->visCount, scale*vc->hideCount,
 	   scale*vc->denseCount, scale*vc->squishCount,
 	   scale*vc->packCount, scale*vc->fullCount);
     }
 #endif /* OLD */
 
 slSort(&gList, nameCountCmp);
 printf("total hgc clicks: %d\n", hgcTotal);
 for (gEl = gList; gEl != NULL; gEl = gEl->next)
     {
     /* The three placeholder groups are listed even when no hgc request
      * was seen; show 0% for them instead of dividing by a zero total. */
     double percent = 0.0;
     if (hgcTotal > 0)
         percent = 100.0 * gEl->count/hgcTotal;
     printf("%4.2f%% hgc %s: %d\n", percent, gEl->name, gEl->count);
     }
 printf("\n");
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (optionExists("errLog"))
     errLog = mustOpen(optionVal("errLog", NULL), "w");
 errCode = optionInt("errCode", 0);
 if (optionExists("nonRobot"))
     nonRoboLog = mustOpen(optionVal("nonRobot", NULL), "w");
 if (argc < 2)
     usage();
 hgAccessCrawl(argc-1, argv+1);
 return 0;
 }