bbd1e55c6e3053e1bbf28a9fabe36335a9a5f387
lrnassar
  Mon Oct 28 16:29:51 2024 -0700
Adding new cron that counts occurrences of various keywords in the error logs, refs #33007

diff --git src/utils/qa/errorLogKeywordSearch.py src/utils/qa/errorLogKeywordSearch.py
new file mode 100755
index 0000000..1cb9abf
--- /dev/null
+++ src/utils/qa/errorLogKeywordSearch.py
@@ -0,0 +1,155 @@
+# Looks through the error logs and graphs the occurrence of various keywords
+
+import matplotlib
+#Don't try to display the plot
+matplotlib.use('Agg')
+
+import datetime
+from collections import OrderedDict
+import getpass
+import subprocess
+import matplotlib.pyplot as plt
+import matplotlib.ticker as mticker
+import matplotlib.dates as mdates
+
+def bash(cmd):
+    """Run cmd through the shell and return everything it printed.
+
+    stderr is folded into stdout, so the returned text contains both streams.
+
+    Raises:
+        RuntimeError: when the command exits with a non-zero status; the message
+            carries the command, its exit code and its output.
+    """
+    try:
+        completed = subprocess.run(cmd, shell=True, check=True, universal_newlines=True,
+                                   stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    except subprocess.CalledProcessError as failure:
+        template = "command '{}' return with error (code {}): {}"
+        raise RuntimeError(template.format(failure.cmd, failure.returncode, failure.output))
+    return completed.stdout
+
+def bashNoErrorCatch(cmd):
+    """Run cmd through the shell and return its output as a list of lines.
+
+    A non-zero exit status is not treated as an error, since grep returns exit
+    code 1 when no match is found; in that case an empty list is returned.
+    stderr is folded into stdout. Trailing whitespace is stripped before the
+    split, so a successful command that prints nothing yields [''].
+    """
+    try:
+        rawBashOutput = subprocess.run(cmd, check=True, shell=True,
+                                       stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
+    except subprocess.CalledProcessError:
+        # Only a failing exit status is swallowed; Ctrl-C and SystemExit still propagate
+        return([])
+    return(rawBashOutput.stdout.rstrip().split("\n"))
+
+def copyLatestLogs():
+    """Symlink the most recent error logs into /hive/users/<user>/ErrorLogs/.
+
+    The log file names are taken from the current year's RR/hgw1 directory; files
+    with those same names are then linked for every RR machine and for each
+    non-RR node.
+
+    Returns:
+        (user, latestLogs): the invoking user name and the list of log file names linked.
+
+    Raises:
+        RuntimeError: from bash() when the ls or any ln command fails.
+    """
+    user = getpass.getuser()
+
+    # wwwstats is organised by year; the listing and the link targets must use the same one
+    year = str(datetime.datetime.today().year)
+
+    # Get latest error logs from the RR: up to ten names from the end of the listing,
+    # skipping the very last entry
+    # NOTE(review): presumably the newest log is skipped because it is still incomplete - confirm
+    latestLogs = bash('ls /hive/data/inside/wwwstats/RR/'+year+'/hgw1').rstrip().split("\n")
+    latestLogs = latestLogs[-11:-1]
+
+    nodes = ['RR', 'asiaNode', 'euroNode'] #Add nodes with error logs, nodes can be added or removed
+    machines = ['hgw1','hgw2'] #Add hgw machines to check
+
+    linkDir = '/hive/users/'+user+'/ErrorLogs/'
+    for node in nodes:
+        if node == 'RR':
+            # RR logs live one level deeper, in a directory per machine
+            for machine in machines:
+                for log in latestLogs:
+                    bash("ln -sf /hive/data/inside/wwwstats/RR/"+year+"/"+machine+"/"+log+' '+linkDir+node+machine+log)
+        else:
+            for log in latestLogs:
+                bash("ln -sf /hive/data/inside/wwwstats/"+node+"/"+year+"/"+log+' '+linkDir+node+log)
+    return(user,latestLogs)
+
+def createDicOfSearchTerms():
+    """Build the list of report definitions that are searched for in the logs.
+
+    Each entry is a dict with four keys:
+        label         - short name, used as the graph title and y-axis label
+        description   - longer explanation, shown in the graph caption
+        value         - empty list, filled later with one count per log
+        searchKeyWord - shell pipeline appended after "zcat <logs> | "; it must print a single integer
+
+    Returns:
+        A new list of dicts, in the order the graphs are generated.
+    """
+    totalLinesInLog = dict(label='Total lines in logs', description='Total number of lines seen in the logs', value=[], searchKeyWord="wc -l")
+    # Raw strings keep the backslash in "\[" for grep without relying on an invalid Python escape
+    totalUniqueIPs = dict(label='Total unique IPs', description='Total number of unique IPs without port number, e.g. N.N.N and not N.N.N:NNN', value=[], searchKeyWord=r'grep "\[client" | cut -f4 -d "]" | cut -f3 -d " " | cut -f1 -d ":" | sort | uniq | wc -l')
+    totalUniqueIPsSubnets = dict(label='Total unique IP subnets', description='Total number of unique IPs with only partial subnet, e.g. NNN.NNN and not NNN.NNN.N.NN', value=[], searchKeyWord=r'grep "\[client" | cut -f4 -d "]" | cut -f3 -d " " | cut -f1 -d ":" | cut -f1-2 -d "." | sort | uniq | wc -l')
+    totalUniqueHgsids = dict(label='Total unique hgsIDs', description='Total number of unique hgsIDs', value=[], searchKeyWord=r"grep 'hgsid' | sed -n 's/.*[?&]hgsid=\([0-9A-Za-z_]*\).*/\1/p' | sort | uniq | wc -l")
+    totalLoadedSessions = dict(label='Total loaded sessions', description='Total number of loaded sessions', value=[], searchKeyWord=' grep "CGI_TIME: hgTracks" | grep "/cgi-bin/hgSession?" | wc -l')
+    totalSavedCTs = dict(label='Total saved CTs', description='Total number of saved custom tracks', value=[], searchKeyWord='grep "customTrack: saved" | wc -l')
+    totalCTerrors = dict(label='Total CT errors', description='Total number of custom track load errors', value=[], searchKeyWord='grep "hgCustom load error" | wc -l')
+    totalStackDumps = dict(label='Total stack dumps', description='Total number of stack dumps', value=[], searchKeyWord='grep "Stack dump" | wc -l')
+    totalTryingToAllocate = dict(label='Total 500Mb allocate memory', description="Happens if code tries to allocate a chunk bigger than hard-wired limit of 500m. Could indicate naughty CGI", value=[], searchKeyWord='grep "needMem: trying to allocate" | wc -l')
+    totalOutOfMemory = dict(label='Total out of memory', description='Happens if malloc() fails because the OS native limits (or hg.conf maxMem limits)', value=[], searchKeyWord='grep "needMem: Out of memory" | wc -l')
+    totalHogExits = dict(label='hogExit', description='hogExit: Total number of people that hit the bottleneck', value=[], searchKeyWord='grep "hogExit" | wc -l')
+    totalHgCollectionsExpire = dict(label='hgCollections', description='Total number of expired hgCollections', value=[], searchKeyWord='grep "Track Collections expire 48" | wc -l')
+    totalWarnTimings = dict(label='warnTiming', description='warnTiming: Number of people that hit the warnSeconds hg.conf var. Warns them about image taking too long to load', value=[], searchKeyWord='grep "warnTiming" | wc -l')
+
+    # The order of this list is the order of the numbered graphs on the results page
+    itemsToFind = [totalLinesInLog,totalUniqueIPs,totalUniqueIPsSubnets,totalUniqueHgsids,totalLoadedSessions,totalSavedCTs,totalCTerrors,totalHgCollectionsExpire,totalHogExits,totalStackDumps,totalTryingToAllocate,totalOutOfMemory,totalWarnTimings]
+    return(itemsToFind)
+
+def searchForTermsInLogs():
+    """Count every search term in each of the latest logs.
+
+    Symlinks the logs with copyLatestLogs(), then for each log name pipes the
+    zcat output of all matching links through each term's shell pipeline and
+    appends the integer result to that term's 'value' list. The symlinks are
+    removed afterwards, including when a search fails part-way through.
+
+    Returns:
+        (user, latestLogs, itemsToFind): the user name, the log file names that
+        were searched, and the search-term dicts with their 'value' lists filled
+        (one count per log, in latestLogs order).
+
+    Raises:
+        RuntimeError: from bash() when a shell command fails.
+        ValueError: when a pipeline prints something that is not an integer.
+    """
+    user,latestLogs = copyLatestLogs()
+    itemsToFind = createDicOfSearchTerms()
+    logDir = "/hive/users/"+user+"/ErrorLogs/"
+
+    try:
+        for log in latestLogs:
+            # The glob matches this log's symlink for every node and machine
+            logPath = "zcat "+logDir+"*"+log+" | "
+            for searchTerm in itemsToFind:
+                searchTerm['value'].append(int(bash(logPath+searchTerm['searchKeyWord'])))
+    finally:
+        # Always clear the links so a failed run does not leave stale ones behind
+        bash("rm "+logDir+"*")
+    return(user,latestLogs,itemsToFind)
+
+def generateGraphs(user,latestLogs,itemsToFind):
+    """Plot one graph per search term and publish them behind an index.html.
+
+    Args:
+        user: user name; selects the output directory and where the results are published.
+        latestLogs: log file names; the second dot-separated field of each name is
+            parsed as a YYYYMMDD date for the x axis.
+        itemsToFind: search-term dicts whose 'value' lists hold one count per log.
+
+    The PNGs (1.png, 2.png, ...) and index.html are written to
+    /hive/users/<user>/errorLogSearchCronResults/<firstDate>-<lastDate>/ and then
+    symlinked into a web-visible directory, whose URL is printed.
+
+    Raises:
+        RuntimeError: from bash() when a mkdir or ln command fails.
+        IndexError: when latestLogs is empty.
+    """
+    logDates = [log.split(".")[1] for log in latestLogs]
+    dateRange = str(logDates[0])+"-"+str(logDates[-1])
+    saveDir = "/hive/users/"+user+"/errorLogSearchCronResults/"+dateRange
+    bash("mkdir -p "+saveDir)
+
+    # x axis values are the same for every graph, so parse the dates once
+    x_dates = [datetime.datetime.strptime(date, "%Y%m%d") for date in logDates]
+
+    # The with-block closes index.html even if plotting raises
+    with open(saveDir+"/index.html",'w') as htmlPageOutput:
+        for n, report in enumerate(itemsToFind, start=1):
+            # plotting the points, y axis values are the per-log counts
+            plt.plot(x_dates, report['value'], marker='o')
+
+            # Format the x-axis to show dates, with one point per week
+            plt.gca().xaxis.set_major_locator(mdates.WeekdayLocator())  # Major ticks: weekly
+            plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y%m%d'))  # Format as "YYYYMMDD"
+
+            # Rotate date labels for better readability
+            plt.gcf().autofmt_xdate()
+
+            plt.xlabel('Error log week yearMonthDay')
+            plt.ylabel(report['label'])
+            # Put a tick on every log date
+            plt.xticks(x_dates)
+            plt.title(report['label'])
+
+            # Add a caption below the axes with the description and the exact search used
+            plt.text(0.5, -0.35, report['description']+" \nsearch term: "+report['searchKeyWord'], ha='center', va='center', fontsize=10, transform=plt.gca().transAxes)
+
+            # Ensure the figure is fully rendered before saving
+            plt.gcf().canvas.draw()
+
+            # Save the plot to a file and reference it from the index page
+            plt.savefig(saveDir + "/" + str(n) + ".png", bbox_inches='tight')
+            htmlPageOutput.write('<img src="'+str(n) + '.png">')
+
+            # Clear the current plot to avoid overlaps with the next plot
+            plt.clf()
+
+    # qateam publishes to genecats; everyone else to their own public_html on hgwdev
+    if user == 'qateam':
+        publishDir = "/usr/local/apache/htdocs-genecats/qa/test-results/errorLogSearchResults/"+dateRange
+        resultsUrl = "https://genecats.gi.ucsc.edu/qa/test-results/errorLogSearchResults/"
+    else:
+        publishDir = "/cluster/home/"+user+"/public_html/cronResults/errorLogSearchResults/"+dateRange
+        resultsUrl = "https://hgwdev.gi.ucsc.edu/~"+user+"/cronResults/errorLogSearchResults/"+dateRange+"/"
+
+    bash("mkdir -p "+publishDir)
+    bash("ln -sf "+saveDir+"/* "+publishDir+"/")
+    print("See the latest error log search results over the last 10 weeks:\n")
+    print(resultsUrl)
+    
+def main():
+    """Search the latest error logs for all keywords, then graph and publish the counts."""
+    user,latestLogs,itemsToFind = searchForTermsInLogs()
+    generateGraphs(user,latestLogs,itemsToFind)
+
+# Only run when executed as a script (e.g. from cron), not when the module is imported
+if __name__ == "__main__":
+    main()