ec4ebc312a17073e6052de6da0368acd21104806
gperez2
  Fri Dec 30 19:54:21 2022 -0800
Adding a python script to get monthly top hgSearch result terms, refs #30372

diff --git src/utils/qa/searchedTermsCron.py src/utils/qa/searchedTermsCron.py
new file mode 100755
index 0000000..d5d02df
--- /dev/null
+++ src/utils/qa/searchedTermsCron.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+
+# Program Header
+# Name:   Gerardo Perez
+# Description: A program that gets a monthly count of hgSearch result clicks from the RR/asia/euro error logs.
+#
+#
+#
+# searchedTermsCron
+#
+#
+# Development Environment: VIM - Vi IMproved version 7.4.629
+# Version: Python 3.6.5 
+
+import os
+import getpass
+import sys
+import re
+import json
+import io
+import requests
+import subprocess
+from datetime import datetime
+
+user = getpass.getuser() #NOTE(review): 'user' is not referenced anywhere else in this script -- confirm before removing
+
+def bash(cmd):
+    """Run cmd through the shell and return its stdout split on newlines, minus the piece after the last newline"""
+    completed = subprocess.run(cmd, shell=True, check=True, universal_newlines=True, stdout=subprocess.PIPE)
+    return completed.stdout.split('\n')[:-1]
+
+#Get the last 5 error logs from the RR/euro ([-5:] keeps every log when fewer than 5 are listed)
+latestLogs = bash("ls /hive/data/inside/wwwstats/RR/2022/hgw1")
+latestLogs_euro = bash("ls /hive/data/inside/wwwstats/euroNode/2022/")
+latestLogs = latestLogs[-5:]
+latestLogs_euro = latestLogs_euro[-5:]
+
+#Make a directory for the month (Y-M); quit if it already exists, any other failure raises instead of being reported as "File exists"
+try:
+    os.mkdir("/hive/users/qateam/searchedTermsCronArchive/"+datetime.now().strftime("%Y-%m"))
+except FileExistsError:
+    print("mkdir: cannot create directory /hive/users/qateam/searchedTermsCronArchive/"+datetime.now().strftime("%Y-%m")+": File exists")
+    sys.exit()
+
+#Nodes to check for error logs
+nodes = ['RR', 'asiaNode', 'euroNode']
+machines = ['hgw1','hgw2'] #Add hgw machines to check
+
+#Shell pipeline applied to every log: keep hgSearch lines, turn '?' into a tab, keep the 2nd tab field, keep lines mentioning 'search'
+trimCmd = " | grep 'hgSearch' | tr '?' '\t' | cut -f 2 | grep 'search' | uniq"
+
+#For loop that goes through the RR/euro/asia error logs, trims down the hgSearch line, and writes to a file
+with open('/hive/users/qateam/searchedTermsCronArchive/'+datetime.now().strftime("%Y-%m")+'/hgSearchTrimLogs', 'a') as f:
+    for node in nodes:
+        if node == 'RR':
+            #The 5 latest error logs for each of the rr machines
+            logPaths = [node+"/2022/"+machine+"/"+log for machine in machines for log in latestLogs]
+        elif node == 'euroNode':
+            logPaths = [node+"/2022/"+log for log in latestLogs_euro]
+        else:
+            #asiaNode has no pre-built log list, so list its directory here rather than relying on
+            #a 'log' value left over from the RR loop (which would read at most one log)
+            nodeLogs = bash("ls /hive/data/inside/wwwstats/"+node+"/2022/")[-5:]
+            logPaths = [node+"/2022/"+log for log in nodeLogs]
+        for logPath in logPaths:
+            for line in bash("zcat /hive/data/inside/wwwstats/"+logPath+trimCmd):
+                f.write(line+'\n')
+
+#Remove exact duplicate lines (sort | uniq), so a repeated hgsid + search is kept once, and save the list to a variable
+search_lines= bash("cat /hive/users/qateam/searchedTermsCronArchive/"+datetime.now().strftime("%Y-%m")+"/hgSearchTrimLogs | sort | uniq ")
+
+#For loop that removes the 2nd '&' field (presumably hgsid -- confirm against the log format) and counts the search term
+searches_count = {} #maps lower-cased term string -> number of lines that produced it
+for term in search_lines:
+    if len(term.split('&'))>2: #lines with fewer than 3 '&'-separated fields are skipped
+        term=term.split('&')
+        term.pop(1) #drop the 2nd field
+        term.reverse() #reverse the order of the remaining fields
+        term=str(term)[1:-1] #list repr without the surrounding brackets
+        term=term.split('=')
+        term=term[1:3] #keep only the 2nd and 3rd '='-separated pieces
+        term = str(term)[1:-1]
+        term = term.replace("\"", "").replace(", 'search", "").replace("\'", "").replace(",", "") #strip the quotes, commas and ", 'search" text left by the repr round-trips
+        if term.lower() in searches_count: #counting is case-insensitive
+            searches_count[term.lower()] += 1
+        else:
+            searches_count[term.lower()] = 1
+
+#Sort the (term, count) pairs by count, largest to smallest; sorted() is stable, so terms
+#with equal counts keep the order in which they were first counted
+sorted_searches = sorted(searches_count.items(), key=lambda item: item[1], reverse=True)
+
+#The count values alone, largest to smallest
+sorted_searches_counts = [count for _term, count in sorted_searches]
+
+#Make a dictionary with the count values from largest to smallest (built in one pass, no scan of every term per count)
+sorted_searches_dict = dict(sorted_searches)
+
+#Write the sorted count values and search terms to a file
+#(the with-block closes the file even if a write fails)
+with open('/hive/users/qateam/searchedTermsCronArchive/'+datetime.now().strftime("%Y-%m")+'/searchCount.txt', 'w') as file_searches:
+    file_searches.write("count"+'\t'+"db term"+'\n')
+    file_searches.write("--------------------"'\n')
+    for k, count in sorted_searches_dict.items():
+        file_searches.write("{}\t{}\n".format(count, k))
+
+#Writes mouse sorted count values and search terms to a file
+#(an entry is written when the text before its first space is one of the names in 'mouse'; the with-block closes the file even if a write fails)
+mouse=['mm10', 'mm39']
+with open('/hive/users/qateam/searchedTermsCronArchive/'+datetime.now().strftime("%Y-%m")+'/mouseCount.txt', 'w') as mouse_searches:
+    mouse_searches.write("count"+'\t'+"db term"+'\n')
+    mouse_searches.write("--------------------"'\n')
+    for k, count in sorted_searches_dict.items():
+        if k.split(' ')[0] in mouse:
+            mouse_searches.write("{}\t{}\n".format(count, k))
+
+#Prints info regarding the cron job, top search terms, and top mouse search terms
+archiveDir = '/hive/users/qateam/searchedTermsCronArchive/'+datetime.now().strftime("%Y-%m") #computed once so every path below names the same month
+print('This cronjob outputs a monthly count of the top search terms using the hgSearch result clicks from the RR/asia/euro error logs. It only counts each hgsid occurrence once. The top search terms and mouse search terms files can be found here: '+archiveDir+'\n')
+print('Top 20 search terms from hgSearch result clicks:')
+#The head counts are 2 larger than the advertised totals because each count file starts with 2 header lines
+for term in bash('head -22 '+archiveDir+'/searchCount.txt'):
+    print(term)
+print('\nTop 15 mouse search terms from hgSearch result clicks:')
+for term in bash('head -17 '+archiveDir+'/mouseCount.txt'):
+    print(term)
+print('\nArchive of monthly raw data can be found here: /hive/users/qateam/searchedTermsCronArchive/')
+
+#Remove file that contains hgSearch lines (os.remove avoids spawning a shell just to delete one file)
+os.remove(archiveDir+"/hgSearchTrimLogs")