#!/usr/bin/env python3
# Program Header
# Name: Gerardo Perez
# Description: A program that gets a monthly count of hgSearch result clicks from the RR/asia/euro error logs.
#
#
#
# searchedTermsCron
#
#
# Development Environment: VIM - Vi IMproved version 7.4.629
# Version: Python 3.6.5
#
# Change history (src/utils/qa/searchedTermsCron.py, index 0366687..967c3b0):
#   fd9de5a36525c1f706433d87a293cd2bc4106bb4 gperez2 Tue Jan 2 17:46:05 2024 -0800
#   Updating the script to add an exception for January 1st, since the script
#   gets the five latest error logs within the current year's directory.
#   No Redmine.

import os
import getpass
import sys
import re
import json
import io
import subprocess
from datetime import datetime

try:
    # Not used by this script; guarded so a missing third-party package
    # cannot stop the cron job.
    import requests
except ImportError:
    requests = None

# Root of the per-month output directories written by this script.
ARCHIVE_ROOT = "/hive/users/qateam/searchedTermsCronArchive/"
# Root of the per-node error-log trees read by this script.
WWWSTATS_ROOT = "/hive/data/inside/wwwstats/"
# Nodes to check for error logs.
NODES = ['RR', 'asiaNode', 'euroNode']
# hgw machines to check under the RR node. Add machines here.
MACHINES = ['hgw1', 'hgw2']
# Databases that count as "mouse" for the mouse report.
MOUSE_DBS = ['mm10', 'mm39']
# How many of the most recent logs are read per source.
LOGS_PER_SOURCE = 5
# The two header lines written at the top of every report file.
REPORT_HEADER = ["count" + '\t' + "db term" + '\n', "--------------------" '\n']


def bash(cmd):
    """Input bash cmd and return stdout as a list of lines.

    The command runs through the shell; a non-zero exit status raises
    subprocess.CalledProcessError (check=True).
    """
    completed = subprocess.run(cmd, check=True, shell=True,
                               stdout=subprocess.PIPE, universal_newlines=True)
    # stdout ends with '\n', so the final split element is dropped.
    return completed.stdout.split('\n')[0:-1]


def log_year(now):
    """Return the year directory (as a string) whose logs should be read.

    If the month is January the previous year is used, because the script
    takes the latest logs from within a single year directory.
    """
    if now.month == 1:
        return str(now.year - 1)
    return str(now.year)


def last_logs(names, count=LOGS_PER_SOURCE):
    """Return the last *count* entries of *names* (all of them if fewer).

    A plain negative slice is used on purpose: the former
    names[len(names)-count:] produced a negative start index whenever fewer
    than *count* logs existed and silently dropped most of them.
    """
    if count <= 0:
        return []
    return names[-count:]


def log_paths(year, rr_logs, euro_logs):
    """Build the full path of every error log to scan, in node order.

    RR logs live one level deeper (one directory per hgw machine); the other
    nodes keep their logs directly in the year directory.
    """
    paths = []
    for node in NODES:
        node_dir = WWWSTATS_ROOT + node + "/" + year + "/"
        if node == 'RR':
            for machine in MACHINES:
                paths.extend(node_dir + machine + "/" + log for log in rr_logs)
        elif node == 'euroNode':
            paths.extend(node_dir + log for log in euro_logs)
        else:
            # NOTE(review): asiaNode reuses the RR/hgw1 file names, as the
            # script always did -- confirm both trees share the same names.
            paths.extend(node_dir + log for log in rr_logs)
    return paths


def hgsearch_lines(log_path):
    """Return the trimmed hgSearch query strings found in one gzipped log."""
    return bash("zcat " + log_path + " | grep 'hgSearch' | tr '?' '\t' | cut -f 2 | grep 'search' | uniq")


def search_key(line):
    """Turn one trimmed log line into a lower-cased "db term" key.

    The line is split on '&'; the first field supplies the term and the third
    field supplies the db. Returns None when the line has fewer than three
    '&'-separated fields.
    """
    fields = line.split('&')
    if len(fields) <= 2:
        return None
    # str(list)[1:-1] followed by stripping the quotes is kept exactly as the
    # script has always formatted keys, so archived counts stay comparable.
    term = str(fields[0].split('=')[1:3])[1:-1]
    db = str(fields[2].split('=')[1:3])[1:-1]
    return (db + " " + term).replace("\'", "").lower()


def count_searches(lines):
    """Count how often each "db term" key occurs in *lines*."""
    counts = {}
    for line in lines:
        key = search_key(line)
        if key is not None:
            counts[key] = counts.get(key, 0) + 1
    return counts


def rank_searches(counts):
    """Return (key, count) pairs ordered from largest to smallest count.

    sorted() is stable even with reverse=True, so keys with equal counts keep
    the order in which they were first counted.
    """
    return sorted(counts.items(), key=lambda pair: pair[1], reverse=True)


def report_lines(ranked, dbs=None):
    """Format the report: two header lines, then one "count<TAB>key" per row.

    When *dbs* is given, only keys whose first space-separated word is in
    *dbs* are included.
    """
    lines = list(REPORT_HEADER)
    for key, count in ranked:
        if dbs is None or key.split(' ')[0] in dbs:
            lines.append("{}\t{}\n".format(count, key))
    return lines


def write_report(path, lines):
    """Write *lines* to *path*, closing the file even if writing fails."""
    with open(path, 'w') as report:
        report.writelines(lines)


def print_head(lines, limit):
    """Print the first *limit* report lines without their trailing newline."""
    for line in lines[:limit]:
        print(line[:-1])


def main():
    """Collect hgSearch clicks from the latest logs and write the monthly reports."""
    # Capture the time once so every path below refers to the same month.
    now = datetime.now()
    year = log_year(now)
    month_dir = ARCHIVE_ROOT + now.strftime("%Y-%m")

    # Get the last 5 error logs from the RR/euro
    rr_logs = last_logs(bash("ls " + WWWSTATS_ROOT + "RR/" + year + "/hgw1"))
    euro_logs = last_logs(bash("ls " + WWWSTATS_ROOT + "euroNode/" + year + "/"))

    # Make a directory for the month (Y-M); an existing one means this month
    # was already processed, so stop instead of overwriting it.
    try:
        os.makedirs(month_dir)
    except FileExistsError:
        print("mkdir: cannot create directory " + month_dir + ": File exists")
        sys.exit(1)
    except OSError as err:
        # Report the real reason rather than claiming the directory exists.
        print("mkdir: cannot create directory " + month_dir + ": " + (err.strerror or str(err)))
        sys.exit(1)

    # Go through the RR/euro/asia error logs, trim down the hgSearch lines,
    # and write them to a scratch file.
    trimmed_path = month_dir + '/hgSearchTrimLogs'
    with open(trimmed_path, 'a') as trimmed:
        for path in log_paths(year, rr_logs, euro_logs):
            trimmed.writelines(line + '\n' for line in hgsearch_lines(path))

    # Remove duplicates with the same hgsid and save the list to a variable
    search_lines = bash("cat " + trimmed_path + " | sort | uniq ")

    ranked = rank_searches(count_searches(search_lines))

    # Write the sorted count values and search terms to a file
    all_lines = report_lines(ranked)
    write_report(month_dir + '/searchCount.txt', all_lines)

    # Writes mouse sorted count values and search terms to a file
    mouse_lines = report_lines(ranked, MOUSE_DBS)
    write_report(month_dir + '/mouseCount.txt', mouse_lines)

    # Prints info regarding the cron job, top search terms, and top mouse search terms
    print('This cronjob outputs a monthly count of the top search terms using the hgSearch result clicks from the RR/asia/euro error logs. It only counts each hgsid occurrence once. The top search terms and mouse search terms files can be found here: ' + month_dir + '\n')
    print('Top 20 search terms from hgSearch result clicks:')
    print_head(all_lines, 22)  # 2 header lines + 20 terms
    print('\nTop 15 mouse search terms from hgSearch result clicks:')
    print_head(mouse_lines, 17)  # 2 header lines + 15 terms
    print('\nArchive of monthly raw data can be found here: /hive/users/qateam/searchedTermsCronArchive/')

    # Remove file that contains hgSearch lines
    os.remove(trimmed_path)


if __name__ == "__main__":
    main()