20f3ae78bc51d736e9785a99a8509077cf409eb5
chmalee
  Mon Dec 20 13:44:24 2021 -0800
Get search term log miner to correctly handle non-ascii user input

diff --git src/hg/logCrawl/dbTrackAndSearchUsage/generateSearchUse.py src/hg/logCrawl/dbTrackAndSearchUsage/generateSearchUse.py
index 0f113e5..5aec153 100755
--- src/hg/logCrawl/dbTrackAndSearchUsage/generateSearchUse.py
+++ src/hg/logCrawl/dbTrackAndSearchUsage/generateSearchUse.py
@@ -1,168 +1,168 @@
 #! /usr/bin/env python3.6
 
 import re, argparse, os, gzip, sys, datetime
 from collections import Counter, defaultdict
 from urllib import parse
 
 #####
 # Define dictionaries for recording usage information
 #####
 
 # General search term dictionaries
 searchTermUsers = defaultdict(Counter)
 
 # HGVS search term dictionaries
 hgvsTermUsers = defaultdict(Counter)
 
 #####
 #####
 
 
 #####
 # Define set of Brian Lee's HGVS test search terms
 #####
 hgvsTestTerms = {"NM_000310.3(PPT1):c.271_287del17insTT", "NM_007262.4(PARK7):c.-24+75_-24+92dup",
                 "NM_006172.3(NPPA):c.456_*1delAA", "MYH11:c.503-14_503-12del", "NM_198576.3(AGRN):c.1057C>T",
                 "NM_198056.2:c.1654G>T", "NP_002993.1:p.Asp92Glu", "NP_002993.1:p.D92E",
                 "BRCA1 Ala744Cys", "NM_000828.4:c.-2G>A"}
 
 def findSearchTerms(line):
     """Find and record usage information about search terms found in log lines"""
     splitLine = line.rstrip().split(" ")
     ip = splitLine[0]
     line = parse.unquote(line)
     output = re.findall("hgt\.positionInput\=[^&]+&?", line)
     # If we find search terms in line, process them
     if output != []:
         # Stringify search terms and remove extra characters
         output = str(output).replace("&", "").replace("[", "").replace("]", "").replace("'", "")
         # Split on "=" sign and keep only part after this sign
         outputList = output.split("=")
         searchTerm = outputList[1]
         # Count up search term use per user/IP address
         searchTermUsers[searchTerm][ip] += 1
 
         # Find search terms like NM_000310.3(PPT1):c.271_287del17insTT
         hgvs = re.findall("^[N][M|R|P]_[0-9]*\.[\(\)A-Z0-9]*:[a-z]\..*", searchTerm)
         if hgvs == []:
         # Find search terms like MYH11:c.503-14_503-12del
             hgvs = re.findall("^[A-Z0-9]*\:[a-z]\..*", searchTerm)
         if hgvs == []:
         # Find search terms like BRCA1 Ala744Cys
             hgvs = re.findall("^[A-Z0-9]*\s[A-Z][a-z][a-z][0-9].*[A-Z][a-z][a-z]", searchTerm)
         # If we have HGVS results, stringify them and remove extra characters
         if hgvs != []:
             hgvs = str(hgvs).replace("&", "").replace("[", "").replace("]", "").replace("'", "")
             if hgvs in hgvsTestTerms:
                 hgvs = "** " + hgvs 
             hgvsTermUsers[hgvs][ip] += 1
 
 def processFile(fileName, outputDir):
     """Process hgTracks lines in a file"""
     # Need to process gzipped files differently
     if fileName.endswith(".gz"):
         ifh = gzip.open(fileName, "r")
     else:
         ifh = open(fileName, "r")
     for line in ifh:
         if "str" not in str(type(line)):
-            line = line.decode("ASCII")
+            line = line.decode("utf-8")
         if "positionInput" in line:
             findSearchTerms(line)
     # Output results to file
     outputToFile(searchTermUsers, os.path.join(outputDir, "searchTermsUsers.tsv"), os.path.join(outputDir, "searchTermCounts.tsv"))
     outputToFile(hgvsTermUsers, os.path.join(outputDir, "hgvsTermsUsers.tsv"), os.path.join(outputDir, "hgvsTermCounts.tsv"))
 
 def processDir(dirName, outputDir):
     """Process files in a directory one-by-one using processFile function"""
     fileNames = os.listdir(dirName)
     for log in fileNames:
         fileName = os.path.join(dirName, log)
         processFile(fileName, outputDir)
 
 
 def outputToFile(usersDict, usersOutName, countsOutName):
     """Output supplied dictionary to a tab-separated file"""
-    usersOut = open(usersOutName, "w")
-    countsOut = open(countsOutName, "w")
+    usersOut = open(usersOutName, "w", encoding="utf-8")
+    countsOut = open(countsOutName, "w", encoding="utf-8")
 
     usersOut.write("# User/IP\tSearch term\tUse count\n")
     countsOut.write("# Search term\t# of users\t# of times searched\n")
 
     for term in usersDict:
         users = 0
         totalUse = 0
         for user in usersDict[term]:
             count = usersDict[term][user]
             if count!= 1:
                 users += 1
                 totalUse += count
                 usersOut.write(str(user) + "\t" + term + "\t" + str(count) + "\n")
 
         if users != 0 or totalUse != 0:
             countsOut.write(term + "\t" +  str(users) + "\t" + str(totalUse) + "\n")
 
 def dumpToJson(data, outJsonName, outputDir):
     """Dump supplied dictionary to a JSON format file"""
     jsonOut = open(os.path.join(outputDir, outJsonName), "w")
     json.dump(data, jsonOut)
     jsonOut.close()
 
 
 def main():
     # Parse command-line arguments
     parser = argparse.ArgumentParser()
     parser.add_argument("-f","--fileName", type=str, help='input file name, must be\
 space-separated Apache access_log file')
     parser.add_argument("-d","--dirName", type=str , help='input directory name, files must\
 be space-separated Apache access_log files. No other files should be present in this directory.')
     parser.add_argument("-j","--jsonOut", action='store_true', help='output json files\
 for summary dictionaries')
     parser.add_argument("-o","--outDir", type=str, help='directory in which to place output files')
     args = parser.parse_args()
 
     # Print help message if no arguments are supplied
     if len(sys.argv) == 1:
         parser.print_help(sys.stderr)
         sys.exit(1)
 
     # File and directory options can't be used together. Catch this and exit.
     if args.fileName != None and args.dirName != None:
         print("-f/--fileName and -d/--dirName cannot be used together. Choose one and re-run.")
         sys.exit(1)
 
     # Catch it early if input file/directory doesn't exist and exit.
     if args.fileName:
         if not os.path.exists(args.fileName):
             print(args.fileName, "doesn't exist. Please run on a valid file.")
             exit(1)
     elif args.dirName:
         if not os.path.exists(args.dirName):
             print(args.dirName, "doesn't exist. Please run on a valid directory.")
             exit(1)
 
     # Setup output directory
     if args.outDir == None:
         # If an output directory is unspecified, then a new one with the current date/time is made
         currDateTime = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
         os.makedirs(currDateTime)
         outDir = currDateTime
     else:
         # Otherwise, user supplied an output directory name
         outDir = args.outDir
         if not os.path.exists(outDir):
             # If specified output directory doesn't exist, create it
             os.makedirs(outDir)
 
     # Process input to create dictionaries containing info about users per db/track/hub track
     if args.fileName:
         processFile(args.fileName, outDir)
     elif args.dirName:
         processDir(args.dirName, outDir)
 
     if args.jsonOut == True:
         dumpToJson(searchTermUsers, "searchTermUsers.json", outDir)
         dumpToJson(hgvsTermUsers, "hgvsTermUsers.json", outDir)
 
 if __name__ == "__main__":
     main()