20f3ae78bc51d736e9785a99a8509077cf409eb5 chmalee Mon Dec 20 13:44:24 2021 -0800 Get search term log miner to correctly handle non-ascii user input diff --git src/hg/logCrawl/dbTrackAndSearchUsage/generateSearchUse.py src/hg/logCrawl/dbTrackAndSearchUsage/generateSearchUse.py index 0f113e5..5aec153 100755 --- src/hg/logCrawl/dbTrackAndSearchUsage/generateSearchUse.py +++ src/hg/logCrawl/dbTrackAndSearchUsage/generateSearchUse.py @@ -54,49 +54,49 @@ if hgvs != []: hgvs = str(hgvs).replace("&", "").replace("[", "").replace("]", "").replace("'", "") if hgvs in hgvsTestTerms: hgvs = "** " + hgvs hgvsTermUsers[hgvs][ip] += 1 def processFile(fileName, outputDir): """Process hgTracks lines in a file""" # Need to process gzipped files differently if fileName.endswith(".gz"): ifh = gzip.open(fileName, "r") else: ifh = open(fileName, "r") for line in ifh: if "str" not in str(type(line)): - line = line.decode("ASCII") + line = line.decode("utf-8") if "positionInput" in line: findSearchTerms(line) # Output results to file outputToFile(searchTermUsers, os.path.join(outputDir, "searchTermsUsers.tsv"), os.path.join(outputDir, "searchTermCounts.tsv")) outputToFile(hgvsTermUsers, os.path.join(outputDir, "hgvsTermsUsers.tsv"), os.path.join(outputDir, "hgvsTermCounts.tsv")) def processDir(dirName, outputDir): """Process files in a directory one-by-one using processFile function""" fileNames = os.listdir(dirName) for log in fileNames: fileName = os.path.join(dirName, log) processFile(fileName, outputDir) def outputToFile(usersDict, usersOutName, countsOutName): """Output supplied dictionary to a tab-separated file""" - usersOut = open(usersOutName, "w") - countsOut = open(countsOutName, "w") + usersOut = open(usersOutName, "w", encoding="utf-8") + countsOut = open(countsOutName, "w", encoding="utf-8") usersOut.write("# User/IP\tSearch term\tUse count\n") countsOut.write("# Search term\t# of users\t# of times searched\n") for term in usersDict: users = 0 totalUse = 0 for user in usersDict[term]: count = usersDict[term][user] if count!= 1: users += 1 totalUse += count usersOut.write(str(user) + "\t" + term + "\t" + str(count) + "\n") if users != 0 or totalUse != 0: