#!/usr/bin/env python3

# Program Header
# Name: Gerardo Perez
# Description: A program that parses the hubCheck output into email drafts for hub authors regarding
#              missing description pages and couldn't open errors
#
# hubCheckDraftEmails.py
#
#
# Version: Python 3.6.5
#
# Provenance: this file is the post-commit state of
#   014a8807f791774e378297827c0fc7cb119607b0 (gperez2, Sun Apr 19 14:25:53 2026 -0700)
#   "Updating hubCheckDraftEmails.py cron to pull contact emails from hubPublic.email with
#    curl+grep as fallback, and to only draft one email per hub when a hub has both
#    "Couldn't open" and "missing description page" errors. No RM"
# reconstructed from a whitespace-flattened diff, with the review fixes described in the
# comments below.

import os
import re
import shlex
import shutil
import subprocess
from datetime import datetime

# Monthly (Y-M) sub-directories are created under both roots.
ARCHIVE_ROOT = "/hive/users/qateam/hubCheckCronArchive/"
WEB_ROOT = "/usr/local/apache/htdocs-genecats/qa/test-results/hubCheckCron/"

# hubPublic is populated by a separate cron. Preferred over scraping hub.txt
# because it still works when the hub itself is unreachable.
HGSQL_CMD = "/cluster/bin/x86_64/hgsql -h genome-centdb -Ne 'select hubUrl, email from hubPublic' hgcentral"

NO_EMAIL = "N/A <---------- Check: https://genecats.gi.ucsc.edu/qa/test-results/publicHubContactInfo/publicHubContact.html"

FILTER_SEPARATOR = "#############################################"
DRAFT_SEPARATOR = "##########################"

# Hubs with more problems than this get only the first MAX_ERRORS_SHOWN listed, then '...'
MAX_ERRORS_SHOWN = 5

# pattern for line that has number of problems
pattern = r"Found (\d+) problem."

# url -> email map; filled in by main() from hubPublic and read by getEmail()
hubEmails = {}

# NOTE(review): the line breaks inside the two literals below were reconstructed from a
# flattened copy of the file; the wording is unchanged. Confirm the exact layout against
# the repository before deploying.
emailIntro = """Dear UCSC Public Hub author,

I am writing on behalf of the UCSC Genome Browser. We wanted to alert you that your public track hub at the address:"""

endEmail = """
hubCheck is a command-line utility that checks files in the hub are correctly formatted.
If you would like to run the hubCheck utility on your own machine, you can download the tool from the utilities directory:
https://hgdownload.soe.ucsc.edu/downloads.html#utilities_downloads

Please update your public track hub. If you have any questions, please let us know, and we will be happy to assist.

Do not hesitate to let us know if we can help you resolve this situation, e.g. by updating the URL where the hub is hosted or possibly hosting the files on our servers.

You can reach us at genome-www@soe.ucsc.edu.

Thank you for your interest and contributions,
The UCSC Genome Browser Group
"""


def bash(cmd):
    """Input bash cmd and return stdout as a list of lines.

    Raises subprocess.CalledProcessError when the command exits non-zero.
    Callers must shell-quote any untrusted text they splice into cmd.
    """
    rawOutput = subprocess.run(cmd, check=True, shell=True, stdout=subprocess.PIPE, universal_newlines=True)
    return rawOutput.stdout.split('\n')[0:-1]


def readLines(path):
    """Return the lines of the text file at path, without their newlines."""
    with open(path) as f:
        lines = f.read().split('\n')
    # A file ending in '\n' yields one empty trailing element; drop only that one.
    if lines and lines[-1] == '':
        lines.pop()
    return lines


def checkDuplicates(items):
    """Input list to get distinct items, keeping first-seen order.

    Uses equality rather than hashing so unhashable items (e.g. lists) still work.
    """
    newList = []
    for item in items:
        if item not in newList:
            newList.append(item)
    return newList


def stringTerm(term):
    """Converts list input to string, e.g. ['a', 'b'] -> 'a, b' and [] -> ''."""
    newString = str(term)[1:-1].replace("\'", "")
    return newString


def findErrorHubs(lines):
    """Collect the hub.txt URLs that have couldn't-open / missing-description errors.

    The hub a line belongs to is taken to be the line that follows the most recent
    line containing '####' (this is what the previous `grep -A 1 '####' | tail -1`
    pipeline selected). Error lines seen before any such header are ignored.

    Returns (couldNotOpen, descPageMis): two lists of URL strings with one entry per
    matching error line, so they may contain repeats (see checkDuplicates).
    """
    couldNotOpen = []
    descPageMis = []
    hubUrl = None
    expectHeader = False
    for line in lines:
        if '####' in line:
            expectHeader = True
            hubUrl = None
            continue
        if expectHeader:
            hubUrl = line
            expectHeader = False
        if not hubUrl:
            continue
        if "Couldn't open" in line:
            couldNotOpen.append(hubUrl)
        if "missing description page" in line:
            descPageMis.append(hubUrl)
    return couldNotOpen, descPageMis


def filterOutput(lines):
    """Build the condensed report lines for every hub in the hubCheck output.

    For each line holding exactly one 'Found N problem' count, keep the line before
    it, the line itself and the first min(N, MAX_ERRORS_SHOWN) lines after it,
    followed by '...' when errors were cut, then a separator line.
    """
    kept = []
    for lineNumber, line in enumerate(lines, start=1):
        found = re.findall(pattern, line)
        if len(found) != 1:
            continue
        problems = int(found[0])
        truncated = problems > MAX_ERRORS_SHOWN
        shown = MAX_ERRORS_SHOWN if truncated else problems
        # Same window as `head -(lineNumber+shown) | tail -(shown+2)`.
        kept.extend(lines[:lineNumber + shown][-(shown + 2):])
        if truncated:
            kept.append('...')
        kept.append(FILTER_SEPARATOR)
    return kept


def hubReport(filteredLines, hubUrl):
    """Return the condensed report for hubUrl.

    Starts at the first line containing hubUrl and stops before the next line that
    contains '#'. The URL is matched as plain text, not as a regex. Returns [] when
    the hub is not present.
    """
    report = []
    started = False
    for line in filteredLines:
        if not started:
            if hubUrl not in line:
                continue
            started = True
        if '#' in line:
            break
        report.append(line)
    return report


def parseHubEmails(rows):
    """Turn tab-separated 'hubUrl<TAB>email' rows into a dict; malformed rows are skipped."""
    emails = {}
    for row in rows:
        parts = row.split('\t', 1)
        if len(parts) == 2:
            emails[parts[0]] = parts[1]
    return emails


def loadHubEmails():
    """Read the url->email map from hubPublic.

    Returns {} (after printing a warning) when hgsql fails, so that getEmail() can
    still fall back to scraping hub.txt instead of the whole run aborting.
    """
    try:
        rows = bash(HGSQL_CMD)
    except subprocess.CalledProcessError as e:
        print(f"Warning: could not read hubPublic, falling back to hub.txt scraping: {e}")
        return {}
    return parseHubEmails(rows)


def getEmail(hubUrl):
    """Gets email from hubPublic.email, falling back to scraping the live hub.txt.

    Returns a 'N/A ... Check: <contact page>' marker when neither source has one.
    """
    tableEmail = hubEmails.get(hubUrl, "").strip()
    if tableEmail and tableEmail != "NULL":
        return tableEmail
    # The URL comes from hubCheck output, so quote it before handing it to the shell.
    email = stringTerm(bash("curl -Ls " + shlex.quote(hubUrl) + " | grep '^email' | awk '{print $2}'"))
    if not email:
        return NO_EMAIL
    return email


def writeDraft(f, hubUrl, filteredLines):
    """Write one drafted email for hubUrl to the open file f."""
    f.write('Send email to: ' + getEmail(hubUrl) + '\n')
    f.write(emailIntro)
    for line in hubReport(filteredLines, hubUrl):
        f.write(line + '\n')
    f.write("\nWhen running hubCheck " + hubUrl + '\n')
    f.write(endEmail)
    f.write(DRAFT_SEPARATOR + '\n')


def main():
    """Parse this month's hubCheck output and append email drafts for hub authors."""
    # Computed once so every path agrees even if the run crosses a month boundary.
    month = datetime.now().strftime("%Y-%m")
    archiveDir = ARCHIVE_ROOT + month
    webDir = WEB_ROOT + month

    # Make directories for the month (Y-M)
    for directory in (archiveDir, webDir):
        try:
            os.makedirs(directory, exist_ok=True)
        except OSError as e:
            print(f"Error creating directory: {e}")

    filteredPath = archiveDir + "/filtered_output.txt"
    draftsPath = archiveDir + "/draftedMessages.txt"

    outputLines = readLines(archiveDir + "/hubCheck_output")
    couldNotOpen, descPageMis = findErrorHubs(outputLines)

    hubEmails.update(loadHubEmails())

    with open(filteredPath, 'a') as f:
        f.write(FILTER_SEPARATOR + '\n')
        for line in filterOutput(outputLines):
            f.write(line + '\n')

    # Re-read the whole file: it is opened in append mode above, so it can also hold
    # reports from earlier runs this month, and the first match for a hub is used.
    filteredLines = readLines(filteredPath)

    with open(draftsPath, 'a') as f:
        f.write(DRAFT_SEPARATOR + '\n')
        # Couldn't-open hubs are more actionable, so they are drafted first
        for hubUrl in checkDuplicates(couldNotOpen):
            writeDraft(f, hubUrl, filteredLines)
        # Skip hubs already drafted above (the same draft body is produced for both)
        alreadyDrafted = set(couldNotOpen)
        for hubUrl in checkDuplicates(descPageMis):
            if hubUrl in alreadyDrafted:
                continue
            writeDraft(f, hubUrl, filteredLines)

    shutil.copy(draftsPath, os.path.join(webDir, "draftedMessages.txt"))

    print("Check https://genecats.gi.ucsc.edu/qa/test-results/hubCheckCron/" + month + "/draftedMessages.txt to email hub authors about missing/broken public hub files")
    print("Archive of monthly raw data can be found here: /hive/users/qateam/hubCheckCronArchive/" + month)


if __name__ == "__main__":
    main()