src/utils/qa/hubCheckDraftEmails.py 014a8807f791774e378297827c0fc7cb119607b0

014a8807f791774e378297827c0fc7cb119607b0
gperez2
  Sun Apr 19 14:25:53 2026 -0700
Updating hubCheckDraftEmails.py cron to pull contact emails from hubPublic.email with curl+grep as fallback, and to only draft one email per hub when a hub has both "Couldn't open" and "missing description page" errors. No RM

diff --git src/utils/qa/hubCheckDraftEmails.py src/utils/qa/hubCheckDraftEmails.py
index 564e50e8c6b..b02c0bc1699 100755
--- src/utils/qa/hubCheckDraftEmails.py
+++ src/utils/qa/hubCheckDraftEmails.py
@@ -1,156 +1,169 @@
 #!/usr/bin/env python3
 
 # Program Header
 # Name:   Gerardo Perez
 # Description: A program that parses the hubCheck output into email drafts for hub authors regarding
 #              missing description pages and couldn't open errors
 #
 # hubCheckDraftEmails.py
 #
 #
 # Version: Python 3.6.5 
 #
 import os
 import getpass
 import sys
 import re
 import json
 import io
 import requests
 import subprocess
 from datetime import datetime
 
 user = getpass.getuser()
 
 def bash(cmd):
     """Run a shell command and return its stdout as a list of lines.
 
     cmd is handed to the shell unchanged (shell=True), so it must only be built
     from trusted text. A non-zero exit status raises
     subprocess.CalledProcessError (check=True). stderr is not captured.
 
     Returns [] when the command prints nothing. A single trailing newline is
     treated as the terminator of the last line; a last line that has no
     terminator is kept (it used to be silently dropped).
     """
     completed = subprocess.run(cmd, check=True, shell=True, stdout=subprocess.PIPE, universal_newlines=True)
     text = completed.stdout
     if not text:
         return []
     # Strip only the final terminator so that blank lines inside the output survive
     if text.endswith('\n'):
         text = text[:-1]
     return text.split('\n')
 
 # Make sure this month's (Y-M) archive directory and results directory exist.
 # A failure is only reported on stdout; the script keeps going.
 for baseDir in ("/hive/users/qateam/hubCheckCronArchive/",
                 "/usr/local/apache/htdocs-genecats/qa/test-results/hubCheckCron/"):
     monthDir = baseDir + datetime.now().strftime("%Y-%m")
     try:
         os.makedirs(monthDir, exist_ok=True)
     except OSError as e:
         print(f"Error creating directory: {e}")
 
 # Lists of hubs that have a "missing description page" error and a
 # "Couldn't open" error. One entry is appended per matching error line (the
 # draft loops de-duplicate later). Each entry is itself a list, the same shape
 # the former `head | grep -A 1 '####' | tail -1` pipeline returned through
 # bash(): [] before any '####' separator has been seen, otherwise one line.
 descPageMis=[]
 couldNotOpen=[]
 
 # Gets hubCheck output (read once; the loop below no longer re-reads the file
 # through a shell pipeline for every matching line)
 output_line=bash("cat /hive/users/qateam/hubCheckCronArchive/"+datetime.now().strftime("%Y-%m")+"/hubCheck_output")
 
 # currentHub tracks what the old pipeline would print at the current line:
 # the line right after the most recent '####' separator, or the separator
 # itself while it is still the last line seen. Presumably that following line
 # is the hub.txt URL; confirm against the hubCheck output format.
 currentHub=[]
 afterSeparator=False
 for line in output_line:
     line=str(line)
     if '####' in line:
         currentHub=[line]
         afterSeparator=True
     elif afterSeparator:
         currentHub=[line]
         afterSeparator=False
     if "Couldn't open" in line: #Gets each hub.txt that has the error of couldn't open
         couldNotOpen.append(list(currentHub))
     if "missing description page" in line: #Gets each hub.txt that has the error of missing description page
         descPageMis.append(list(currentHub))
 
  
 
 def checkDuplicates(list):
     """Return the distinct items of the input, keeping first-seen order.
 
     Items are compared with ==, so unhashable items such as lists are fine.
     """
     distinct = []
     for entry in list:
         if entry in distinct:
             continue
         distinct.append(entry)
     return distinct
 
 def stringTerm(term):
     """Converts list input to string.
 
     The items of a list are joined with ", ". String items are used verbatim,
     so apostrophes and backslashes inside them survive; the earlier version
     worked on repr(list), which deleted apostrophes and could leave double
     quotes or doubled backslashes in the result. Non-string items, and any
     input that is not a list, keep the historical behaviour: str() of the
     value with apostrophes removed (and, for non-list input, the first and
     last character dropped).
     """
     if isinstance(term, list):
         return ", ".join(
             part if isinstance(part, str) else str(part).replace("'", "")
             for part in term)
     return str(term)[1:-1].replace("'", "")
 
+# Build url->email map from hubPublic (populated by a separate cron). Preferred
+# over scraping hub.txt because it still works when the hub itself is unreachable.
+hubEmails = {}
+for line in bash("/cluster/bin/x86_64/hgsql -h genome-centdb -Ne 'select hubUrl, email from hubPublic' hgcentral"):
+    parts = line.split('\t', 1)
+    if len(parts) == 2:
+        hubEmails[parts[0]] = parts[1]
+
 def getEmail(hubUrl):
-    """Gets email from hubUrl"""
+    """Gets email from hubPublic.email, falling back to scraping the live hub.txt."""
+    tableEmail = hubEmails.get(hubUrl, "").strip()
+    if tableEmail and tableEmail != "NULL":
+        return tableEmail
     email=stringTerm(bash("curl -Ls "+hubUrl+" | grep '^email' | awk '{print $2}'"))
-    empty=""
-    if email==empty:
-       email="N/A <---------- Check: https://genecats.gi.ucsc.edu/qa/test-results/publicHubContactInfo/publicHubContact.html"
+    if not email:
+       return "N/A <---------- Check: https://genecats.gi.ucsc.edu/qa/test-results/publicHubContactInfo/publicHubContact.html"
     return email
 
 # Writes a shortened copy of the hubCheck output to filtered_output.txt: at
 # most 5 errors are kept per "Found N problem" summary line.
 # 1-based number of the hubCheck_output line currently being examined
 count=0
 # Pattern for the line that has the number of problems; the trailing "." is a
 # regex wildcard, so it matches whatever single character follows "problem"
 pattern = r"Found (\d+) problem."
 # Opened in append mode, so each run adds a new section to this month's file
 with open("/hive/users/qateam/hubCheckCronArchive/"+datetime.now().strftime("%Y-%m")+"/filtered_output.txt", 'a') as f:
     f.write("#############################################\n")
     # For loop that goes through each line from the hubCheck output
     for line in (bash("cat /hive/users/qateam/hubCheckCronArchive/"+datetime.now().strftime("%Y-%m")+"/hubCheck_output")):
        count=count+1
        # Checks whether the line is the number-of-problems line: the joined
        # findall() result is all digits only when exactly one count was captured
        if stringTerm(re.findall(pattern, line)).isdigit():
           if int(stringTerm(re.findall(pattern, line))) >= 6:# More than 5 problems: write the 7 lines ending at line count+5 (normally the line before this one, this line and 5 errors), then ... and ###
              for l in (bash("head -"+str(count+5)+" /hive/users/qateam/hubCheckCronArchive/"+datetime.now().strftime("%Y-%m")+"/hubCheck_output | tail -7")):
                  f.write(l+'\n')
              f.write('...\n')
              f.write("#############################################\n")
           else: # Else write all the errors within the limit
              # Window of N+2 lines ending at line count+N: normally the line
              # before this one, this line and all N errors
              for l in (bash("head -"+str(count+ int(stringTerm(re.findall(pattern, line))))+" /hive/users/qateam/hubCheckCronArchive/"+datetime.now().strftime("%Y-%m")+"/hubCheck_output | tail -"+str(int(stringTerm(re.findall(pattern, line)))+2))):
                 f.write(l+'\n')
              f.write("#############################################\n")
 
 
 emailIntro= """Dear UCSC Public Hub author,
 
 I am writing on behalf of the UCSC Genome Browser. We wanted to alert you that your
 public track hub at the address:"""
 
 
 endEmail="""
 hubCheck is a command-line utility that checks files in the hub are correctly formatted. If you
 would like to run the hubCheck utility on your own machine, you can download the tool from the
 utilities directory: 
 https://hgdownload.soe.ucsc.edu/downloads.html#utilities_downloads
 
 Please update your public track hub. If you have any questions, please let us know, and we will be
 happy to assist. Do not hesitate to let us know if we can help you resolve this situation, e.g. by
 updating the URL where the hub is hosted or possibly hosting the files on our servers.
 
 You can reach us at genome-www@soe.ucsc.edu.
 
 Thank you for your interest and contributions,
 The UCSC Genome Browser Group
 """
 
 # Gets the total lines number from the hubCheck output 
 totalLines=bash("wc -l /hive/users/qateam/hubCheckCronArchive/"+datetime.now().strftime("%Y-%m")+"/filtered_output.txt | tr ' ' '\t' | cut -f1")
 with open("/hive/users/qateam/hubCheckCronArchive/"+datetime.now().strftime("%Y-%m")+"/draftedMessages.txt", 'a') as f:
      f.write('##########################\n')
-     # For loop that gets each hub.txt that has the error of missing description page
-     for item in checkDuplicates(descPageMis):
+     # For loop that gets each hub.txt that has the error of couldn't open (more actionable, drafted first)
+     for item in checkDuplicates(couldNotOpen):
              f.write('Send email to: '+getEmail(stringTerm(item))+'\n')
              f.write(emailIntro % item)
-             # For loop that gets hubCheck output for each hub.txt that has the error of missing description page
+             # For loop that gets hubCheck output for each hub.txt that has the error of couldn't open
              for line in (bash("cat  /hive/users/qateam/hubCheckCronArchive/"+datetime.now().strftime("%Y-%m")+"/filtered_output.txt | grep -A "+stringTerm(totalLines)+" "+stringTerm(item))):
                 if '#' not in line:
                    f.write(line+'\n')
                 else: break
              f.write("\nWhen running hubCheck "+stringTerm(item)+'\n')
              f.write(endEmail)
              f.write('##########################\n')
-     # For loop that gets each hub.txt that has the error of couldn't open 
-     for item in checkDuplicates(couldNotOpen):
+     # For loop that gets each hub.txt that has the error of missing description page
+     # Skip hubs already drafted in the couldNotOpen loop above (same draft body is produced for both)
+     for item in checkDuplicates(descPageMis):
+             if item in couldNotOpen:
+                 continue
              f.write('Send email to: '+getEmail(stringTerm(item))+'\n')
              f.write(emailIntro % item)
-             # For loop that gets hubCheck output for each hub.txt that has the error of couldn't open
+             # For loop that gets hubCheck output for each hub.txt that has the error of missing description page
              for line in (bash("cat /hive/users/qateam/hubCheckCronArchive/"+datetime.now().strftime("%Y-%m")+"/filtered_output.txt | grep -A "+stringTerm(totalLines)+" "+stringTerm(item))):
                 if '#' not in line:
                    f.write(line+'\n')
                 else: break
              f.write("\nWhen running hubCheck "+stringTerm(item)+'\n')
              f.write(endEmail)
              f.write('##########################\n')
 
 # Copy this month's draftedMessages.txt into the hubCheckCron test-results
 # directory (presumably what serves the URL printed below; confirm), then
 # print where the drafts and the monthly raw archive can be found.
 bash("cp /hive/users/qateam/hubCheckCronArchive/"+datetime.now().strftime("%Y-%m")+"/draftedMessages.txt /usr/local/apache/htdocs-genecats/qa/test-results/hubCheckCron/"+datetime.now().strftime("%Y-%m"))
 print("Check https://genecats.gi.ucsc.edu/qa/test-results/hubCheckCron/"+datetime.now().strftime("%Y-%m")+"/draftedMessages.txt to email hub authors about missing/broken public hub files")
 print("Archive of monthly raw data can be found here: /hive/users/qateam/hubCheckCronArchive/"+datetime.now().strftime("%Y-%m"))