46ad8ee25abcc7c81cd3005972b9069f2f14b5c5
gperez2
  Thu Apr 23 12:09:16 2026 -0700
Updating hubPublicMail and hubPublicAutoUpdate crons to correctly detect
and contact authors of broken public hubs. hubPublicMail now uses curl
instead of python-requests (handles Cloudflare bot-blocks and 4xx/5xx
responses that requests silently treats as successful), falls back to
hubPublic.email as a secondary source so newly-added broken hubs still
accumulate failCount, and strips the mailto: prefix in parseEmail.
hubPublicAutoUpdate now escapes double quotes on email values. No RM

diff --git src/utils/qa/hubPublicMail src/utils/qa/hubPublicMail
index 22dcdef6310..2a610dca5a2 100755
--- src/utils/qa/hubPublicMail
+++ src/utils/qa/hubPublicMail
@@ -1,21 +1,21 @@
 #!/usr/bin/env python3
 
 import logging, sys, optparse, os, requests, sys, urllib3, atexit, urllib
 from os.path import isfile
 from email.mime.text import MIMEText
-from subprocess import Popen, PIPE
+from subprocess import Popen, PIPE, run
 
 # makes sure that only one instance of the program runs at the same time
 lockFname = None
 
 # switch off insecure SSL warnings
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
 fromEmail="qateam@gi.ucsc.edu"
 
 emailTemplate = """Dear UCSC Public Hub author,
 
 This is an automated email sent by the UCSC Genome Browser Group's hubPublicMail system to alert you
 that your public track hub at the address:
 
 %s
@@ -61,56 +61,94 @@
         logging.basicConfig(level=logging.INFO)
         logging.getLogger().setLevel(logging.INFO)
 
     return args, options
 # ----------- main --------------
 def getHubUrls(centralName):
     " return list of huburls "
     logging.debug("Getting hubUrls from db %s" % centralName)
     if centralName == 'hgcentral':
         cmd = "hgsql -h genome-centdb %s -se 'select hubUrl from hubPublic'" % centralName
     else:
         cmd = "hgsql %s -se 'select hubUrl from hubPublic'" % centralName
     lines = os.popen(cmd).read().splitlines() # please do not suggest using subprocess in code review. Thx.
     return lines
 
+def getHubPublicEmails(centralName):
+    " return dict url -> parsed email from the hubPublic.email column "
+    logging.debug("Getting hubPublic emails from db %s" % centralName)
+    if centralName == 'hgcentral':
+        cmd = "hgsql -h genome-centdb %s -Nse 'select hubUrl, email from hubPublic'" % centralName
+    else:
+        cmd = "hgsql %s -Nse 'select hubUrl, email from hubPublic'" % centralName
+    emails = {}
+    for line in os.popen(cmd).read().splitlines():
+        parts = line.split("\t", 1)
+        if len(parts) != 2:
+            continue
+        url, raw = parts
+        raw = raw.strip()
+        if not raw or raw == "NULL":
+            continue
+        # parseEmail was written for live hub.txt values and can raise on weird
+        # table-stored shapes (e.g. <a HREF=mailto:...> without quotes). Skip
+        # those rows; the live-scrape path or the stored status-tab value
+        # will still cover them downstream. Logged at debug level so the hourly
+        # cron email doesn't get spammed with the same known row every run.
+        try:
+            emails[url] = parseEmail(raw)
+        except Exception as e:
+            logging.debug("Could not parse hubPublic.email for %s: %r (%s)" % (url, raw, e))
+    return emails
+
 def parseEmail(s):
     " return only email address given email value of hub.txt. handles these weird cases "
     # jferna10@ucsc.edu or max@soe.ucsc.edu
     # yzz2 at psu.edu
     # <a HREF="mailto:dunham@ebi.ac.uk"TARGET=_BLANK>Ian Dunham</a>
     if "<" in s:
-        return s.split('"')[1]
+        # s.split('"')[1] yields "mailto:dunham@ebi.ac.uk" from the quoted <a> form;
+        # strip the mailto: prefix so sendmail gets a real address, not a URI.
+        val = s.split('"')[1]
+        if val.startswith("mailto:"):
+            val = val[len("mailto:"):]
+        return val
     if " at " in s:
         return s.replace(" at ", "@")
     if " or " in s:
         return s.split(" or ")[1]
     return s
 
 def downloadUrls(urls, emails):
     " try to read all hub.txt in URLs, return list of failed URLs, and dict hub -> email"
     didFail = list()
     emails = dict()
 
     for url in urls:
         logging.debug("Checking %s" % url)
 
         reqFailed = False
         if url.startswith("http"):
             try:
-                f = requests.get(url, verify=False, timeout=5)
-                text = f.text
+                # shelling out to curl matches hubPublicAutoUpdate's approach and
+                # gets past Cloudflare bot-blocks that reject python-requests even
+                # with a browser-like User-Agent (TLS fingerprint differs).
+                # --fail makes curl exit non-zero on 4xx/5xx so errors land in failedUrls.
+                r = run(["curl", "--user-agent", "genome.ucsc.edu/net.c",
+                         "-skL", "--fail", "--connect-timeout", "10", "--max-time", "15", url],
+                        check=True, stdout=PIPE, universal_newlines=True, timeout=20)
+                text = r.stdout
             except KeyboardInterrupt: # handle ctrl-c for debugging
                 sys.exit(1)
             except:
                 reqFailed = True
         else:
             # FTP
             try:
                 f = urllib.request.urlopen(url, timeout=10)
                 text = f.read().decode("utf8")
             except:
                 reqFailed = True
 
         if reqFailed:
             logging.debug("URL %s failed." % url)
             didFail.append(url)
@@ -153,46 +191,49 @@
     if not isfile(fname):
         return hubs
 
     logging.debug("Reading %s" % fname)
     for line in open(fname):
         if line.startswith("#"):
             continue
         row = line.rstrip("\n").split("\t")
         if len(row)!=3:
             logging.error("Cannot parse line in status file: %s" % repr(line))
             assert(False)
         hubUrl, email, failCount = row
         hubs[hubUrl] = (email, int(failCount))
     return hubs
 
-def mergeInfo(urls, oldUrlInfo, failedUrls, urlEmails):
-    """ given a list hubPublic URLs, a list of failed URLs and a dict with url-> email,
-    return a dict with URL -> (email, failedCount) """
+def mergeInfo(urls, oldUrlInfo, failedUrls, urlEmails, hubPublicEmails):
+    """ given a list hubPublic URLs, a list of failed URLs, a dict url->email read
+    from the live hub.txt and a dict url->hubPublic.email, return a dict with
+    URL -> (email, failedCount) """
     urlInfo = {}
     for url in urls:
         oldInfo = oldUrlInfo.get(url)
         if oldInfo is None:
-            oldEmail = urlEmails.get(url)
+            # new hub: prefer the email read from the live hub.txt, fall back to
+            # hubPublic.email so newly-added hubs that are already unreachable still
+            # get tracked and eventually mailed
+            oldEmail = urlEmails.get(url) or hubPublicEmails.get(url)
             if oldEmail is None:
-                print("URL %s is broken and there is no email in the status file. Skipping it." % url)
+                print("URL %s is broken and there is no email in the status file or hubPublic table. Skipping it." % url)
                 continue
             oldInfo = [oldEmail, 0]
 
-        email = urlEmails.get(url) # prefer most current email address
-        if email is None:
-            email = oldInfo[0]
+        # prefer most current email address: live hub.txt > hubPublic.email > stored
+        email = urlEmails.get(url) or hubPublicEmails.get(url) or oldInfo[0]
 
         failCount = oldInfo[1]
 
         if url in failedUrls:
             failCount += 1
         else:
             failCount = 0
 
         urlInfo[url] = (email, failCount)
 
     return urlInfo
 
 def sendEmails(urlInfo):
     " given dict url -> (email, failCount), send email if failCount > 24 and set failCount = -48 "
     for url, (destEmail, failCount) in urlInfo.items():
@@ -225,39 +266,40 @@
         logging.error("lockfile %s already exists. Check if this program is already running." % lockFname)
         sys.exit(1)
 
     open(lockFname, "w") # create file
     atexit.register(removeLock)
 
 def removeLock():
     if isfile(lockFname):
         os.remove(lockFname)
 
 def hubPublicMail(centralName, statusFname):
     " send email if a hub fails more than 24 times "
     createLockFile(statusFname)
 
     urls = getHubUrls(centralName)
+    hubPublicEmails = getHubPublicEmails(centralName)
 
     oldUrlInfo = readStatus(statusFname)
 
     failedUrls, urlEmails = downloadUrls(urls, oldUrlInfo)
 
     if len(failedUrls) > 10:
         logging.error(
             "%d broken hubs, which is more than the 10 allowed. Something is weird. Please check the network setup." % len(failedUrls))
         sys.exit(1)
 
-    newUrlInfo = mergeInfo(urls, oldUrlInfo, failedUrls, urlEmails)
+    newUrlInfo = mergeInfo(urls, oldUrlInfo, failedUrls, urlEmails, hubPublicEmails)
     newUrlInfo = sendEmails(newUrlInfo)
 
     writeStatus(newUrlInfo, statusFname)
     removeLock()
 
 def main():
     args, options = parseArgs()
 
     centralName, statusFname = args
     hubPublicMail(centralName, statusFname)
 
 
 main()