46ad8ee25abcc7c81cd3005972b9069f2f14b5c5
gperez2
Thu Apr 23 12:09:16 2026 -0700
Updating hubPublicMail and hubPublicAutoUpdate crons to correctly detect
and contact authors of broken public hubs. hubPublicMail now uses curl
instead of python-requests (handles Cloudflare bot-blocks and 4xx/5xx
responses that requests silently treats as successful), falls back to
hubPublic.email as a secondary source so newly-added broken hubs still
accumulate failCount, and strips the mailto: prefix in parseEmail.
hubPublicAutoUpdate now escapes double quotes on email values. No RM
diff --git src/utils/qa/hubPublicMail src/utils/qa/hubPublicMail
index 22dcdef6310..2a610dca5a2 100755
--- src/utils/qa/hubPublicMail
+++ src/utils/qa/hubPublicMail
@@ -1,21 +1,21 @@
#!/usr/bin/env python3
import logging, sys, optparse, os, requests, sys, urllib3, atexit, urllib
from os.path import isfile
from email.mime.text import MIMEText
-from subprocess import Popen, PIPE
+from subprocess import Popen, PIPE, run
# makes sure that only one instance of the program runs at the same time
lockFname = None
# switch off insecure SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
fromEmail="qateam@gi.ucsc.edu"
emailTemplate = """Dear UCSC Public Hub author,
This is an automated email sent by the UCSC Genome Browser Group's hubPublicMail system to alert you
that your public track hub at the address:
%s
@@ -61,56 +61,94 @@
logging.basicConfig(level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)
return args, options
# ----------- main --------------
def getHubUrls(centralName):
" return list of huburls "
logging.debug("Getting hubUrls from db %s" % centralName)
if centralName == 'hgcentral':
cmd = "hgsql -h genome-centdb %s -se 'select hubUrl from hubPublic'" % centralName
else:
cmd = "hgsql %s -se 'select hubUrl from hubPublic'" % centralName
lines = os.popen(cmd).read().splitlines() # please do not suggest using subprocess in code review. Thx.
return lines
+def getHubPublicEmails(centralName):
+ " return dict url -> parsed email from the hubPublic.email column "
+ logging.debug("Getting hubPublic emails from db %s" % centralName)
+ if centralName == 'hgcentral':
+ cmd = "hgsql -h genome-centdb %s -Nse 'select hubUrl, email from hubPublic'" % centralName
+ else:
+ cmd = "hgsql %s -Nse 'select hubUrl, email from hubPublic'" % centralName
+ emails = {}
+ for line in os.popen(cmd).read().splitlines():
+ parts = line.split("\t", 1)
+ if len(parts) != 2:
+ continue
+ url, raw = parts
+ raw = raw.strip()
+ if not raw or raw == "NULL":
+ continue
+ # parseEmail was written for live hub.txt values and can raise on weird
+ # table-stored shapes (e.g. without quotes). Skip
+ # those rows; the live-scrape path or the stored status-tab value
+        # will still cover them downstream. Logged at debug level so the hourly
+ # cron email doesn't get spammed with the same known row every run.
+ try:
+ emails[url] = parseEmail(raw)
+ except Exception as e:
+ logging.debug("Could not parse hubPublic.email for %s: %r (%s)" % (url, raw, e))
+ return emails
+
def parseEmail(s):
" return only email address given email value of hub.txt. handles these weird cases "
# jferna10@ucsc.edu or max@soe.ucsc.edu
# yzz2 at psu.edu
# Ian Dunham
if "<" in s:
- return s.split('"')[1]
+ # s.split('"')[1] yields "mailto:dunham@ebi.ac.uk" from the quoted form;
+ # strip the mailto: prefix so sendmail gets a real address, not a URI.
+ val = s.split('"')[1]
+ if val.startswith("mailto:"):
+ val = val[len("mailto:"):]
+ return val
if " at " in s:
return s.replace(" at ", "@")
if " or " in s:
return s.split(" or ")[1]
return s
def downloadUrls(urls, emails):
" try to read all hub.txt in URLs, return list of failed URLs, and dict hub -> email"
didFail = list()
emails = dict()
for url in urls:
logging.debug("Checking %s" % url)
reqFailed = False
if url.startswith("http"):
try:
- f = requests.get(url, verify=False, timeout=5)
- text = f.text
+ # shelling out to curl matches hubPublicAutoUpdate's approach and
+ # gets past Cloudflare bot-blocks that reject python-requests even
+ # with a browser-like User-Agent (TLS fingerprint differs).
+ # --fail makes curl exit non-zero on 4xx/5xx so errors land in failedUrls.
+ r = run(["curl", "--user-agent", "genome.ucsc.edu/net.c",
+ "-skL", "--fail", "--connect-timeout", "10", "--max-time", "15", url],
+ check=True, stdout=PIPE, universal_newlines=True, timeout=20)
+ text = r.stdout
except KeyboardInterrupt: # handle ctrl-c for debugging
sys.exit(1)
except:
reqFailed = True
else:
# FTP
try:
f = urllib.request.urlopen(url, timeout=10)
text = f.read().decode("utf8")
except:
reqFailed = True
if reqFailed:
logging.debug("URL %s failed." % url)
didFail.append(url)
@@ -153,46 +191,49 @@
if not isfile(fname):
return hubs
logging.debug("Reading %s" % fname)
for line in open(fname):
if line.startswith("#"):
continue
row = line.rstrip("\n").split("\t")
if len(row)!=3:
logging.error("Cannot parse line in status file: %s" % repr(line))
assert(False)
hubUrl, email, failCount = row
hubs[hubUrl] = (email, int(failCount))
return hubs
-def mergeInfo(urls, oldUrlInfo, failedUrls, urlEmails):
- """ given a list hubPublic URLs, a list of failed URLs and a dict with url-> email,
- return a dict with URL -> (email, failedCount) """
+def mergeInfo(urls, oldUrlInfo, failedUrls, urlEmails, hubPublicEmails):
+ """ given a list hubPublic URLs, a list of failed URLs, a dict url->email read
+ from the live hub.txt and a dict url->hubPublic.email, return a dict with
+ URL -> (email, failedCount) """
urlInfo = {}
for url in urls:
oldInfo = oldUrlInfo.get(url)
if oldInfo is None:
- oldEmail = urlEmails.get(url)
+ # new hub: prefer the email read from the live hub.txt, fall back to
+ # hubPublic.email so newly-added hubs that are already unreachable still
+ # get tracked and eventually mailed
+ oldEmail = urlEmails.get(url) or hubPublicEmails.get(url)
if oldEmail is None:
- print("URL %s is broken and there is no email in the status file. Skipping it." % url)
+ print("URL %s is broken and there is no email in the status file or hubPublic table. Skipping it." % url)
continue
oldInfo = [oldEmail, 0]
- email = urlEmails.get(url) # prefer most current email address
- if email is None:
- email = oldInfo[0]
+ # prefer most current email address: live hub.txt > hubPublic.email > stored
+ email = urlEmails.get(url) or hubPublicEmails.get(url) or oldInfo[0]
failCount = oldInfo[1]
if url in failedUrls:
failCount += 1
else:
failCount = 0
urlInfo[url] = (email, failCount)
return urlInfo
def sendEmails(urlInfo):
" given dict url -> (email, failCount), send email if failCount > 24 and set failCount = -48 "
for url, (destEmail, failCount) in urlInfo.items():
@@ -225,39 +266,40 @@
logging.error("lockfile %s already exists. Check if this program is already running." % lockFname)
sys.exit(1)
open(lockFname, "w") # create file
atexit.register(removeLock)
def removeLock():
if isfile(lockFname):
os.remove(lockFname)
def hubPublicMail(centralName, statusFname):
" send email if a hub fails more than 24 times "
createLockFile(statusFname)
urls = getHubUrls(centralName)
+ hubPublicEmails = getHubPublicEmails(centralName)
oldUrlInfo = readStatus(statusFname)
failedUrls, urlEmails = downloadUrls(urls, oldUrlInfo)
if len(failedUrls) > 10:
logging.error(
"%d broken hubs, which is more than the 10 allowed. Something is weird. Please check the network setup." % len(failedUrls))
sys.exit(1)
- newUrlInfo = mergeInfo(urls, oldUrlInfo, failedUrls, urlEmails)
+ newUrlInfo = mergeInfo(urls, oldUrlInfo, failedUrls, urlEmails, hubPublicEmails)
newUrlInfo = sendEmails(newUrlInfo)
writeStatus(newUrlInfo, statusFname)
removeLock()
def main():
args, options = parseArgs()
centralName, statusFname = args
hubPublicMail(centralName, statusFname)
main()