46ad8ee25abcc7c81cd3005972b9069f2f14b5c5 gperez2 Thu Apr 23 12:09:16 2026 -0700 Updating hubPublicMail and hubPublicAutoUpdate crons to correctly detect and contact authors of broken public hubs. hubPublicMail now uses curl instead of python-requests (handles Cloudflare bot-blocks and 4xx/5xx responses that requests silently treats as successful), falls back to hubPublic.email as a secondary source so newly-added broken hubs still accumulate failCount, and strips the mailto: prefix in parseEmail. hubPublicAutoUpdate now escapes double quotes on email values. No RM diff --git src/utils/qa/hubPublicMail src/utils/qa/hubPublicMail index 22dcdef6310..2a610dca5a2 100755 --- src/utils/qa/hubPublicMail +++ src/utils/qa/hubPublicMail @@ -1,21 +1,21 @@ #!/usr/bin/env python3 import logging, sys, optparse, os, requests, sys, urllib3, atexit, urllib from os.path import isfile from email.mime.text import MIMEText -from subprocess import Popen, PIPE +from subprocess import Popen, PIPE, run # makes sure that only one instance of the program runs at the same time lockFname = None # switch off insecure SSL warnings urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) fromEmail="qateam@gi.ucsc.edu" emailTemplate = """Dear UCSC Public Hub author, This is an automated email sent by the UCSC Genome Browser Group's hubPublicMail system to alert you that your public track hub at the address: %s @@ -61,56 +61,94 @@ logging.basicConfig(level=logging.INFO) logging.getLogger().setLevel(logging.INFO) return args, options # ----------- main -------------- def getHubUrls(centralName): " return list of huburls " logging.debug("Getting hubUrls from db %s" % centralName) if centralName == 'hgcentral': cmd = "hgsql -h genome-centdb %s -se 'select hubUrl from hubPublic'" % centralName else: cmd = "hgsql %s -se 'select hubUrl from hubPublic'" % centralName lines = os.popen(cmd).read().splitlines() # please do not suggest using subprocess in code review. Thx. return lines +def getHubPublicEmails(centralName): + " return dict url -> parsed email from the hubPublic.email column " + logging.debug("Getting hubPublic emails from db %s" % centralName) + if centralName == 'hgcentral': + cmd = "hgsql -h genome-centdb %s -Nse 'select hubUrl, email from hubPublic'" % centralName + else: + cmd = "hgsql %s -Nse 'select hubUrl, email from hubPublic'" % centralName + emails = {} + for line in os.popen(cmd).read().splitlines(): + parts = line.split("\t", 1) + if len(parts) != 2: + continue + url, raw = parts + raw = raw.strip() + if not raw or raw == "NULL": + continue + # parseEmail was written for live hub.txt values and can raise on weird + # table-stored shapes (e.g. without quotes). Skip + # those rows; the live-scrape path or the stored status-tab value + # will still cover them downstream. logged at debug level so the hourly + # cron email doesn't get spammed with the same known row every run. + try: + emails[url] = parseEmail(raw) + except Exception as e: + logging.debug("Could not parse hubPublic.email for %s: %r (%s)" % (url, raw, e)) + return emails + def parseEmail(s): " return only email address given email value of hub.txt. handles these weird cases " # jferna10@ucsc.edu or max@soe.ucsc.edu # yzz2 at psu.edu # Ian Dunham if "<" in s: - return s.split('"')[1] + # s.split('"')[1] yields "mailto:dunham@ebi.ac.uk" from the quoted form; + # strip the mailto: prefix so sendmail gets a real address, not a URI. + val = s.split('"')[1] + if val.startswith("mailto:"): + val = val[len("mailto:"):] + return val if " at " in s: return s.replace(" at ", "@") if " or " in s: return s.split(" or ")[1] return s def downloadUrls(urls, emails): " try to read all hub.txt in URLs, return list of failed URLs, and dict hub -> email" didFail = list() emails = dict() for url in urls: logging.debug("Checking %s" % url) reqFailed = False if url.startswith("http"): try: - f = requests.get(url, verify=False, timeout=5) - text = f.text + # shelling out to curl matches hubPublicAutoUpdate's approach and + # gets past Cloudflare bot-blocks that reject python-requests even + # with a browser-like User-Agent (TLS fingerprint differs). + # --fail makes curl exit non-zero on 4xx/5xx so errors land in failedUrls. + r = run(["curl", "--user-agent", "genome.ucsc.edu/net.c", + "-skL", "--fail", "--connect-timeout", "10", "--max-time", "15", url], + check=True, stdout=PIPE, universal_newlines=True, timeout=20) + text = r.stdout except KeyboardInterrupt: # handle ctrl-c for debugging sys.exit(1) except: reqFailed = True else: # FTP try: f = urllib.request.urlopen(url, timeout=10) text = f.read().decode("utf8") except: reqFailed = True if reqFailed: logging.debug("URL %s failed." % url) didFail.append(url) @@ -153,46 +191,49 @@ if not isfile(fname): return hubs logging.debug("Reading %s" % fname) for line in open(fname): if line.startswith("#"): continue row = line.rstrip("\n").split("\t") if len(row)!=3: logging.error("Cannot parse line in status file: %s" % repr(line)) assert(False) hubUrl, email, failCount = row hubs[hubUrl] = (email, int(failCount)) return hubs -def mergeInfo(urls, oldUrlInfo, failedUrls, urlEmails): - """ given a list hubPublic URLs, a list of failed URLs and a dict with url-> email, - return a dict with URL -> (email, failedCount) """ +def mergeInfo(urls, oldUrlInfo, failedUrls, urlEmails, hubPublicEmails): + """ given a list hubPublic URLs, a list of failed URLs, a dict url->email read + from the live hub.txt and a dict url->hubPublic.email, return a dict with + URL -> (email, failedCount) """ urlInfo = {} for url in urls: oldInfo = oldUrlInfo.get(url) if oldInfo is None: - oldEmail = urlEmails.get(url) + # new hub: prefer the email read from the live hub.txt, fall back to + # hubPublic.email so newly-added hubs that are already unreachable still + # get tracked and eventually mailed + oldEmail = urlEmails.get(url) or hubPublicEmails.get(url) if oldEmail is None: - print("URL %s is broken and there is no email in the status file. Skipping it." % url) + print("URL %s is broken and there is no email in the status file or hubPublic table. Skipping it." % url) continue oldInfo = [oldEmail, 0] - email = urlEmails.get(url) # prefer most current email address - if email is None: - email = oldInfo[0] + # prefer most current email address: live hub.txt > hubPublic.email > stored + email = urlEmails.get(url) or hubPublicEmails.get(url) or oldInfo[0] failCount = oldInfo[1] if url in failedUrls: failCount += 1 else: failCount = 0 urlInfo[url] = (email, failCount) return urlInfo def sendEmails(urlInfo): " given dict url -> (email, failCount), send email if failCount > 24 and set failCount = -48 " for url, (destEmail, failCount) in urlInfo.items(): @@ -225,39 +266,40 @@ logging.error("lockfile %s already exists. Check if this program is already running." % lockFname) sys.exit(1) open(lockFname, "w") # create file atexit.register(removeLock) def removeLock(): if isfile(lockFname): os.remove(lockFname) def hubPublicMail(centralName, statusFname): " send email if a hub fails more than 24 times " createLockFile(statusFname) urls = getHubUrls(centralName) + hubPublicEmails = getHubPublicEmails(centralName) oldUrlInfo = readStatus(statusFname) failedUrls, urlEmails = downloadUrls(urls, oldUrlInfo) if len(failedUrls) > 10: logging.error( "%d broken hubs, which is more than the 10 allowed. Something is weird. Please check the network setup." % len(failedUrls)) sys.exit(1) - newUrlInfo = mergeInfo(urls, oldUrlInfo, failedUrls, urlEmails) + newUrlInfo = mergeInfo(urls, oldUrlInfo, failedUrls, urlEmails, hubPublicEmails) newUrlInfo = sendEmails(newUrlInfo) writeStatus(newUrlInfo, statusFname) removeLock() def main(): args, options = parseArgs() centralName, statusFname = args hubPublicMail(centralName, statusFname) main()