#!/usr/bin/env python3
"""hubPublicMail - send email to the author of a public hub that stays unreachable.

Every run reads the list of public hubs from the hubPublic table, tries to
download each hub.txt and keeps a per-hub counter of consecutive failures in a
tab-separated status file. Once a hub has failed more than failLimit runs in a
row, its author is mailed and the counter is set to failReset.
"""
# Post-commit state of src/utils/qa/hubPublicMail from commit
# 46ad8ee25abcc7c81cd3005972b9069f2f14b5c5 (gperez2, Thu Apr 23 12:09:16 2026 -0700):
# hub.txt is fetched with curl instead of python-requests (handles Cloudflare
# bot-blocks and 4xx/5xx responses), hubPublic.email is a secondary email source
# so newly-added broken hubs still accumulate failCount, and parseEmail strips
# the mailto: prefix.

import atexit
import logging
import optparse
import os
import re
import sys
import urllib
import urllib.request   # the FTP branch needs the submodule; "import urllib" alone does not load it
from email.mime.text import MIMEText
from os.path import isfile
from subprocess import PIPE, Popen, run

try:
    # requests/urllib3 are no longer used for fetching (curl is). The imports are
    # kept, but optional, so the cron job does not depend on them being installed.
    import requests  # noqa: F401
    import urllib3
    # switch off insecure SSL warnings
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
except ImportError:
    pass

# makes sure that only one instance of the program runs at the same time
lockFname = None

fromEmail = "qateam@gi.ucsc.edu"

# a hub is mailed once its failCount is larger than failLimit ...
failLimit = 24
# ... and its failCount is then set to failReset, which delays the next mail
failReset = -48
# more broken hubs than this in one run is treated as a local network problem
maxBrokenHubs = 10

# NOTE(review): the line breaks inside this template were reconstructed, confirm
# them against the repository copy.
emailTemplate = """Dear UCSC Public Hub author,

This is an automated email sent by the UCSC Genome Browser Group's hubPublicMail system to alert you
that your public track hub at the address:

%s

Has been inaccessible for at least 48 consecutive hours. If it continues to be offline, we may have to remove it
from our public hubs list at https://genome.ucsc.edu/cgi-bin/hgHubConnect

Do not hesitate to let us know if we can help you resolve this situation, e.g. by updating the URL
where the hub is hosted or possibly hosting the files on our servers.

You can reach us at genome-www@soe.ucsc.edu.

Thank you for your interest and contributions,
The UCSC Genome Browser Group
"""

# address inside a mailto: URI, quoted or not; stops before any ?subject=... part
_mailtoRe = re.compile(r"mailto:\s*([^\s\"'<>?]+)", re.IGNORECASE)
# address in the "Some Name <user@host>" form
_bracketRe = re.compile(r"<\s*([^<>\s\"']+@[^<>\s\"']+)\s*>")

# ==== functions =====

def parseArgs():
    """ setup logging, parse command line arguments and options.
    -h shows auto-generated help page.
    Returns (args, options); args always holds exactly [hgcentralname, statusFile]. """
    parser = optparse.OptionParser("""usage: %prog [options] hgcentralname statusFile - send email if public hub is down

    Goes through the following steps:
    1) Get the list of all public hubs
    2) Try to get their URLs and write all to statusFile.
    3) If a hub fails, increase count in statusFile.
    4) If count is > 24, send an email to hub email and set the failCount to -48.

    Example:
       hubPublicMail hgcentraltest /tmp/hubPublicStatus.tab
    """)

    parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
    (options, args) = parser.parse_args()

    # main() unpacks exactly two positional arguments, so anything else gets
    # the help page instead of an unpacking traceback
    if len(args) != 2:
        parser.print_help()
        sys.exit(1)

    level = logging.DEBUG if options.debug else logging.INFO
    logging.basicConfig(level=level)
    logging.getLogger().setLevel(level)
    return args, options

# ----------- main --------------

def _hgsqlLines(centralName, sqlOpts, query):
    " run query through hgsql on centralName with the given hgsql options, return the output lines "
    # the production database lives on its own host
    host = "-h genome-centdb " if centralName == 'hgcentral' else ""
    cmd = "hgsql %s%s %s '%s'" % (host, centralName, sqlOpts, query)
    # os.popen is used here on purpose (see the original author's note: no subprocess)
    with os.popen(cmd) as pipe:
        return pipe.read().splitlines()

def getHubUrls(centralName):
    " return list of huburls "
    logging.debug("Getting hubUrls from db %s" % centralName)
    return _hgsqlLines(centralName, "-se", "select hubUrl from hubPublic")

def getHubPublicEmails(centralName):
    " return dict url -> parsed email from the hubPublic.email column "
    logging.debug("Getting hubPublic emails from db %s" % centralName)
    emails = {}
    for line in _hgsqlLines(centralName, "-Nse", "select hubUrl, email from hubPublic"):
        parts = line.split("\t", 1)
        if len(parts) != 2:
            continue
        url, raw = parts
        raw = raw.strip()
        if not raw or raw == "NULL":
            continue
        # Rows that cannot be parsed are skipped; the live-scrape path or the
        # stored status-tab value will still cover them downstream. Logged at
        # debug level so the hourly cron email doesn't get spammed with the
        # same known row every run.
        try:
            emails[url] = parseEmail(raw)
        except (ValueError, IndexError) as e:
            logging.debug("Could not parse hubPublic.email for %s: %r (%s)" % (url, raw, e))
    return emails

def parseEmail(s):
    """ return only email address given email value of hub.txt. handles these weird cases:
        jferna10@ucsc.edu or max@soe.ucsc.edu   -> the address after " or "
        yzz2 at psu.edu                         -> yzz2@psu.edu
        <a HREF="mailto:dunham@ebi.ac.uk"TARGET=_BLANK>Ian Dunham</a>  -> dunham@ebi.ac.uk
    The mailto: form also works without quotes, and "Name <user@host>" yields user@host.
    Raises ValueError for a value that contains "<" but no recognizable address. """
    if "<" in s:
        # sendmail needs a real address, not a mailto: URI
        match = _mailtoRe.search(s)
        if match:
            return match.group(1)
        match = _bracketRe.search(s)
        if match:
            return match.group(1)
        # markup without mailto: keep the historic "first quoted value" rule
        quoted = s.split('"')
        if len(quoted) > 1:
            return quoted[1]
        raise ValueError("no email address found in %r" % s)
    if " at " in s:
        return s.replace(" at ", "@")
    if " or " in s:
        return s.split(" or ")[1]
    return s

def _fetchHubTxt(url):
    " return the text of the hub.txt at url, raises an exception on any download problem "
    if url.startswith("http"):
        # shelling out to curl matches hubPublicAutoUpdate's approach and
        # gets past Cloudflare bot-blocks that reject python-requests even
        # with a browser-like User-Agent (TLS fingerprint differs).
        # --fail makes curl exit non-zero on 4xx/5xx so errors count as failures.
        r = run(["curl", "--user-agent", "genome.ucsc.edu/net.c",
            "-skL", "--fail", "--connect-timeout", "10", "--max-time", "15", url],
            check=True, stdout=PIPE, timeout=20)
        data = r.stdout
    else:
        # FTP
        with urllib.request.urlopen(url, timeout=10) as f:
            data = f.read()
    # decode leniently: a stray non-UTF8 byte in a reachable hub.txt must not
    # make the hub look like it is down
    return data.decode("utf8", errors="replace")

def downloadUrls(urls, emails):
    """ try to read all hub.txt in URLs, return list of failed URLs, and dict hub -> email.
    The emails argument is not used, it is only kept so existing callers keep working. """
    didFail = list()
    hubEmails = dict()

    for url in urls:
        logging.debug("Checking %s" % url)
        try:
            text = _fetchHubTxt(url)
        except KeyboardInterrupt: # handle ctrl-c for debugging
            sys.exit(1)
        except Exception:
            logging.debug("URL %s failed." % url)
            didFail.append(url)
            continue

        for l in text.splitlines():
            l = l.strip()
            # skip blank lines and comments; a one-word "#comment" line is not a broken setting
            # NOTE(review): assumes hub.txt allows "#" comment lines, confirm against the hub docs
            if l == "" or l.startswith("#"):
                continue

            keyVal = l.split(None, maxsplit=1)
            if len(keyVal) != 2:
                # some hubs may have broken hub.txt files. Treat these as if they were broken.
                didFail.append(url)
                break

            key, val = keyVal
            if key != "email":
                continue
            try:
                hubEmails[url] = parseEmail(val)
            except ValueError as e:
                # one odd email value must not abort the whole run; mergeInfo
                # falls back to hubPublic.email or the stored address
                logging.debug("Could not parse email in hub.txt of %s: %r (%s)" % (url, val, e))

    return didFail, hubEmails

def sendEmail(dest, body):
    " send email with text body to dest, with a copy to the QA group "
    msg = MIMEText(body)
    msg["From"] = fromEmail
    msg["To"] = dest+","+"browserqa-group@ucsc.edu"
    msg["Subject"] = "Your UCSC public hub is down"
    p = Popen(["/usr/sbin/sendmail", "-t", "-oi"], stdin=PIPE)
    p.communicate(msg.as_bytes())
    if p.returncode != 0:
        logging.error("sendmail exited with status %d for recipient %s" % (p.returncode, dest))

def readStatus(fname):
    """ read tab sep file with columns hubUrl, email, failCount.
    Returns dict hubUrl -> (email, failCount), empty if fname does not exist.
    Raises ValueError on a line that does not have three columns. """
    hubs = dict()
    if not isfile(fname):
        return hubs

    logging.debug("Reading %s" % fname)
    with open(fname) as ifh:
        for line in ifh:
            if line.startswith("#"):
                continue
            row = line.rstrip("\n").split("\t")
            if len(row) != 3:
                logging.error("Cannot parse line in status file: %s" % repr(line))
                raise ValueError("Cannot parse line in status file: %s" % repr(line))
            hubUrl, email, failCount = row
            hubs[hubUrl] = (email, int(failCount))
    return hubs

def mergeInfo(urls, oldUrlInfo, failedUrls, urlEmails, hubPublicEmails=None):
    """ given a list hubPublic URLs, the old status dict, a list of failed URLs,
    a dict url->email read from the live hub.txt and a dict url->hubPublic.email,
    return a dict with URL -> (email, failedCount).
    hubPublicEmails is optional so four-argument callers keep working. """
    if hubPublicEmails is None:
        hubPublicEmails = {}
    failed = set(failedUrls)    # membership is tested once per URL

    urlInfo = {}
    for url in urls:
        # prefer most current email address: live hub.txt > hubPublic.email > stored
        currentEmail = urlEmails.get(url) or hubPublicEmails.get(url)

        oldInfo = oldUrlInfo.get(url)
        if oldInfo is None:
            # new hub: fall back to hubPublic.email so newly-added hubs that are
            # already unreachable still get tracked and eventually mailed
            if not currentEmail:
                print("URL %s is broken and there is no email in the status file or hubPublic table. Skipping it." % url)
                continue
            oldInfo = (currentEmail, 0)

        email = currentEmail or oldInfo[0]
        # any successful download resets the streak
        failCount = oldInfo[1] + 1 if url in failed else 0
        urlInfo[url] = (email, failCount)

    return urlInfo

def sendEmails(urlInfo):
    " given dict url -> (email, failCount), send email if failCount > 24 and set failCount = -48 "
    # iterate over a snapshot as entries are replaced inside the loop
    for url, (destEmail, failCount) in list(urlInfo.items()):
        if failCount > failLimit:
            # deliberately no log line here, it would end up in the cron email
            sendEmail(destEmail, emailTemplate % url)
            urlInfo[url] = (destEmail, failReset)
    return urlInfo

def writeStatus(urlInfo, statusFname):
    " write new status file, via a .tmp file that is renamed at the end "
    statusTmp = statusFname+".tmp"
    logging.debug("Writing %s" % statusTmp)
    with open(statusTmp, "wt") as ofh:
        ofh.write("#url\temail\tfailCount\n")
        for url, (email, failCount) in urlInfo.items():
            # a tab inside the email value would add a column and make
            # readStatus reject the file on every later run
            ofh.write("\t".join([url, email.replace("\t", " "), str(failCount)]))
            ofh.write("\n")
    logging.debug("Renaming %s to %s" % (statusTmp, statusFname))
    os.rename(statusTmp, statusFname)

def createLockFile(statusFname):
    """ when downloading files, weird things can happen. even wget sometimes gets stuck.
    So make sure that this program can't run multiple times.
    Exits with status 1 if the lock file is already there. """
    global lockFname
    lockFname = statusFname+".lock"

    try:
        # O_EXCL makes "check and create" a single step, so two instances
        # starting at the same moment cannot both get the lock
        fd = os.open(lockFname, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o666)
    except FileExistsError:
        logging.error("lockfile %s already exists. Check if this program is already running." % lockFname)
        sys.exit(1)
    os.close(fd)
    # registered only after the lock is ours, never remove another instance's lock
    atexit.register(removeLock)

def removeLock():
    " remove the lock file, does nothing if there is none "
    if lockFname is None:
        return
    try:
        os.remove(lockFname)
    except FileNotFoundError:
        pass

def hubPublicMail(centralName, statusFname):
    " send email if a hub fails more than 24 times "
    createLockFile(statusFname)

    urls = getHubUrls(centralName)
    hubPublicEmails = getHubPublicEmails(centralName)

    oldUrlInfo = readStatus(statusFname)

    failedUrls, urlEmails = downloadUrls(urls, oldUrlInfo)

    if len(failedUrls) > maxBrokenHubs:
        logging.error("%d broken hubs, which is more than the %d allowed. Something is weird. Please check the network setup." % (len(failedUrls), maxBrokenHubs))
        sys.exit(1)

    newUrlInfo = mergeInfo(urls, oldUrlInfo, failedUrls, urlEmails, hubPublicEmails)
    newUrlInfo = sendEmails(newUrlInfo)

    writeStatus(newUrlInfo, statusFname)
    removeLock()

def main():
    " command line entry point "
    args, options = parseArgs()
    centralName, statusFname = args
    hubPublicMail(centralName, statusFname)

if __name__ == "__main__":
    main()