46ad8ee25abcc7c81cd3005972b9069f2f14b5c5
gperez2
  Thu Apr 23 12:09:16 2026 -0700
Updating hubPublicMail and hubPublicAutoUpdate crons to correctly detect
and contact authors of broken public hubs. hubPublicMail now uses curl
instead of python-requests (handles Cloudflare bot-blocks and 4xx/5xx
responses that requests silently treats as successful), falls back to
hubPublic.email as a secondary source so newly-added broken hubs still
accumulate failCount, and strips the mailto: prefix in parseEmail.
hubPublicAutoUpdate now escapes double quotes on email values. No RM

diff --git src/utils/qa/hubPublicAutoUpdate src/utils/qa/hubPublicAutoUpdate
index 6781e56f88b..326a25c7418 100755
--- src/utils/qa/hubPublicAutoUpdate
+++ src/utils/qa/hubPublicAutoUpdate
@@ -1,195 +1,195 @@
 #!/usr/bin/env python3
 #Lou Aug 2021
 """
 Given a server (dev/hgwbeta/rr) will query hubPublic table URLs and verify that the shortLabel,
 longLabel, dbCount, dbList, and email matches what is in the hubPublic table. Any inconsistencies
 will be automatically updated. Will print out the commands as well as run them.
 
 Can optionally be run in test mode where it will only print commands without executing.
 
 Example run:
     hubPublicAutoUpdate dev
     hubPublicAutoUpdate hgwbeta -t
     hubPublicAutoUpdate rr
     
 """
 
 import subprocess,sys,argparse
 
 def parseArgs():
     """
     Parse the command line arguments.
     Returns the argparse namespace with 'server' (positional), 'testMode'
     (-t/--test) and 'silent' (-s/--silent). When the script is run with no
     arguments at all, prints the usage line and a short description, then
     exits with status 0.
     """
     parser = argparse.ArgumentParser(description = __doc__,
                                      formatter_class=argparse.RawDescriptionHelpFormatter)
     # Take argparse's default optional group off the list; it is re-appended at
     # the end so it comes after the 'required arguments' group created below.
     optional = parser._action_groups.pop()
     required = parser.add_argument_group('required arguments')
     required.add_argument ("server",
         help = "Server from which to query the hubPublic table. Can be dev, hgwbeta, or rr.")
     optional.add_argument ("-t", "--test", dest = "testMode", default = False, action = "store_true",
         help = "Optional: Run in test mode. Print out any discrepancies but do not run any " + \
             "commands.")
     optional.add_argument ("-s", "--silent", dest = "silent", default = False, action = "store_true",
         help = "Optional: Suppress output messages when hubs are not responding.")
     if len(sys.argv) == 1:
         parser.print_usage()
         print("\nGiven a server (dev/hgwbeta/rr) will query hubPublic table URLs and verify that\n" + \
               "the shortLabel, longLabel, dbCount, dbList, and email matches what is in the hubPublic table.\n" + \
               "Any inconsistencies will be automatically updated. Will print out the commands as\n" + \
               "well as run them.\n\n" + \
               "Can optionally be run in test mode where it will only print commands without executing.\n\n" + \
               "Example run:\n" + \
               "    hubPublicAutoUpdate dev\n" + \
               "    hubPublicAutoUpdate hgwbeta -t\n" + \
               "    hubPublicAutoUpdate rr\n")
         # sys.exit rather than the site-provided exit(), which is intended for the
         # interactive interpreter and is not defined when site is not loaded.
         sys.exit(0)
     parser._action_groups.append(optional)
     return parser.parse_args()
 
 def evaluateServer(serverToQuery):
     """Translate a server name into the [host option, database] pair used to call hgsql.
     The host option is '' for dev and '-h <host> ' (with trailing space) otherwise.
     Exits with an error message for anything other than 'dev', 'hgwbeta' or 'rr'.
     """
     knownServers = (
         ('dev', '', 'hgcentraltest'),
         ('hgwbeta', '-h hgwbeta ', 'hgcentralbeta'),
         ('rr', '-h genome-centdb ', 'hgcentral'),
     )
     for serverName, hostOption, database in knownServers:
         if serverToQuery == serverName:
             return [hostOption, database]
     sys.exit("Server called must be either 'dev', 'hgwbeta', or 'rr'")
 
 def buildPubHubDic(hgsqlInput):
     """Read the requested server's hubPublic table into a dict keyed by hubUrl.
     hgsqlInput is the [host option, database] pair from evaluateServer. Each value is a
     dict of the row's columns; 'email' is '' when the row has no eighth column.
     Raises subprocess.CalledProcessError if hgsql exits non-zero.
     """
     query = "/cluster/bin/x86_64/hgsql " + hgsqlInput[0] + \
             "-e 'select * from hubPublic' " + hgsqlInput[1]
     result = subprocess.run(query, check=True, shell=True,
                             stdout=subprocess.PIPE, universal_newlines=True)
     # Drop the first line (presumably the column-header row) and the last element,
     # which is the empty string after the final newline.
     rows = result.stdout.split('\n')[1:-1]
     columnNames = ('hubUrl', 'shortLabel', 'longLabel', 'registrationTime',
                    'dbCount', 'dbList', 'descriptionUrl')
     hubPublicDic = {}
     for row in rows:
         fields = row.split('\t')
         entry = {name: fields[position] for position, name in enumerate(columnNames)}
         entry['email'] = fields[7] if len(fields) > 7 else ''
         hubPublicDic[fields[0]] = entry
     return hubPublicDic
 
 def escapeDoubleQuotesOnLabels(label):
     """Return label with a backslash placed before every double and single quote"""
     quoteEscapes = {ord('"'): '\\"', ord("'"): "\\'"}
     return label.translate(quoteEscapes)
 
 def curl(url):
     """Run curl command on URL and return the response body as text - for http + ftp support.
     curl is started without a shell and the URL is passed as its own argument, so
     characters such as '&', ';' or '$' in a URL are never interpreted as shell syntax
     (genome file URLs are built from the contents of remote hub.txt files).
     Raises subprocess.CalledProcessError when curl exits non-zero.
     """
     curlCmd = ["curl", "--user-agent", "genome.ucsc.edu/net.c", "-skL", "--fail",
                "--connect-timeout", "10", url]
     rawCurlOutput = subprocess.run(curlCmd, check=True, stdout=subprocess.PIPE,
                                    universal_newlines=True)
     return rawCurlOutput.stdout
 
 def buildCurrentHubTxtDic(hub):
     """Fetch the hub.txt at URL hub and return its settings as a dict.
     Only lines before the first 'genome ' or 'track ' line are read. Each line is split
     into key and value on tabs when it contains one, otherwise at the first space.
     A descriptionUrl that is not an http/ftp URL is resolved against the directory of
     hub.txt. Raises KeyError when hub.txt has no descriptionUrl setting.
     """
     currentHub = {}
     for setting in curl(hub).splitlines():
         # Stop parsing at genome or track lines to avoid overwriting hub labels
         if setting.startswith(("genome ", "track ")):
             break
         if "\t" in setting.rstrip():
             fields = setting.split("\t")
             currentHub[fields[0]] = fields[1]
         else:
             key, _, value = setting.partition(" ")
             currentHub[key] = value
     # Strip before testing the scheme: values keep their surrounding whitespace, so an
     # absolute URL written after two spaces would otherwise be treated as relative.
     descriptionUrl = currentHub['descriptionUrl'].strip()
     if not descriptionUrl.startswith(("http", "ftp")):
         descriptionUrl = "/".join(hub.split('/')[:-1]) + "/" + descriptionUrl
     currentHub['descriptionUrl'] = descriptionUrl
     return currentHub
                     
 def queryHubTxt(currentHub,hub):
     """Fill in currentHub['dbList'] and currentHub['dbCount'] from the hub's genome lines.
     Genome lines are read from hub.txt itself when 'useOneFile' is 'on', otherwise from
     the file named by 'genomesFile' (an absolute http/ftp URL, or a path relative to the
     directory of hub.txt). Returns the same currentHub dict.
     Raises KeyError when 'genomesFile' is needed but missing, and IndexError when a
     genome line carries no name.
     """
     currentHub['dbList'] = []
     if currentHub.get('useOneFile') == 'on':
         # A useOneFile hub keeps its genome info in hub.txt instead of genomes.txt
         genomeUrl = hub
     else:
         genomeFileLocation = currentHub['genomesFile'].strip()
         # Accept ftp as well as http, matching how descriptionUrl is resolved
         if genomeFileLocation.startswith(("http", "ftp")):
             genomeUrl = genomeFileLocation
         else:
             genomeUrl = "/".join(hub.split('/')[:-1])+"/"+genomeFileLocation
     for line in curl(genomeUrl).splitlines():
         separator = "\t" if "\t" in line else " "
         fields = line.split(separator)
         if fields[0].strip() == 'genome':
             # Repeated separators leave empty fields; drop them to reach the name
             values = [field for field in fields if field != ""]
             currentHub['dbList'].append(values[1].strip())
     currentHub['dbCount'] = len(currentHub['dbList'])
     return currentHub
 
 def printHgsql(hub,varToEdit,newVarValue,hgsqlInput,testMode):
     """Set one hubPublic column for hub via hgsql and print the equivalent shell command.
     varToEdit is the column name, newVarValue its new value (converted with str) and
     hgsqlInput the [host option, database] pair from evaluateServer. In testMode the
     command is only printed. Raises subprocess.CalledProcessError if hgsql fails.
     """
     query = "update hubPublic set "+varToEdit+" = '"+str(newVarValue)+"' where hubUrl = '"+hub+"'"
     cmd = "/cluster/bin/x86_64/hgsql "+hgsqlInput[0]+"-e \""+query+"\" "+hgsqlInput[1]
     if not testMode:
         # Hand the query to hgsql as a single argument with no shell in between:
         # newVarValue comes from remote hub.txt files, and under a shell a label
         # containing $(...) or backticks would be executed as a command.
         hgsqlArgs = ["/cluster/bin/x86_64/hgsql"]+hgsqlInput[0].split()+["-e",query,hgsqlInput[1]]
         subprocess.run(hgsqlArgs, check=True, stdout=subprocess.PIPE, universal_newlines=True)
     print(cmd)
 
 def compareResults(hubPublicDic,currentHub,hub,hgsqlInput,testMode):
     """
     Compare the hubPublic values to the queried currentHub values and report.
     hubPublicDic[hub] holds the table row for this hub and currentHub the values read
     from the live hub. For every field that differs, printHgsql is called with the
     live value; it prints the update command and, unless testMode is set, runs it.
     """
 
     # Labels are compared after stripping surrounding whitespace from the live value
     if hubPublicDic[hub]['shortLabel'] != currentHub['shortLabel'].rstrip().lstrip():
         printHgsql(hub,'shortLabel',escapeDoubleQuotesOnLabels(currentHub['shortLabel'].rstrip().lstrip()),hgsqlInput,testMode)
         
     if hubPublicDic[hub]['longLabel'] != currentHub['longLabel'].rstrip().lstrip():     
         printHgsql(hub,'longLabel',escapeDoubleQuotesOnLabels(currentHub['longLabel'].rstrip().lstrip()),hgsqlInput,testMode)
 
     if int(hubPublicDic[hub]['dbCount']) != int(currentHub['dbCount']):
         printHgsql(hub,'dbCount',currentHub['dbCount'],hgsqlInput,testMode)
         
     # [:-1] drops the last character of the stored list (presumably the trailing comma
     # that the update below also writes); comparing as sets ignores database order.
     if set(hubPublicDic[hub]['dbList'][:-1].split(',')) != set(currentHub['dbList']):
         printHgsql(hub,'dbList',",".join(currentHub['dbList'])+",",hgsqlInput,testMode)
 
     if hubPublicDic[hub]['descriptionUrl'] != currentHub['descriptionUrl'].rstrip().lstrip():
         printHgsql(hub,'descriptionUrl',currentHub['descriptionUrl'].rstrip().lstrip(),hgsqlInput,testMode)
 
     # Email is only synced when the live hub supplies a non-empty one, so a hub
     # without an email setting never clears the stored address.
     if 'email' in currentHub and currentHub['email'].strip():
         if hubPublicDic[hub]['email'] != currentHub['email'].strip():
-            printHgsql(hub,'email',currentHub['email'].strip(),hgsqlInput,testMode)
+            printHgsql(hub,'email',escapeDoubleQuotesOnLabels(currentHub['email'].strip()),hgsqlInput,testMode)
     
 def hubPublicCompare(hubPublicDic,hgsqlInput,testMode,silent):
     """Query hub.txt files and compare values to hubPublic values.
     Each hub URL in hubPublicDic is fetched, parsed and compared in turn. A hub that
     fails at any step is skipped, with a message printed unless silent is set, so one
     broken hub does not stop the remaining hubs from being checked.
     """
     for hub in hubPublicDic:
         try: #Try for timeout connections
             currentHub = buildCurrentHubTxtDic(hub)
             currentHub = queryHubTxt(currentHub,hub)
             compareResults(hubPublicDic,currentHub,hub,hgsqlInput,testMode)
         except Exception:
             # Exception rather than a bare except, so Ctrl-C and sys.exit still
             # stop the run instead of being reported as a broken hub.
             if not silent:
                 print("The following hub has an error or is not responsive: "+str(hub))
             
 def main():
     """Parse the options, load the hubPublic table of the chosen server and check every hub"""
     args = parseArgs()
     connection = evaluateServer(args.server)
     storedHubs = buildPubHubDic(connection)
     hubPublicCompare(storedHubs, connection, args.testMode, args.silent)
 
 # Only run when executed as a script, so importing the module has no side effects
 if __name__ == "__main__":
     main()