68c9515cb37ea293c7eeb106c8761ed1daea644b
gperez2
  Wed Jun 21 12:15:44 2023 -0700
Adding cronjob scripts to run hubCheck for Public Hubs, refs #29319

diff --git src/utils/qa/hubCheckDraftEmails.py src/utils/qa/hubCheckDraftEmails.py
new file mode 100755
index 0000000..5403863
--- /dev/null
+++ src/utils/qa/hubCheckDraftEmails.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+
+# Program Header
+# Name:   Gerardo Perez
+# Description: A program that parses the hubCheck output into email drafts for hub authors regarding
+#              missing description pages and couldn't open errors
+#
+# hubCheckDraftEmails.py
+#
+#
+# Version: Python 3.6.5 
+#
+import os
+import getpass
+import sys
+import re
+import json
+import io
+import requests
+import subprocess
+from datetime import datetime
+
+user = getpass.getuser()  # NOTE(review): name of the account running the script; `user` is not referenced anywhere else in this file
+
+def bash(cmd):
+    """Input bash cmd and return stdout as a list of lines; raises subprocess.CalledProcessError on a non-zero exit status"""
+    lines = subprocess.run(cmd, check=True, shell=True, stdout=subprocess.PIPE, universal_newlines=True).stdout.split('\n')
+    return lines[:-1] if lines[-1] == '' else lines  # drop only the empty piece after a final newline, never a real last line
+
+# Make sure this month's (Y-M) directories exist: one under the hubCheckCronArchive
+# tree and one under the htdocs-genecats test-results tree. A failure is printed
+# and the script carries on.
+for monthParent in ("/hive/users/qateam/hubCheckCronArchive/",
+                    "/usr/local/apache/htdocs-genecats/qa/test-results/hubCheckCron/"):
+    monthDir = monthParent+datetime.now().strftime("%Y-%m")
+    try:
+        os.makedirs(monthDir, exist_ok=True)
+    except OSError as e:
+        print(f"Error creating directory: {e}")
+
+# Lists for the hub.txt URL lines (each stored as a one-element list) of hubs whose hubCheck
+# output has the error of missing description pages / the error of couldn't open
+descPageMis=[]
+couldNotOpen=[]
+# Gets hubCheck output
+output_line=bash("cat /hive/users/qateam/hubCheckCronArchive/"+datetime.now().strftime("%Y-%m")+"/hubCheck_output")
+
+# lastSep is the line number of the latest '####' line seen so far. hubLine reproduces what
+# "head -<lineCount> | grep -A 1 '####' | tail -1" returned (the line after that '####' line, the
+# '####' line itself when it is the current line, nothing before the first one), with no shell call.
+lastSep=None
+for lineCount, line in enumerate(output_line, 1):
+    if '####' in line:
+        lastSep=lineCount
+    hubLine=[] if lastSep is None else [output_line[min(lastSep, lineCount-1)]]
+    if "Couldn't open" in line: #Gets each hub.txt that has the error of couldn't open
+        couldNotOpen.append(hubLine)
+    if "missing description page" in line: #Gets each hub.txt that has the error of missing description page
+        descPageMis.append(hubLine)
+
+def checkDuplicates(list):
+    """Returns the distinct items of the input in first-seen order (plain membership test, so unhashable items work)"""
+    distinct=[]
+    for entry in list:
+        if entry in distinct: continue
+        distinct.append(entry)
+    return distinct
+
+def stringTerm(term):
+    """Converts list input to string: str(term) without its first and last character (the brackets) and without single quotes"""
+    withoutEnds=str(term)[1:-1]
+    return withoutEnds.replace("\'", "")
+
+def getEmail(hubUrl):
+    """Gets the second field of the '^email' line(s) of the file fetched from hubUrl, or an N/A pointer to the contact page when none is found"""
+    import shlex  # local import; quoting keeps '&', '?', ';' or spaces in the URL from being read by the shell
+    email=stringTerm(bash("curl -Ls "+shlex.quote(hubUrl)+" | grep '^email' | awk '{print $2}'"))
+    if not email:
+        email="N/A <---------- Check: https://genecats.gi.ucsc.edu/qa/test-results/publicHubContactInfo/publicHubContact.html"
+    return email
+
+count=0
+# pattern for line that has number of problems
+pattern = r"Found (\d+) problem."
+with open("/hive/users/qateam/hubCheckCronArchive/"+datetime.now().strftime("%Y-%m")+"/filtered_output.txt", 'a') as f:
+    f.write("#############################################\n")
+    # The hubCheck output is read once; each hub's lines are sliced from it instead of re-running head/tail
+    hubCheckLines=bash("cat /hive/users/qateam/hubCheckCronArchive/"+datetime.now().strftime("%Y-%m")+"/hubCheck_output")
+    # For loop that goes through each line from the hubCheck output
+    for count, line in enumerate(hubCheckLines, 1):
+        # Checks if the line is a number of problems line (exactly one match of the pattern)
+        found=re.findall(pattern, line)
+        if len(found) != 1:
+            continue
+        problems=int(found[0])
+        # Writes the line before the count line, the count line and at most 5 lines of errors
+        f.writelines(l+'\n' for l in hubCheckLines[max(count-2, 0):count+min(problems, 5)])
+        if problems >= 6: # Above the limit, so mark that errors were left out
+            f.write('...\n')
+        f.write("#############################################\n")
+
+# Opening text of each drafted email; it ends right after "address:" with no newline and has no % conversion specifiers
+emailIntro= """Dear UCSC Public Hub author,
+
+I am writing on behalf of the UCSC Genome Browser. We wanted to alert you that your
+public track hub at the address:"""
+
+# Closing text of each drafted email: describes hubCheck, links the utilities downloads and gives the contact address
+endEmail="""
+hubCheck is a command-line utility that checks files in the hub are correctly formatted. If you
+would like to run the hubCheck utility on your own machine, you can download the tool from the
+utilities directory: 
+https://hgdownload.soe.ucsc.edu/downloads.html#utilities_downloads
+
+Please update your public track hub. If you have any questions, please let us know, and we will be
+happy to assist. Do not hesitate to let us know if we can help you resolve this situation, e.g. by
+updating the URL where the hub is hosted or possibly hosting the files on our servers.
+
+You can reach us at genome-www@soe.ucsc.edu.
+
+Thank you for your interest and contributions,
+The UCSC Genome Browser Group
+"""
+
+# Drafts one email for every distinct hub.txt that had a missing description page or a
+# couldn't open error, and appends the drafts to this month's draftedMessages.txt.
+archiveDir="/hive/users/qateam/hubCheckCronArchive/"+datetime.now().strftime("%Y-%m")
+# The hubCheck output is read once and searched in memory, so a hub URL is never placed on a
+# shell command line or interpreted as a grep pattern.
+hubCheckLines=bash("cat "+archiveDir+"/hubCheck_output")
+with open(archiveDir+"/draftedMessages.txt", 'a') as f:
+    f.write('##########################\n')
+    # Hubs with the error of missing description page come first, then hubs with couldn't open
+    for item in checkDuplicates(descPageMis)+checkDuplicates(couldNotOpen):
+        hubUrl=stringTerm(item)
+        if not hubUrl:
+            # No hub.txt line was recorded for this error, so there is no hub to look up or email
+            continue
+        f.write('Send email to: '+getEmail(hubUrl)+'\n')
+        # NOTE(review): emailIntro has no % conversion specifiers, so this looks like it writes
+        # the text unchanged - confirm whether the hub URL was meant to be inserted here
+        f.write(emailIntro % item)
+        # Copies the hub's part of the hubCheck output: from the first line containing the hub
+        # URL up to, but not including, the next line containing '#'
+        start=next((pos for pos, text in enumerate(hubCheckLines) if hubUrl in text), len(hubCheckLines))
+        for line in hubCheckLines[start:]:
+            if '#' in line:
+                break
+            f.write(line+'\n')
+        f.write("\nWhen running hubCheck "+hubUrl+'\n')
+        f.write(endEmail)
+        f.write('##########################\n')
+
+print("Check https://genecats.gi.ucsc.edu/qa/test-results/hubCheckCron/"+datetime.now().strftime("%Y-%m")+"/draftedMessages.txt to email hub authors about missing/broken public hub files")
+print("Archive of monthly raw data can be found here: /hive/users/qateam/hubCheckCronArchive/"+datetime.now().strftime("%Y-%m"))