93f7f9e49078d2ad71f0b73f4d56b211d36a57b5
gperez2
  Mon Dec 8 09:17:23 2025 -0800
Adding python script that reports differences in static HTML docs across hgwdev/hgwbeta/hgw0, refs #35294

diff --git src/utils/qa/checkStaticDocsDiff.py src/utils/qa/checkStaticDocsDiff.py
new file mode 100755
index 00000000000..3500d75e1fe
--- /dev/null
+++ src/utils/qa/checkStaticDocsDiff.py
@@ -0,0 +1,197 @@
+#!/usr/bin/env python3
+
+# Program Header
+# Name:   Gerardo Perez
+# Description: A program that reports differences in static HTML docs
+#              across hgwdev, hgwbeta, and hgw0.
+#
+#
+# checkStaticDocsDiff.py
+#
+#
+
+import subprocess
+import argparse
+
+# Run a bash command locally
+def bash(cmd):
+    """Executes a shell command locally and returns output as a list of lines."""
+    rawOutput = subprocess.run(cmd, check=True, shell=True,
+                               stdout=subprocess.PIPE, universal_newlines=True)
+    return rawOutput.stdout.split('\n')[0:-1]
+
+# Run a command on a remote host via SSH
+def get_checksums_via_ssh(host, command):
+    """Executes a checksum command on a remote host via SSH."""
+    result = subprocess.run(
+        ["ssh", "qateam@" + host, command],
+        capture_output=True, text=True
+    )
+    return result.stdout
+
+# Parse md5sum output into a dict. Example line: "d41d8cd98f00b204e9800998ecf8427e  /path/to/file.html"
+# Produces: { "/path/to/file.html": "d41d8cd..." }
+def parse_checksums(output):
+    """Parses md5sum output into a dictionary mapping filename -> checksum."""
+    checksums = {}
+    for line in output.splitlines():
+        parts = line.split(None, 1)
+        if len(parts) == 2:
+            md5sum, file = parts
+            checksums[file] = md5sum
+    return checksums
+
+# Format output as a URL (optional - controlled by --links)
+# hgwdev uses ".gi.ucsc.edu". Others use ".soe.ucsc.edu"
+def format_file_link(host, file, use_links):
+    """Formats the file as a clickable URL when --links is enabled."""
+    if use_links:
+        if host == "hgwdev":
+            domain = "hgwdev.gi.ucsc.edu"
+        else:
+            domain = host + ".soe.ucsc.edu"
+        return "https://" + domain + file.replace("/usr/local/apache/htdocs", "")
+    return file
+
+# Ignore unwanted files and directories
+def should_ignore(file, ignore_list):
+    """Returns True if the file path should be ignored."""
+    return any(term in file for term in ignore_list)
+
+
+# MAIN PROGRAM
+def main():
+
+    # Argument Parsing
+    parser = argparse.ArgumentParser(
+        description="Compare static HTML docs across hgwdev, hgwbeta, and hgw0."
+    )
+    parser.add_argument(
+        "-l", "--links", action="store_true",
+        help="Format output as clickable https:// URLs"
+    )
+    args = parser.parse_args()
+    
+    
+    
+    # Files/directories to ignore during comparison
+    ignore_files = [
+        "/index.html",
+        "admin/",
+        "thumbNailLinks.html",
+        "gbPageStartHardcoded.html",
+        "ENCODE/",
+        "/docs/",
+        "allTipsRaw.html",
+        "/mirrorDocs/staticPage.html",
+        "googleAnalytics.html",
+        "google09a546cf57abf548.html",
+        "googlef42dd384148a3435.html",
+        "/tipOfDay.html",
+    ]
+    
+    # Commands to list HTML files and calculate md5sum
+    # grep -rl + xargs md5sum is used because find does not work on hgwbeta
+    command = (
+        "grep -rl --include='*.html' '' /usr/local/apache/htdocs 2>/dev/null | "
+        "xargs -d '\n' md5sum | sort -k 2"
+    )
+
+    # Fetch checksums hgwdev, hgwbeta, and hgw0
+    checksums_hgwdev  = parse_checksums(get_checksums_via_ssh('hgwdev', command))
+    checksums_hgwbeta = parse_checksums(get_checksums_via_ssh('hgwbeta', command))
+    checksums_hgw0    = parse_checksums(get_checksums_via_ssh('hgw0', command))
+    
+    # 1. Missing on hgwdev but present on hgwbeta
+    missing_from_hgwdev = [
+        file for file in checksums_hgwbeta.keys() - checksums_hgwdev.keys()
+        if not should_ignore(file, ignore_files)
+    ]
+
+    if missing_from_hgwdev:
+        print("Files present on hgwbeta but missing from hgwdev:")
+        for file in missing_from_hgwdev:
+            print(format_file_link("hgwbeta", file, args.links))
+        print()
+
+    # 2. Files on both hgwdev & hgwbeta but with different md5sum
+    diff_hgwbeta_hgwdev = [
+        file for file in (checksums_hgwdev.keys() & checksums_hgwbeta.keys())
+        if checksums_hgwdev[file] != checksums_hgwbeta[file]
+        and not should_ignore(file, ignore_files)
+    ]
+
+    if diff_hgwbeta_hgwdev:
+        print("Files present on both hgwbeta and hgwdev but different md5sum:")
+        for file in diff_hgwbeta_hgwdev:
+            print(format_file_link("hgwbeta", file, args.links), checksums_hgwbeta[file])
+            print(format_file_link("hgwdev",  file, args.links), checksums_hgwdev[file], "\n")
+        print()
+
+    # 3. Present only on hgwbeta (missing on hgw0)
+    missing_from_hgw0 = [
+        file for file in checksums_hgwbeta.keys() - checksums_hgw0.keys()
+        if not should_ignore(file, ignore_files)
+    ]
+
+    if missing_from_hgw0:
+        print("Files present only on hgwbeta and missing on hgw0:")
+        for file in missing_from_hgw0:
+            print(format_file_link("hgwbeta", file, args.links))
+        print()
+
+    # 4. Present only on hgw0 (missing on hgwbeta)
+    missing_from_hgwbeta = [
+        file for file in checksums_hgw0.keys() - checksums_hgwbeta.keys()
+        if not should_ignore(file, ignore_files)
+    ]
+
+    if missing_from_hgwbeta:
+        print("Files present only on hgw0 and missing on hgwbeta:")
+        for file in missing_from_hgwbeta:
+            print(format_file_link("hgw0", file, args.links))
+        print()
+
+    # 5. Different md5sum between hgwbeta and hgw0
+    diff_hgw0_hgwbeta = [
+        file for file in (checksums_hgw0.keys() & checksums_hgwbeta.keys())
+        if checksums_hgw0[file] != checksums_hgwbeta[file]
+        and not should_ignore(file, ignore_files)
+    ]
+
+    if diff_hgw0_hgwbeta:
+        print("Files present on both hgw0 and hgwbeta but different md5sum:")
+        for file in diff_hgw0_hgwbeta:
+            print(format_file_link("hgwbeta", file, args.links), checksums_hgwbeta[file])
+            print(format_file_link("hgw0",    file, args.links), checksums_hgw0[file], "\n")
+        print()
+
+# Run the program
+if __name__ == "__main__":
+    main()
+
+# Program Output (Commented out)
+#Files present on hgwbeta but missing from hgwdev:
+#/usr/local/apache/htdocs/admin/maintenance.html
+#/usr/local/apache/htdocs/ancestors/index.html
+#
+#Files present on both hgwbeta and hgwdev but different md5sum:
+#/usr/local/apache/htdocs/docs/index.html 9c1340981564d95cabb906eb15415476
+#/usr/local/apache/htdocs/docs/index.html 234def442a25eead78ccf7caf09cce4c
+#
+#/usr/local/apache/htdocs/docs/tableBrowserTutorial.html c6f36e979edc4912c1d4a80f0aa6eba4
+#/usr/local/apache/htdocs/docs/tableBrowserTutorial.html 5c08659421aea9dc1408b0ac7cc266f3
+#
+#Files present only on hgwbeta and missing on hgw0:
+#/usr/local/apache/htdocs/admin/maintenance.html
+#/usr/local/apache/htdocs/ENCODE/controlledVocabulary.html
+#
+#Files present only on hgw0 and missing on hgwbeta:
+#/usr/local/apache/htdocs/admin/stats/Report.html
+#
+#Files present on both hgw0 and hgwbeta but different md5sum:
+#/usr/local/apache/htdocs/admin/jk-install.html 9a561100775c6c730342e2d94a02fa08
+#/usr/local/apache/htdocs/admin/jk-install.html c97be8e421ce0ad9159e87cda40345d4
+#
+#/usr/local/apache/htdocs/admin/404.html 95c7561df8631b27f9fa1e76d75e7270
+#/usr/local/apache/htdocs/admin/404.html 3a1049daecd22bff590ebf0f2e6c2602