src/hg/htdocs/addHtml af3a143571e5aa064eab75c34f9444b35413b562

af3a143571e5aa064eab75c34f9444b35413b562
chmalee
  Tue Nov 30 15:28:15 2021 -0800
Add snippet support to trix searching. Required changing the
wordPos from the first highest matching wordIndex to the
wordIndex of the actual span. Have trixContextIndex create a
second level index for fast retrieval of line offsets in
original text file used by ixIxx. Create a simple UI for navigating
hgFind search results.

diff --git src/hg/htdocs/addHtml src/hg/htdocs/addHtml
new file mode 100755
index 0000000..2ed14ed
--- /dev/null
+++ src/hg/htdocs/addHtml
@@ -0,0 +1,92 @@
+#!/cluster/software/bin/python3
+
+"""
+This program crawls through a list of HTML files
+and adds them to the html data source table for
+sphinx.
+
+Usage:
+./addHtml inFileName outTabFile
+
+inFileName contains newline-separated path strings like:
+/path/to/file
+
+outTabFile has the format:
+id<tab>title<tab>destinationUrl<tab>content
+
+inFileName can be 'stdin' to read the list from standard input; outTabFile can be 'stdout', and is gzip-compressed when it ends in '.gz'.
+"""
+
+import sys,argparse,gzip,os,re
+# for reading whole files as strings easier
+from pathlib import Path
+
+def parseCommandLine():
+    """Parse sys.argv into a namespace with inFileName and outTabFile attributes."""
+    argParser = argparse.ArgumentParser(description="Crawl a list of HTML files and generate "
+            "the appropriate tab file that can be loaded into the sphinx indexed search table.",
+        epilog="inFileName can be stdin to read from stdin. Only outTabFile can be stdout")
+    argParser.add_argument("inFileName", help="file with a list of files to index.")
+    argParser.add_argument("outTabFile", help="Tab separated output for loading into sphinx table.")
+    return argParser.parse_args()
+
+# Column names; written as the first line of the output file.
+headers = ["id", "title", "destination", "content"]
+titleRegex = re.compile('<!--#set var="TITLE" value="(.*)"')  # group 1: the TITLE value
+
+def addHtml(infh, outfh, binaryMode=False):
+    """Write one tab-separated row per indexable HTML file listed in infh.
+
+    infh yields one file path per line; outfh receives a header line followed
+    by id, title, destination (the path) and escaped content for each file.
+    binaryMode: outfh wants bytes (e.g. gzip "wb"), so rows are UTF-8 encoded.
+    Both handles are closed before returning.
+    """
+    def emit(text):
+        # single place where the text/bytes difference of outfh is handled
+        outfh.write(text.encode() if binaryMode else text)
+
+    emit("\t".join(headers) + "\n")
+    idx = 0
+    for line in infh:
+        path = line.strip()
+        if not path:
+            continue  # blank line in the list, nothing to read
+        try:
+            # the read is what raises UnicodeDecodeError, so it must be inside the try
+            content = Path(path).read_text(encoding="utf-8")
+        except UnicodeDecodeError:
+            continue  # not valid UTF-8, leave it out of the index
+        match = titleRegex.search(content)
+        # NOTE(review): pages without a TITLE directive are skipped -- confirm intended
+        if match:
+            # unicode_escape turns tabs/newlines/non-ASCII into backslash escapes,
+            # keeping each row on one line; same column order as headers in both modes
+            escaped = content.encode("unicode_escape").decode("ascii")
+            emit("%d\t%s\t%s\t%s\n" % (idx, match.group(1), path, escaped))
+            idx += 1
+    infh.close()
+    outfh.close()
+
+def main():
+    """Open the input list and the output target named on the command line, then run addHtml."""
+    args = parseCommandLine()
+    listName, tabName = args.inFileName, args.outTabFile
+    gzipped = tabName.endswith(".gz")
+
+    # "stdin"/"stdout" (or their /dev paths) select the standard streams
+    if listName in ("stdin", "/dev/stdin"):
+        infh = sys.stdin
+    else:
+        infh = open(listName, "r")
+
+    if tabName in ("stdout", "/dev/stdout"):
+        outfh = sys.stdout
+    elif gzipped:
+        outfh = gzip.open(tabName, "wb")  # binary handle, so addHtml runs in binaryMode
+    else:
+        outfh = open(tabName, "w")
+    addHtml(infh, outfh, gzipped)
+
+if __name__ == "__main__":
+    main()