af3a143571e5aa064eab75c34f9444b35413b562 chmalee Tue Nov 30 15:28:15 2021 -0800 Add snippet support to trix searching. Required changing the wordPos from the first highest matching wordIndex to the wordIndex of the actual span. Have trixContextIndex create a second level index for fast retrieval of line offsets in original text file used by ixIxx. Create a simple UI for navigating hgFind search results. diff --git src/hg/htdocs/addHtml src/hg/htdocs/addHtml new file mode 100755 index 0000000..2ed14ed --- /dev/null +++ src/hg/htdocs/addHtml @@ -0,0 +1,92 @@ +#!/cluster/software/bin/python3 + +""" +This program crawls through a list of HTML files +and adds them to the html data source table for +sphinx. + +Usage: +./addHtml inFileName outTabFile + +inFileName is newline separated path strings like: +/path/to/file + +outTabFile has the format: +id<tab>title<tab>destinationUrl<tab>content + +inFileName can be 'stdin' in which case lines will be read from standard input. +""" + +import sys,argparse,gzip,os,re +# makes it easier to read whole files as strings +from pathlib import Path + +def parseCommandLine(): + parser = argparse.ArgumentParser(description="Crawl a list of HTML files and generate " + + "the appropriate tab file that can be loaded into the sphinx indexed search table.", + epilog="inFileName can be stdin to read from stdin. Only outTabFile can be stdout") + parser.add_argument("inFileName", help="file with a list of files to index.") + parser.add_argument("outTabFile", help="Tab separated output for loading into sphinx table.") + args = parser.parse_args() + return args + +headers = ["id", "title", "destination", "content"] + +titleRegex = re.compile("