af3a143571e5aa064eab75c34f9444b35413b562
chmalee
  Tue Nov 30 15:28:15 2021 -0800
Add snippet support to trix searching. Required changing the
wordPos from the first highest matching wordIndex to the
wordIndex of the actual span. Have trixContextIndex create a
second level index for fast retrieval of line offsets in
original text file used by ixIxx. Create a simple UI for navigating
hgFind search results.

diff --git src/index/trixContextIndex/trixContextIndex.c src/index/trixContextIndex/trixContextIndex.c
index a3037ff..1f57cb6 100644
--- src/index/trixContextIndex/trixContextIndex.c
+++ src/index/trixContextIndex/trixContextIndex.c
@@ -1,53 +1,158 @@
 /* trixContextIndex - Index in.txt file used with ixIxx to produce a two column file with symbol name 
  * and file offset for that line.. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
+#include "obscure.h"
+
+static int prefixSize = 15;  /* Number of leading chars of each word that setPrefix keeps; main() sets it from -prefixSize. */
+int binSize = 64*1024;  /* Byte distance from the last written position that makeIxx requires before writing another entry. */
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
-  "trixContextIndex - Index in.txt file used with ixIxx to produce a two column file with symbol\n"
-  "name and file offset for that line.\n"
+  "trixContextIndex - Index in.txt file used with ixIxx to produce two files:\n"
+  "    - a two column file with symbol name and file offset for that line\n"
+  "    - an index into the file with the offsets for efficient retrieval\n"
+  "Note that outBaseName.offsets and outBaseName.offsets.ixx will be created\n"
   "usage:\n"
-  "   trixContextIndex in.txt out.tab\n"
+  "   trixContextIndex in.txt outBaseName\n"
   "options:\n"
-  "   -xxx=XXX\n"
+  "    -prefixSize=N length of prefix to use for second level index\n"  /* main() hands the compiled-in prefixSize to optionInt() as its second argument (presumably the default) */
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
+   {"prefixSize", OPTION_INT},   /* integer valued; main() fetches it with optionInt() */
    {NULL, 0},
 };
 
+void writeIxxEntry(FILE *f, char *prefix, unsigned long long pos)
+/* Write one index line to f: prefix, then pos as upper-case hex zero padded to 10+ digits. */
+{
+fprintf(f, "%s%010llX\n", prefix, pos);
+}
+
+void setPrefix(char *word, char *prefix)
+/* Copy first prefixSize chars of word to prefix, end padding with spaces if need be.
+ * Always zero terminates, so prefix needs room for prefixSize+1 bytes. */
+{
+int len = strlen(word);
+if (len > prefixSize)
+    len = prefixSize;
+/* Pad first, then overlay the word, so short words keep their trailing spaces. */
+memset(prefix, ' ', prefixSize);
+memcpy(prefix, word, len);
+prefix[prefixSize] = 0;
+}
+
+void makeIxx(char *inIx)
+/* Read inIx.offsets and create inIx.offsets.ixx, where each line is a space padded
+ * word prefix followed by the hex position of the first line with that prefix. */
+{
+char infile[PATH_LEN];
+safef(infile, sizeof(infile), "%s.offsets", inIx);
+struct lineFile *lf = lineFileOpen(infile, TRUE);
+char outIxx[PATH_LEN];
+safef(outIxx, sizeof(outIxx), "%s.offsets.ixx", inIx);
+FILE *f = mustOpen(outIxx, "w");
+char *curPrefix = needMem(prefixSize+1);
+char *lastPrefix = needMem(prefixSize+1);
+char *writtenPrefix = needMem(prefixSize+1);
+off_t startPrefixPos = 0, writtenPos = 0, curPos;
+char *line, *word;
+
+/* Read first line and index it. */
+if (!lineFileNextReal(lf, &line))
+    errAbort("%s is empty", infile);
+word = nextWord(&line);
+setPrefix(word, writtenPrefix);
+strcpy(lastPrefix, writtenPrefix);
+writtenPos = lineFileTell(lf);
+writeIxxEntry(f, writtenPrefix, writtenPos);
+
+/* Loop around adding to index as need be */
+while (lineFileNextReal(lf, &line))
+    {
+    curPos = lineFileTell(lf);
+    word = nextWord(&line);
+    setPrefix(word, curPrefix);
+    /* Track where the current run of identical prefixes began. */
+    if (!sameString(curPrefix, lastPrefix))
+        startPrefixPos = curPos;
+    /* off_t rather than int so that distances past 2 Gb do not overflow. */
+    off_t diff = curPos - writtenPos;
+    /* Only write once binSize bytes past the last entry, and only for a new prefix. */
+    if (diff >= binSize && !sameString(curPrefix, writtenPrefix))
+        {
+        writeIxxEntry(f, curPrefix, startPrefixPos);
+        writtenPos = curPos;
+        strcpy(writtenPrefix, curPrefix);
+        }
+    strcpy(lastPrefix, curPrefix);
+    }
+carefulClose(&f);
+lineFileClose(&lf);
+freeMem(curPrefix);
+freeMem(lastPrefix);
+freeMem(writtenPrefix);
+}
+
+struct wordPos  /* Value stored in trixContextIndex's hash for each input line. */
+{
+    char *word;     /* First word of the line. */
+    long long pos;  /* lineFileTell() value taken right after the line was read. */
+};
+
+void makeIx(FILE *f, struct hash *itemIdHash)
+/* Write one "word<tab>pos" line to f for every element of itemIdHash, in the
+ * order slSort gives with hashElCmp.  Values must be struct wordPos pointers. */
+{
+struct hashEl *hel, *helList = hashElListHash(itemIdHash);
+slSort(&helList, hashElCmp);
+for (hel = helList; hel != NULL; hel = hel->next)
+    fprintf(f, "%s\t%lld\n", hel->name, ((struct wordPos *)hel->val)->pos);
+/* hashElListHash hands back a list the caller owns (see hash.h), so release it. */
+hashElFreeList(&helList);
+}
+
 void trixContextIndex(char *input, char *output)
 /* trixContextIndex - Index in.txt file used with ixIxx to produce a two column file with symbol name 
  * and file offset for that line.. */
 {
+char outputFile[PATH_LEN];
+safef(outputFile, sizeof(outputFile), "%s.offsets", output);
 struct lineFile *lf = lineFileOpen(input, TRUE);
-FILE *f = mustOpen(output, "w");
+FILE *f = mustOpen(outputFile, "w");
 char *line;
+struct hash *itemIdHash = hashNew(20);
 while (lineFileNextReal(lf, &line))
     {
     long long pos = lineFileTell(lf);
     char *word = nextWord(&line);
     if (word == NULL)
         errAbort("Short line %d of %s", lf->lineIx, input);
-     fprintf(f, "%s\t%lld\n", word, pos);
+    struct wordPos *wordPos;
+    AllocVar(wordPos);
+    wordPos->pos = pos;
+    wordPos->word = hashAdd(itemIdHash, word, wordPos)->name;  /* hash's own copy; word itself points into lf's line buffer */
     }
-
+lineFileClose(&lf);
+makeIx(f, itemIdHash);
 carefulClose(&f);
+makeIxx(output);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
+prefixSize = optionInt("prefixSize", prefixSize);  /* NOTE(review): not range checked; setPrefix uses it as a memset/memcpy size, so it must be positive */
 if (argc != 3)
     usage();
 trixContextIndex(argv[1], argv[2]);
 return 0;
 }